In [3]:
import pandas as pd
import spacy
import numpy as np
import os
from glob import glob
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import functools
from pprint import pprint
from sklearn.preprocessing import normalize

# Helper

## `extract_context_words`

In [4]:
def extract_context_words(text, nlp, context_pos=('NOUN', 'VERB', 'ADJ')):
    """Pull the content words (by part of speech) out of a piece of text.

    Args:
        text (str): Text to extract context words from.
        nlp (callable): spaCy pipeline; called as ``nlp(text)`` and expected
            to yield tokens exposing ``pos_``, ``is_stop`` and ``lemma_``.
        context_pos (collection of str): Part-of-speech tags to keep.
    Returns:
        list: Lower-cased lemmas of the tokens whose tag is in `context_pos`
            and that are not stop words, in document order.
    """
    context_words = []
    for token in nlp(text):
        # Skip tokens with an unwanted part of speech first, then stop words.
        if token.pos_ not in context_pos:
            continue
        if token.is_stop:
            continue
        context_words.append(token.lemma_.lower())
    return context_words

In [5]:
# Sanity check: run the extractor on a two-sentence sample.
text = 'Tokenization big standards are based on the OntoNotes 5 corpus. The tokenizer differs from most by including tokens for significant whitespace.'
# NOTE(review): the 'en' shortcut name presumably targets an older spaCy release — confirm before upgrading.
nlp = spacy.load('en')
extract_context_words(text, nlp)

['tokenization',
 'big',
 'standard',
 'base',
 'corpus',
 'tokenizer',
 'differ',
 'include',
 'token',
 'significant',
 'whitespace']

## `load_word2vec`, `load_nnse`

In [6]:
def load_word2vec():
    """Read the word2vec embedding matrix from its parquet file.

    Returns:
        pandas.DataFrame: Embedding matrix with words as the index and one
            column per dimension.
    """
    return pd.read_parquet('./data/embeddings/word2vec_300.parquet')

def load_nnse():
    """Read the NNSE embedding matrix from its parquet file.

    Returns:
        pandas.DataFrame: Embedding matrix with words as the index and one
            column per dimension.
    """
    return pd.read_parquet('./data/embeddings/nnse_2500.parquet')

In [7]:
# Load both embedding matrices; later cells refer to them by these names.
emb_word2vec = load_word2vec()
emb_nnse = load_nnse()
# Preview the first rows of the word2vec matrix.
emb_word2vec.head()

Unnamed: 0_level_0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d291,d292,d293,d294,d295,d296,d297,d298,d299,d300
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
in,0.070312,0.086914,0.087891,0.0625,0.069336,-0.108887,-0.081543,-0.154297,0.020752,0.131836,...,-0.168945,-0.088867,-0.080566,0.064941,0.061279,-0.047363,-0.058838,-0.047607,0.014465,-0.0625
for,-0.01178,-0.047363,0.044678,0.063477,-0.018188,-0.063965,-0.001312,-0.072266,0.064453,0.086426,...,-0.022583,0.003723,-0.08252,0.081543,0.007935,0.000477,0.018433,0.071289,-0.034912,0.02417
that,-0.015747,-0.02832,0.083496,0.050293,-0.110352,0.031738,-0.014221,-0.089844,0.117676,0.118164,...,-0.011292,-0.015625,-0.033447,-0.02063,-0.019409,0.063965,0.020142,0.006866,0.061035,-0.148438
is,0.00705,-0.073242,0.171875,0.022583,-0.132812,0.198242,0.112793,-0.10791,0.071777,0.020874,...,-0.233398,-0.036377,-0.09375,0.182617,0.0271,0.12793,-0.02478,0.01123,0.164062,0.106934
on,0.026733,-0.09082,0.027832,0.204102,0.006226,-0.090332,0.022583,-0.161133,0.132812,0.061035,...,0.026855,-0.027954,0.030884,0.040527,-0.130859,0.083008,0.015747,-0.116699,-0.029419,-0.070801


In [8]:
# The index of the embedding matrix holds the words themselves.
emb_word2vec.head().index.tolist()

['in', 'for', 'that', 'is', 'on']

## `get_embeddings_for_words`

In [9]:
def get_embeddings_for_words(words, embeddings):
    """Look up the embedding rows for the given words.

    Args:
        words (list): Words to look up.
        embeddings (pandas.DataFrame): Embedding matrix, as returned by
            `load_word2vec` / `load_nnse` (words on the index).
    Returns:
        pandas.DataFrame: An independent copy of the rows for the words that
            exist in the matrix. Words missing from the matrix are silently
            left out. Row order is whatever `Index.intersection` yields, which
            is not necessarily the order of `words`.
    """
    known_words = embeddings.index.intersection(words)
    selected_rows = embeddings.loc[known_words]
    # Copy so callers can modify the result without touching the matrix.
    return selected_rows.copy()

In [10]:
# Look up a few words in the NNSE matrix; unknown words are simply dropped.
words = ['standards', 'differs', 'including', 'abcc'] # 'abcc' is not in the matrix
word_embeddings = get_embeddings_for_words(words, emb_nnse)
word_embeddings

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d2491,d2492,d2493,d2494,d2495,d2496,d2497,d2498,d2499,d2500
including,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
standards,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
differs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Single-word lookup: the NNSE row for 'apple'.
apple = get_embeddings_for_words(['apple'], emb_nnse)
apple

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d2491,d2492,d2493,d2494,d2495,d2496,d2497,d2498,d2499,d2500
apple,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# NOTE(review): `apple` is rebound here to the rows for three other words
# ('computer', 'tech', 'company'); it no longer holds the 'apple' row.
apple = get_embeddings_for_words(['computer', 'tech', 'company'], emb_nnse)
apple

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d2491,d2492,d2493,d2494,d2495,d2496,d2497,d2498,d2499,d2500
computer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tech,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
company,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## `_explain_dim`, `explain_dims`, `get_sig_dims`

In [11]:
# Cache of top-word lists. Keyed by (id(embeddings), column, k) so that two
# different embedding matrices (both have columns named 'd1', 'd2', ...) and
# different values of k never share an entry.
# NOTE: id() is only unique while the matrix object is alive; clear this dict
# if embedding matrices are frequently discarded and reloaded.
desc_cache = {}


def _explain_dim(index, emb, embeddings, k=5):
    """Describe one embedding dimension by its highest-scoring words.

    Args:
        index (int): Zero-based dimension index; the column is 'd{index+1}'.
        emb (sequence): Embedding vector being explained; `emb[index]` is
            reported as the dimension's value.
        embeddings (pandas.DataFrame): Embedding matrix (words on the index,
            'd1', 'd2', ... columns).
        k (int): Number of top words used to describe the dimension.
    Returns:
        tuple: (column name, comma-separated top-k words, emb[index]).
    """
    col = 'd{}'.format(index + 1)
    cache_key = (id(embeddings), col, k)
    if cache_key not in desc_cache:
        # Sort only the column of interest instead of reordering every
        # column of the whole matrix.
        ranked_words = embeddings[col].sort_values(ascending=False).index
        desc_cache[cache_key] = ranked_words[:k].tolist()
    desc = desc_cache[cache_key]
    return (col, ', '.join(desc), emb[index])


def explain_dims(indices, emb, embeddings, k=5):
    """Describe several dimensions of an embedding, strongest first.

    Args:
        indices (iterable of int): Zero-based dimension indices to explain.
        emb (sequence): Embedding vector being explained.
        embeddings (pandas.DataFrame): Embedding matrix used to find the top
            words of each dimension.
        k (int): Number of top words listed per dimension.
    Returns:
        list: Tuples of (column name, top-k words, value formatted with five
            decimals), sorted by value in descending order.
    """
    described = [_explain_dim(i, emb, embeddings, k) for i in indices]
    described.sort(key=lambda item: item[2], reverse=True)
    return [(col, words, '{:.5f}'.format(value)) for col, words, value in described]

def get_sig_dims(emb, thres=0.01):
    """Return the indices of the dimensions whose value exceeds `thres`.

    Args:
        emb (array-like): Embedding to inspect. Must be one-dimensional.
        thres (int or float): Threshold; a dimension is selected only when
            its value is strictly greater than this.
    Returns:
        numpy.ndarray: Integer indices of the selected dimensions
            (e.g. array([3, 17, 42])).
    Raises:
        ValueError: If `emb` is not 1-D or `thres` is not a real number.
    """
    emb = np.asarray(emb)
    if emb.ndim != 1:
        raise ValueError('`emb` argument should be 1D array')
    # Accept any real number (Python or numpy), but not booleans, which are
    # technically ints and almost certainly a caller mistake.
    is_real = isinstance(thres, (int, float, np.integer, np.floating))
    if isinstance(thres, (bool, np.bool_)) or not is_real:
        raise ValueError('`thres` argument should be a real number')

    return np.flatnonzero(emb > thres)

In [12]:
# Example: flatten the NNSE lookup for 'ebay' into a 1-D vector, select the
# dimensions above the default threshold, and list the top words of each.
emb = get_embeddings_for_words(['ebay'], emb_nnse).values.flatten()
dims = get_sig_dims(emb)
explain_dims(dims, emb, emb_nnse)

[('d1694', 'seller, specifics, ebay, item, bid', '0.66279'),
 ('d1578', 'bids, valid, matching, enlarge, ebay', '0.49128'),
 ('d1257', 'antiques, auction, auctions, collectibles, antique', '0.13379'),
 ('d639', 'helsinki, dashes, weber, fraser, booksellers', '0.08437'),
 ('d579', 'quot, amp, lt, wal, gt', '0.06565'),
 ('d40', 'retail, kroger, kmart, safeway, wholesale', '0.03290'),
 ('d2398', 'thrift, duty-free, penney, second-hand, souvenir', '0.02616'),
 ('d1209', 'selling, buying, purchasing, sell, sells', '0.01678'),
 ('d1233', 'nike, dunk, adidas, sneaker, converse', '0.01630'),
 ('d2329', 'vous, sur, les, par, le', '0.01574'),
 ('d865', 'rated, rating, ratings, visitor, sort', '0.01328'),
 ('d2110', 'watchlist, drafts, artists, labels, submissions', '0.01228')]

## `compose_embeddings_sum` (Baseline)

In [13]:
def compose_embeddings_sum(target_embs, context_embs):
    """[Baseline] Compose word embeddings by plain element-wise addition.

    Args:
        target_embs (pandas.DataFrame): Embeddings to compose, as returned by
            `get_embeddings_for_words`.
        context_embs (pandas.DataFrame): Embeddings to compose, as returned by
            `get_embeddings_for_words`.
    Returns:
        numpy.ndarray: Sum of all rows of both frames, with shape
            (1, #_of_dimensions).
    """
    # `pd.concat` builds a new frame, so the inputs are never modified.
    stacked = pd.concat([target_embs, context_embs])
    column_totals = stacked.sum(axis=0)
    return column_totals.values.copy().reshape(1, -1)

In [14]:
# The composed result has shape (1, #_of_dimensions).
compose_embeddings_sum(word_embeddings.iloc[1:], word_embeddings).shape

(1, 2500)

## `compose_embeddings_reactive` (Proposed)

In [49]:
def softmax(inputs):
    """Compute the softmax of a 1-D collection of scores.

    Args:
        inputs (array-like): Scores; treated as a single distribution.
    Returns:
        numpy.ndarray: Non-negative weights that sum to 1. An empty input
            yields an empty array.
    """
    scores = np.asarray(inputs, dtype=float)
    if scores.size == 0:
        return scores
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps exp() from overflowing to inf (and the ratio from becoming NaN).
    exps = np.exp(scores - np.max(scores))
    return exps / np.sum(exps)

# When True, `compose_embeddings_reactive` prints the composed words and an
# explanation of the non-zero dimensions of the result.
DEBUG = False


def compose_embeddings_reactive(target_embs, context_embs):
    """[Proposed] Compose target and context embeddings multiplicatively.

    1. Sum the rows of `target_embs` and of `context_embs` into one vector each.
    2. Multiply the two vectors element-wise, so only dimensions active in
       both the target and the context survive.
    3. Normalize the product with `sklearn.preprocessing.normalize`.

    Args:
        target_embs (pandas.DataFrame): Embeddings to compose, as returned by
            `get_embeddings_for_words`.
        context_embs (pandas.DataFrame): Embeddings to compose, as returned by
            `get_embeddings_for_words`.
    Returns:
        numpy.ndarray: Contextualized embedding with shape
            (1, #_of_dimensions).
    Raises:
        ValueError: If either argument is not a DataFrame.
    """
    # Both arguments must be DataFrames; rejecting only when *both* are wrong
    # would let a single bad argument fail later with an obscure error.
    if not (isinstance(target_embs, pd.DataFrame) and isinstance(context_embs, pd.DataFrame)):
        raise ValueError('target_embs and context_embs must be DataFrame')

    # Collapse each frame into a 1-D vector. `sum()` returns new objects, so
    # the caller's frames are never modified.
    target_vec = target_embs.sum().values
    context_vec = context_embs.sum().values

    # target * context
    reacted = np.multiply(target_vec, context_vec)

    result = normalize(reacted.reshape(1, -1))

    # Debug output
    if DEBUG:
        print('[Words]', ', '.join(target_embs.index.tolist() + context_embs.index.tolist()))
        explain = explain_dims(result.nonzero()[1], result.flatten(), emb_nnse)
        pprint(explain)

    return result

In [30]:
# Target word for the composition examples that follow.
target = get_embeddings_for_words(['apple'], emb_nnse)

In [31]:
# Compose the target with three context words: 'electronics', 'latest', 'fast'.
context = get_embeddings_for_words(['electronics', 'latest', 'fast'], emb_nnse)

# The dimension explanation is printed by the function itself only when DEBUG is True.
res = compose_embeddings_reactive(target, context)
# dims = res.nonzero()[1]
# explain_dims(dims, emb_nnse)

[Words] apple, fast, latest, electronics
[('d1724', 'aac, ogg, rm, ripper, converter', '0.96822'),
 ('d2104', 'alcatel, lg, motorola, samsung, sony', '0.24150'),
 ('d872', 'bea, microsoft, enterprise, ria, oracle', '0.06416'),
 ('d1778', 'cool, crazy, gadget, animation, sexy', '0.00771'),
 ('d1478', 'amoeba, rampage, shootout, buster, swat', '0.00678')]


In [32]:
# Compose the same target with three other context words: 'eat', 'ate', 'delicious'.
context = get_embeddings_for_words(['eat', 'ate', 'delicious'], emb_nnse)

# The dimension explanation is printed by the function itself only when DEBUG is True.
res = compose_embeddings_reactive(target, context)
# dims = res.nonzero()[1]
# explain_dims(dims, emb_nnse)

[Words] apple, delicious, ate, eat
[('d868', 'peach, pear, raspberry, plum, mango', '0.76207'),
 ('d2095', 'oreo, crumb, kreme, krispy, shortbread', '0.60238'),
 ('d2203', 'godiva, starbucks, chocolate, candy, nestle', '0.21057'),
 ('d2239', 'rabe, raab, cheese, recipe, sauce', '0.10977')]


## `load_dataset`
![wsd-dataset](./images/wsd-dataset.png)

In [19]:
def load_dataset(file_path):
    """Read an evaluation dataset from a CSV file.

    Args:
        file_path (str): Location of the dataset; handed to `pandas.read_csv`
            as is.
    Returns:
        pandas.DataFrame: The parsed dataset.
    """
    dataset_frame = pd.read_csv(file_path)
    return dataset_frame

In [20]:
# Example of loading a dataset
f = './data/sense-2017/all/2_senses_dev_Noun.csv'
dataset = load_dataset(f)
dataset.head()

Unnamed: 0,Lexeme,Target Sense Definition,Target Sense Sentence,Example Definition Sense 1,Example Sentence Sense 1,Example Definition Sense 2,Example Sentence Sense 2,Data Source
0,shadow,"used in reference to proximity, ominous oppres...",Uncertainty prevails in the shadows of the Ira...,"used in reference to proximity, ominous oppres...",We've lived in the shadow of the seven-inch si...,a weak or inferior remnant or version of somet...,"She had realized then, as she realized now, th...",Oxford
1,bond,"an insurance policy held by a company, which p...",Each union must buy an insurance bond to prote...,"an insurance policy held by a company, which p...",Insurance company capital-protected guaranteed...,"an agreement with legal force, in particular:","In this case, the defendant, a dyer, had given...",Oxford
2,future,a period of time following the moment of speak...,We cannot rule out the possibility of a conspi...,a period of time following the moment of speak...,Pat plans to release a further single in the n...,contracts for assets (especially commodities o...,"Then as soon as the cash market closed, the S&...",Oxford
3,knowledge,"true, justified belief; certain understanding,...",So the true question of objective knowledge is...,"true, justified belief; certain understanding,...","As a rationalist, he believed that the only pa...",the sum of what is known,He does experimental and anthropological resea...,Oxford
4,art,the expression or application of human creativ...,If the show can be taken as a barometer of vis...,the expression or application of human creativ...,A lot of people who know nothing about art say...,works produced by human creative skill and ima...,The most economical way to sum it all up is wi...,Oxford


# Main

In [21]:
# Load the embeddings
# NOTE(review): both matrices and the spaCy model are also loaded in earlier
# cells; this cell repeats those loads.
emb_word2vec = load_word2vec()
emb_nnse = load_nnse()

# Load the spaCy model (used to extract context words)
nlp = spacy.load('en')

In [22]:
# NOTE(review): disabled helper; the evaluation loop currently passes all
# three tags ('NOUN', 'VERB', 'ADJ') as context_pos instead of using it.
# def get_target_word_pos(test_case):
#     """Determine the target word's part of speech from the test-case name.
    
#     The proposed method uses as context words only the parts of speech among (noun, verb, adjective) other than the target word's own.
    
#     Args:
#         test_case (str): Test-case name (e.g. "2_senses_dev_Adjective")
#     Returns:
#         str: One of 'NOUN', 'VERB', 'ADJ' (spaCy tag format)
#     """
#     if 'Noun' in test_case:
#         return 'NOUN'
#     elif 'Verb' in test_case:
#         return 'VERB'
#     elif 'Adjective' in test_case:
#         return 'ADJ'

In [50]:
if __name__ == '__main__':
    # Evaluation driver: for every dataset file, compose (target word +
    # context words) embeddings for the target sentence and for each option
    # sentence, then check whether option 1 is the most similar one.

    # Dataset files to evaluate (dev split, noun targets). Sorted so that
    # test cases are processed and reported in a reproducible order.
    files = sorted(
        f for f in glob('./data/sense-2017/all/*.csv')
        if 'dev' in f and 'Noun' in f
    )

    # --- Loop invariants (built once instead of once per row) ---
    # spaCy Universal Part-of-speech Tags (https://spacy.io/api/annotation)
    context_pos = {'NOUN', 'VERB', 'ADJ'}
    compose_methods = {
        'sum': compose_embeddings_sum,
        'reactive': compose_embeddings_reactive
    }
    emb_matrices = {'word2vec': emb_word2vec, 'nnse': emb_nnse}
    # The reactive composition is evaluated on NNSE only.
    emb_types_by_compose = {'sum': ['word2vec', 'nnse'], 'reactive': ['nnse']}

    # For each dataset
    corrects_all = {}
    debugs_all = {}
    for f in files:
        test_case = os.path.basename(f).split('.')[0]
        print('[{}]'.format(test_case))

        # Load the dataset
        dataset = load_dataset(f)

        corrects_testcase = defaultdict(list)  # per row: whether the answer was correct
        debugs_testcase = defaultdict(list)    # per row: debug info (composed vectors, option scores, ...)
        # Register the containers up front: they are filled in place, so
        # partial results stay inspectable even if the run is interrupted.
        corrects_all[test_case] = corrects_testcase
        debugs_all[test_case] = debugs_testcase

        # 1. For each row of the dataset,
        for i, row in dataset.iterrows():
            # 2. Extract the target word and the sentences.
            #    sentences[0] = target sentence, sentences[1:] = option sentences
            target_word = row['Lexeme'].strip()
            sentences = [row[col].strip() for col in row.keys() if 'Sentence' in col]
            if DEBUG:
                print('<{}>'.format(target_word))

            # Context words per sentence (the target word itself is removed)
            contexts = [[cxt for cxt in extract_context_words(sent, nlp, context_pos) if cxt != target_word]
                        for sent in sentences]

            # 3. Embeddings of the target word and of each sentence's context
            #    words, for both embedding types.
            embeddings = {
                emb_type: {
                    'target': get_embeddings_for_words([target_word], matrix),
                    'contexts': [get_embeddings_for_words(cxt, matrix) for cxt in contexts]
                }
                for emb_type, matrix in emb_matrices.items()
            }

            # 5. Compare embedding types and composition methods
            #    (word2vec vs. NNSE, sum vs. reactive):
            #    (target & contexts[0]) VS (target & contexts[1]), (target & contexts[2]), ...
            #    contexts[0] comes from the target sentence, the rest from option sentences.
            for compose_type in ['sum', 'reactive']:
                compose_method = compose_methods[compose_type]

                for emb_type in emb_types_by_compose[compose_type]:
                    emb_sets = embeddings[emb_type]

                    # Current setting (format: "composition_embeddingtype")
                    setting = '{}_{}'.format(compose_type, emb_type)

                    # Skip when the target word is missing from the embedding matrix
                    if emb_sets['target'].size == 0:
                        continue

                    # Composed embedding for the target sentence
                    target_composed = compose_method(emb_sets['target'], emb_sets['contexts'][0])
                    # Composed embedding for each option sentence
                    options_composed = [compose_method(emb_sets['target'], cxt) for cxt in emb_sets['contexts'][1:]]

                    # Cosine similarity between the target and every option.
                    # Option 1 is always the correct answer.
                    # Reset first so a failure below can never leave the
                    # previous row's scores (or an unbound name) in the
                    # debug record.
                    option_scores = None
                    try:
                        option_scores = [cosine_similarity(target_composed, opt)
                                         for opt in options_composed]
                        # argmax returns the first maximum, so on the plain
                        # list a tie (e.g. all scores equal after a bad
                        # computation) would pick index 0 and be judged
                        # correct. Searching the reversed list requires
                        # option 1 to be strictly best.
                        last_option_idx = len(option_scores) - 1
                        option_scores_rev = np.flip(option_scores, axis=0)
                        correct = bool(np.argmax(option_scores_rev) == last_option_idx)
                    except ValueError:  # with NNSE, sometimes no context word vector is found at all
                        # Count as wrong
                        correct = False

                    # Store the row result
                    corrects_testcase[setting].append(correct)
                    debugs_testcase[setting].append({
                        'target_composed': target_composed,
                        'options_composed': options_composed,
                        'option_scores': option_scores
                    })

        # Print the scores (per test case)
        for setting, corrects in corrects_testcase.items():
            n_correct = sum(corrects)
            n_all = len(corrects)
            final_score = n_correct / n_all
            print('{}: \t{:.4f} ({} / {})'.format(setting, final_score, n_correct, n_all))
        print('')

[2_senses_dev_Noun]
sum_word2vec: 	0.7278 (123 / 169)
sum_nnse: 	0.5503 (93 / 169)
reactive_nnse: 	0.3018 (51 / 169)

[3_senses_dev_Noun]


KeyboardInterrupt: 

In [None]:
# debugs_all['2_senses_dev_Verb']['reactive_nnse']