In [195]:
import pandas as pd
import spacy
import numpy as np
import os
from glob import glob
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Helper

## `extract_context_words`

In [30]:
def extract_context_words(text, nlp):
    """입력된 text로부터 명사/동사 추출
    
    Args:
        text (str): Context 추출할 텍스트
        nlp (spacy.lang.en.English): spaCy 모델
    Returns:
        list: 텍스트에서 추출한 명사/동사 리스트. ['단어 소문자'] 형태. stopword 제외
    """
    doc = nlp(text)
#     return [(tok.text.lower(), tok.pos_) for tok in doc if tok.pos_ in ('NOUN', 'VERB')]
    return [tok.text.lower() for tok in doc if tok.pos_ in ('NOUN', 'VERB') and not tok.is_stop]

In [31]:
text = 'Tokenization standards are based on the OntoNotes 5 corpus. The tokenizer differs from most by including tokens for significant whitespace.'
nlp = spacy.load('en')
extract_context_words(text, nlp)

['tokenization',
 'standards',
 'based',
 'corpus',
 'tokenizer',
 'differs',
 'including',
 'tokens',
 'whitespace']

## `load_word2vec`, `load_nnse`

In [5]:
def load_word2vec():
    """word2vec 임베딩 행렬을 `pandas.DataFrame` 형태로 로드
    
    Returns:
        pandas.DataFrame: Index: 단어, Column: 차원 값 형태
    """
    embeddings = pd.read_parquet('../data/embeddings/word2vec_300.parquet')
    return embeddings

def load_nnse():
    """NNSE 임베딩 행렬을 `pandas.DataFrame` 형태로 로드
    
    Returns:
        pandas.DataFrame: Index: 단어, Column: 차원 값 형태
    """
    embeddings = pd.read_parquet('../data/embeddings/nnse_2500.parquet')
    return embeddings

In [95]:
emb = load_word2vec()
emb.head()

Unnamed: 0_level_0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d291,d292,d293,d294,d295,d296,d297,d298,d299,d300
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
in,0.070312,0.086914,0.087891,0.0625,0.069336,-0.108887,-0.081543,-0.154297,0.020752,0.131836,...,-0.168945,-0.088867,-0.080566,0.064941,0.061279,-0.047363,-0.058838,-0.047607,0.014465,-0.0625
for,-0.01178,-0.047363,0.044678,0.063477,-0.018188,-0.063965,-0.001312,-0.072266,0.064453,0.086426,...,-0.022583,0.003723,-0.08252,0.081543,0.007935,0.000477,0.018433,0.071289,-0.034912,0.02417
that,-0.015747,-0.02832,0.083496,0.050293,-0.110352,0.031738,-0.014221,-0.089844,0.117676,0.118164,...,-0.011292,-0.015625,-0.033447,-0.02063,-0.019409,0.063965,0.020142,0.006866,0.061035,-0.148438
is,0.00705,-0.073242,0.171875,0.022583,-0.132812,0.198242,0.112793,-0.10791,0.071777,0.020874,...,-0.233398,-0.036377,-0.09375,0.182617,0.0271,0.12793,-0.02478,0.01123,0.164062,0.106934
on,0.026733,-0.09082,0.027832,0.204102,0.006226,-0.090332,0.022583,-0.161133,0.132812,0.061035,...,0.026855,-0.027954,0.030884,0.040527,-0.130859,0.083008,0.015747,-0.116699,-0.029419,-0.070801


In [96]:
emb.head().index.tolist()

['in', 'for', 'that', 'is', 'on']

## `get_embeddings_for_words`

In [97]:
def get_embeddings_for_words(words, embeddings):
    """주어진 단어들의 임베딩을 `pandas.DataFrame` 형태로 반환
    
    Args:
        words (list): 단어 리스트
        embeddings (pandas.DataFrame): 임베딩 행렬 (`load_word2vec`, `load_nnse` 반환 형태)
    Returns:
        pandas.DataFrame: 주어진 단어들의 임베딩
        
        단어가 임베딩 행렬에 없는 경우 제외
    """
    return embeddings.loc[embeddings.index.intersection(words)]

In [98]:
words = ['standards', 'differs', 'including', 'abcc'] # 'abcc'는 행렬에 없음
word_embeddings = get_embeddings_for_words(words, emb)
word_embeddings

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d291,d292,d293,d294,d295,d296,d297,d298,d299,d300
including,-0.060547,0.073242,0.024048,-0.01709,0.108398,0.002121,-0.038086,0.038818,0.078613,0.074707,...,-0.003387,-0.035156,0.052979,0.030029,-0.019531,-0.114746,0.102051,0.022339,0.012268,0.126953
standards,-0.251953,0.124023,0.392578,0.283203,-0.09375,0.065918,0.027588,-0.063477,0.086426,0.045166,...,0.046143,0.004761,-0.273438,0.07959,0.085449,0.053223,0.168945,-0.164062,0.057861,0.012207
differs,0.017212,-0.08252,0.074707,-0.001503,-0.069336,0.330078,0.121094,0.049561,0.201172,-0.037598,...,-0.049805,-0.121094,-0.025513,0.174805,-0.104492,0.122559,0.162109,-0.285156,-0.09082,0.384766


## `compose_embeddings_sum` (Baseline)

In [99]:
def compose_embeddings_sum(target_embs, context_embs):
    """[Baseline] 주어진 단어 임베딩들의 합을 `pandas.DataFrame` 형태로 반환.
    
    단순 덧셈
    
    Args:
        target_embs (pandas.DataFrame): 합성할 단어 임베딩. `get_embeddings_for_words`의 반환값
        context_embs (pandas.DataFrame): 합성할 단어 임베딩. `get_embeddings_for_words`의 반환값
    Returns:
        pandas.DataFrame: 합성된 단어 임베딩 (덧셈). shape은 (1, #_of_dimensions)
    """
    embs = pd.concat([target_embs, context_embs])
    return embs.sum(axis=0).values.reshape(1, -1)

In [101]:
compose_embeddings_sum(word_embeddings.iloc[1:], word_embeddings).shape

(1, 300)

## `compose_embeddings_reactive` (My)

In [200]:
def softmax(inputs):
    """
    Calculate the softmax for the give inputs (array)
    :param inputs:
    :return:
    """
    return np.exp(inputs) / float(sum(np.exp(inputs)))

def compose_embeddings_reactive(target_embs, context_embs):
    """[Proposing] 주어진 단어 임베딩들의 contextualized 합을 `pandas.DataFrame` 형태로 반환.
    
    1. 
    2. 
    3. 
    
    Args:
        target_embs (pandas.DataFrame): 합성할 단어 임베딩. `get_embeddings_for_words`의 반환값
        context_embs (pandas.DataFrame): 합성할 단어 임베딩. `get_embeddings_for_words`의 반환값
    Returns:
        pandas.DataFrame: 합성된 단어 임베딩 (Contextualized). shape은 (1, #_of_dimensions)
    """
    #  target에서 0이 아닌 차원 파악
    target_nonzero = target_embs.values.nonzero()
    
    # context에서 0이 아닌 차원을 값이 큰 순으로 정렬한 후, 상위 top_dim_k 만큼만 파악.
    # top_dim_k은 차원_갯수*0.01 (예: 3000*0.01 = 30)
    top_dim_k = int(len(target_embs) * 0.01)
    context = context_embs.sum()
    context_top_nonzero = np.flip(np.argsort(context.values), axis=0)[:top_dim_k]
    
    # both_nonzero_dims : target_nonzero, context_top_nonzero의 교집합
    both_nonzero_dims = np.intersect1d(target_nonzero, context_top_nonzero)
    
    # target_embs의 나머지 차원 비활성화
    final_emb = target_embs.values
    for d in range(0, len(final_emb)):
        if d not in both_nonzero_dims:
#             print('Setting {} dim = 0.0'.format(d))
            final_emb[d] = 0.0
    
    # 다음 추가 처리 후 반환
    #   1. 정규화 (softmax)
    #   2. (1, #_of_dimensions) 형태로 reshape
    final_emb = softmax(final_emb.reshape(-1, 1))
    return final_emb.reshape(1, -1)

## `explain_dim`, `explain_dims`

In [152]:
def explain_dim(dim, embeddings, k=5):
    return embeddings.sort_values(by=dim, ascending=False).index.tolist()[:k]

def explain_dims(dims, embeddings, k=5):
    return [', '.join(explain_dim(d, embeddings, k=k)) for d in dims]

In [204]:
explain_dim('d2', emb_nnse)

['girls', 'boys', 'powerpuff', 'backstreet', 'gilmore']

## `load_dataset`
![wsd-dataset](./images/wsd-dataset.png)

In [14]:
def load_dataset(file_path):
    """테스트 할 데이터셋을 `pandas.DataFrame` 형태로 로드
    
    
    Args:
        file_path (str): 데이터셋 경로
    Returns:
        pandas.DataFrame: 로드한 데이터셋 DataFrame
    """
    return pd.read_csv(file_path)

In [13]:
f = '../data/sense-2017/all/2_senses_dev_Noun.csv'
dataset = load_dataset(f)
dataset.head()

Unnamed: 0,Lexeme,Target Sense Definition,Target Sense Sentence,Example Definition Sense 1,Example Sentence Sense 1,Example Definition Sense 2,Example Sentence Sense 2,Data Source
0,shadow,"used in reference to proximity, ominous oppres...",Uncertainty prevails in the shadows of the Ira...,"used in reference to proximity, ominous oppres...",We've lived in the shadow of the seven-inch si...,a weak or inferior remnant or version of somet...,"She had realized then, as she realized now, th...",Oxford
1,bond,"an insurance policy held by a company, which p...",Each union must buy an insurance bond to prote...,"an insurance policy held by a company, which p...",Insurance company capital-protected guaranteed...,"an agreement with legal force, in particular:","In this case, the defendant, a dyer, had given...",Oxford
2,future,a period of time following the moment of speak...,We cannot rule out the possibility of a conspi...,a period of time following the moment of speak...,Pat plans to release a further single in the n...,contracts for assets (especially commodities o...,"Then as soon as the cash market closed, the S&...",Oxford
3,knowledge,"true, justified belief; certain understanding,...",So the true question of objective knowledge is...,"true, justified belief; certain understanding,...","As a rationalist, he believed that the only pa...",the sum of what is known,He does experimental and anthropological resea...,Oxford
4,art,the expression or application of human creativ...,If the show can be taken as a barometer of vis...,the expression or application of human creativ...,A lot of people who know nothing about art say...,works produced by human creative skill and ima...,The most economical way to sum it all up is wi...,Oxford


# Main

In [90]:
# 임베딩 로드
emb_word2vec = load_word2vec()
emb_nnse = load_nnse()

# spaCy 모델(문맥 단어 추출에 사용) 로드
nlp = spacy.load('en')

In [224]:
if __name__ == '__main__':    
    # 데이터셋 파일 목록 읽어오기
    files = glob('../data/sense-2017/all/*.csv')
#     files = [f for f in files if 'Noun' in f]
#     files = files[5:7]
    
    # 각 데이터셋 별로 실험 수행
    for f in files:
        test_case = os.path.basename(f).split('.')[0]
        print('[{}]'.format(test_case))
        
        # 데이터셋 DataFrame 로드
        dataset = load_dataset(f)
        
        # 1. 데이터셋 각 행(case) 별로,
        corrects_all = defaultdict(list) # row 별로 True, False 여부 저장 (계산은 for-loop 마지막에)
        for _, row in dataset.iterrows():
            # 2. Target 단어 및 Sentence(Target, Option #) 추출
            #     sentences[0] = target-sentence, sentences[1:] = option-sentences
            target_word = row['Lexeme']
            sentences = [row[col] for col in row.keys() if 'Sentence' in col]
            
            # 3. sentences에서 context 단어(명사, 동사) 추출
            contexts = [extract_context_words(sent, nlp) for sent in sentences]
            
            # 4. Target 단어 및 context 단어들의 임베딩 확보 (`emb_word2vec`, `emb_nnse`)
            embeddings = {
                'word2vec': {
                    'target': None,
                    'contexts': [None, None]
                },
                'nnse': {
                    'target': None,
                    'contexts': [None, None]
                }
            }
            ## word2vec 임베딩 확보
            emb = emb_word2vec
            embeddings['word2vec']['target'] = get_embeddings_for_words([target_word], emb)
            embeddings['word2vec']['contexts'] = [get_embeddings_for_words(cxt, emb) for cxt in contexts]
            
            ## NNSE 임베딩 확보
            emb = emb_nnse
            embeddings['nnse']['target'] = get_embeddings_for_words([target_word], emb)
            embeddings['nnse']['contexts'] = [get_embeddings_for_words(cxt, emb) for cxt in contexts]
            
            # 5. 임베딩 유형 & 합성 방식 별 성능 비교 (word2vec vs. NNSE & sum vs. reactive)
            #     (target & contexts[0]) VS (target & contexts[1]), (target & contexts[2]), ...
            #     여기서 contexts[0]는 "Target Sentence"에서 추출된 문맥 임베딩, 나머지는 "Option Sentence"
            
            ## 합성 방식 별 비교
            compose_methods = {
                'sum': compose_embeddings_sum,
                'reactive': compose_embeddings_reactive
            }
            for compose_type in ['sum', 'reactive']:
                compose_method = compose_methods[compose_type]
                
                ## 임베딩 유형별 비교
                emb_types = ['word2vec', 'nnse'] if compose_type == 'sum' else ['nnse']
                for emb_type in emb_types:
                    # 주어진 유형의 임베딩 확보
                    emb_sets = embeddings[emb_type]

                    # Target 합성 임베딩 계산
                    target_composed = compose_method(emb_sets['target'], emb_sets['contexts'][0])
                    # Option 합성 임베딩 계산
                    options_composed = [compose_method(emb_sets['target'], cxt) for cxt in emb_sets['contexts'][1:]]

                    # Target과 Option 간 cosine similarity 점수 계산 & Option 별 점수 랭킹 계산
                    # Option 1이 항상 정답임
                    setting = '{}_{}'.format(compose_type, emb_type)
                    try:
                        option_scores = [cosine_similarity(target_composed, opt)
                                     for opt in options_composed]
                        correct = (np.argmax(option_scores) == 0)
                    
                        # Save result
                        corrects_all[setting].append(correct)
                    except ValueError: # NNSE의 경우 context 단어 벡터를 하나도 찾지 못하는 경우가 생김
#                         print('Error:', target_word, setting)
                        corrects_all[setting].append(False) # 틀렸다고 간주
                    
                
        # 점수 출력
        for setting, corrects in corrects_all.items():
            n_correct = sum(corrects)
            n_all = len(corrects)
            final_score = n_correct / n_all
            print('{}: \t{:.4f} ({} / {})'.format(setting, final_score, n_correct, n_all))
        print('')

[2_senses_dev_Adjective]
sum_word2vec: 	0.6970 (46 / 66)
sum_nnse: 	0.6212 (41 / 66)
reactive_nnse: 	1.0000 (66 / 66)

[5_senses_test_Adjective]
sum_word2vec: 	0.6087 (70 / 115)
sum_nnse: 	0.5217 (60 / 115)
reactive_nnse: 	1.0000 (115 / 115)

[2_senses_test_Adjective]
sum_word2vec: 	0.7129 (149 / 209)
sum_nnse: 	0.7033 (147 / 209)
reactive_nnse: 	1.0000 (209 / 209)

[2_senses_test_Noun]
sum_word2vec: 	0.7395 (457 / 618)
sum_nnse: 	0.6796 (420 / 618)
reactive_nnse: 	0.9984 (617 / 618)

[3_senses_test_Verb]
sum_word2vec: 	0.6384 (226 / 354)
sum_nnse: 	0.5367 (190 / 354)
reactive_nnse: 	0.9718 (344 / 354)

[3_senses_dev_Adjective]
sum_word2vec: 	0.5745 (27 / 47)
sum_nnse: 	0.5532 (26 / 47)
reactive_nnse: 	1.0000 (47 / 47)

[2_senses_test_Verb]
sum_word2vec: 	0.7078 (310 / 438)
sum_nnse: 	0.6689 (293 / 438)
reactive_nnse: 	0.9749 (427 / 438)

[3_senses_test_Noun]
sum_word2vec: 	0.6012 (300 / 499)
sum_nnse: 	0.5170 (258 / 499)
reactive_nnse: 	0.9960 (497 / 499)

[5_senses_test_Noun]
sum_wor