In [3]:
import functools
import os
import weakref
from collections import defaultdict
from glob import glob
from pprint import pprint

import numpy as np
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Helper

## `load_word2vec`, `load_nnse`

In [4]:
def load_word2vec():
    """Read the word2vec embedding matrix from its parquet file.

    Returns:
        pandas.DataFrame: The frame stored in
        ``./data/embeddings/word2vec_300.parquet``. Expected layout: the index
        holds the words and each column holds one embedding dimension.
    """
    return pd.read_parquet('./data/embeddings/word2vec_300.parquet')

def load_nnse():
    """Read the NNSE embedding matrix from its parquet file.

    Returns:
        pandas.DataFrame: The frame stored in
        ``./data/embeddings/nnse_2500.parquet``. Expected layout: the index
        holds the words and each column holds one embedding dimension.
    """
    return pd.read_parquet('./data/embeddings/nnse_2500.parquet')

In [23]:
# Load the NNSE embedding matrix once; the cells below reuse `emb_nnse`.
emb_nnse = load_nnse()
# Preview the first rows of the matrix.
emb_nnse.head()

Unnamed: 0_level_0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d2491,d2492,d2493,d2494,d2495,d2496,d2497,d2498,d2499,d2500
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
expletive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
measles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
proven,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
perverted,0.004916,0.0,0.0,0.0,0.0,0.006557,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.005495,0.0,0.0,0.0,0.0
inconsequential,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## `get_embeddings_for_words`

In [6]:
def get_embeddings_for_words(words, embeddings):
    """Look up the embedding rows for the given words.

    Args:
        words (list): Words to look up.
        embeddings (pandas.DataFrame): Embedding matrix indexed by word
            (the layout returned by `load_word2vec` / `load_nnse`).
    Returns:
        pandas.DataFrame: A copy of the rows whose index label appears in
        `words`.

        Words that are missing from the embedding matrix are left out.
    """
    # Keep only the labels that actually exist in the matrix.
    known_words = embeddings.index.intersection(words)
    selected_rows = embeddings.loc[known_words]
    return selected_rows.copy()

In [7]:
# Example lookup: three words expected in the matrix plus one that is not.
words = ['standards', 'differs', 'including', 'abcc'] # 'abcc' is not in the matrix
word_embeddings = get_embeddings_for_words(words, emb_nnse)
# Display the rows that were found.
word_embeddings

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d2491,d2492,d2493,d2494,d2495,d2496,d2497,d2498,d2499,d2500
including,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
standards,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
differs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## `compose_embeddings_sum` (Baseline)

In [11]:
def compose_embeddings_sum(target_embs, context_embs):
    """[Baseline] Compose word embeddings by plain addition.

    All rows of both frames are added together, dimension by dimension.

    Args:
        target_embs (pandas.DataFrame): Embeddings to compose, as returned by
            `get_embeddings_for_words`.
        context_embs (pandas.DataFrame): Embeddings to compose, as returned by
            `get_embeddings_for_words`.
    Returns:
        numpy.ndarray: The summed embedding as a fresh array of shape
        (1, #_of_dimensions).
    """
    # Work on private copies so the callers' frames are never touched.
    stacked = pd.concat([target_embs.copy(), context_embs.copy()])
    column_totals = stacked.sum(axis=0)
    return column_totals.values.reshape(1, -1).copy()

In [12]:
# The composed result has shape (1, #_of_dimensions)
compose_embeddings_sum(word_embeddings.iloc[1:], word_embeddings).shape

(1, 2500)

## `compose_embeddings_reactive` (Proposed)

In [13]:
def softmax(inputs):
    """Compute the softmax of `inputs`.

    The largest score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps `np.exp` from overflowing (and
    the result from becoming NaN) when the scores are large.

    Args:
        inputs (array-like): Scores to normalise. The softmax is taken over
            all elements of the input.
    Returns:
        numpy.ndarray: Non-negative weights with the same shape as `inputs`
        that sum to 1. An empty input yields an empty array.
    """
    scores = np.asarray(inputs, dtype=float)
    if scores.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return scores.copy()

    # exp(x - max) / sum(exp(x - max)) == exp(x) / sum(exp(x)), without overflow.
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / np.sum(exp_scores)

DEBUG = False
def compose_embeddings_reactive(target_embs, context_embs):
    """[Proposed] Compose word embeddings so that the context gates the target.

    1. Sum the rows of `target_embs` and the rows of `context_embs` into one
       vector each.
    2. Multiply the two vectors element-wise, so a dimension stays non-zero
       only when it is non-zero in both the target and the context.
    3. Rescale the product with `sklearn.preprocessing.normalize`
       (L2 norm by default).

    Args:
        target_embs (pandas.DataFrame): Embeddings to compose, as returned by
            `get_embeddings_for_words`.
        context_embs (pandas.DataFrame): Embeddings to compose, as returned by
            `get_embeddings_for_words`.
    Returns:
        numpy.ndarray: The composed (contextualized) embedding, with shape
        (1, #_of_dimensions).
    Raises:
        ValueError: If either argument is not a `pandas.DataFrame`.
    """
    # Both inputs must be frames; rejecting only the "both wrong" case would
    # let a single bad argument fail later with an unrelated AttributeError.
    if not (isinstance(target_embs, pd.DataFrame) and isinstance(context_embs, pd.DataFrame)):
        raise ValueError('target_embs and context_embs must be DataFrame')

    # Collapse each frame into a single 1D vector. `sum()` builds new objects,
    # so the callers' frames are left untouched without defensive copies.
    target_vec = target_embs.sum().values
    context_vec = context_embs.sum().values

    # target * context
    gated = np.multiply(target_vec, context_vec)

    # NOTE: zeroing weak dimensions (values below 0.001) was tried at this
    # point and is currently disabled.

    # Return value
    result = normalize(gated.reshape(1, -1))

    # Debugging: relies on the notebook-level `explain_dims` and `emb_nnse`.
    if DEBUG:
        print('[Words]', ', '.join(target_embs.index.tolist() + context_embs.index.tolist()))
        explain = explain_dims(result.nonzero()[1], result.flatten(), emb_nnse)
        pprint(explain)

    return result

## `_explain_dim`, `explain_dims`, `get_sig_dims`

In [25]:
# Cache of top-ranked words per dimension.
# Key: (id(embeddings), column name, k); value: (weak reference to the frame, words).
desc_cache = {}

def _top_words_for_dim(col, embeddings, k):
    """Return the `k` index labels with the largest values in column `col`.

    Results are cached per embedding matrix, column and `k`, so explaining the
    same dimension again does not re-sort the column. The cache does not notice
    in-place changes to `embeddings`.
    """
    key = (id(embeddings), col, k)
    entry = desc_cache.get(key)
    # id() values can be recycled after a frame is garbage-collected, so the
    # weak reference confirms that the entry really belongs to this frame.
    if entry is not None and entry[0]() is embeddings:
        return entry[1]

    # Only the requested column is ranked; the rest of the frame is irrelevant.
    ranked = embeddings[col].sort_values(ascending=False)
    words = ranked.index[:k].tolist()
    desc_cache[key] = (weakref.ref(embeddings), words)
    return words

def _explain_dim(index, emb, embeddings, k=5):
    """Describe one dimension of `emb`.

    Args:
        index (int): Zero-based dimension index; it maps to column 'd{index+1}'.
        emb (numpy.array): 1D embedding whose value at `index` is reported.
        embeddings (pandas.DataFrame): Embedding matrix used to find the words
            that rank highest on this dimension.
        k (int): Number of top-ranked words to include.
    Returns:
        tuple: (column name, comma-separated top words, emb[index]).
    """
    col = 'd{}'.format(index+1)
    desc = _top_words_for_dim(col, embeddings, k)
    return (col, ', '.join(desc), emb[index])

def explain_dims(indices, emb, embeddings, k=5):
    """Describe several dimensions of `emb`, strongest first.

    Args:
        indices (iterable of int): Zero-based dimension indices to explain.
        emb (numpy.array): 1D embedding that supplies the reported values.
        embeddings (pandas.DataFrame): Embedding matrix used to look up the
            top-ranked words of each dimension.
        k (int): Number of top-ranked words listed per dimension.
    Returns:
        list: Tuples of (column name, comma-separated top words, value
        formatted with five decimals), sorted by value in descending order.
    """
    results = [_explain_dim(i, emb, embeddings, k) for i in indices]
    results = sorted(results, key=lambda item: item[2], reverse=True)
    return [(col, desc, '{:.5f}'.format(value)) for col, desc, value in results]

def get_sig_dims(emb, thres=0.01):
    """Return the indices of the dimensions whose value is greater than `thres`.

    Args:
        emb (numpy.array): Embedding to inspect. Must be 1D; other 1D
            array-likes are converted with `np.asarray`.
        thres (float): Exclusive lower bound. Python and NumPy ints/floats are
            accepted; bools and non-numeric values are rejected.
    Returns:
        numpy.array: Integer indices of the significant dimensions, in
        ascending order (e.g. array([1, 3])).
    Raises:
        ValueError: If `emb` is not one-dimensional or `thres` is not a number.
    """
    emb = np.asarray(emb)
    if emb.ndim != 1:
        raise ValueError('`emb` argument should be 1D array')
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(thres, bool) or not isinstance(thres, (int, float, np.integer, np.floating)):
        raise ValueError('`thres` argument should be a real number')

    return np.flatnonzero(emb > thres)

In [43]:
# Flatten the single 'apple' row into a 1D vector.
emb = get_embeddings_for_words(['apple'], emb_nnse).values.flatten()
# Indices of the dimensions above the default threshold of `get_sig_dims`.
dims = get_sig_dims(emb)
# Show the five strongest of those dimensions with their top-ranked words.
explain_dims(dims, emb, emb_nnse)[:5]

[('d46', 'mac, macs, imac, macintosh, itunes', '0.74074'),
 ('d868', 'peach, pear, raspberry, plum, mango', '0.34687'),
 ('d111', 'crave, atlas, biz, chow, notebooks', '0.22558'),
 ('d2104', 'alcatel, lg, motorola, samsung, sony', '0.11278'),
 ('d1351', 'hewlett, packard, xerox, hp, compaq', '0.09343')]

# Main
주어진 다의어(예: apple)의 임베딩이, 서로 다른 의미(sense)를 암시하는 문맥에 따라 어떻게 달라지는지를 **해석 가능한 형태**로 보여줌

In [49]:
# Embedding row of the polysemous target word; the context cells below reuse it.
target = get_embeddings_for_words(['apple'], emb_nnse)

In [47]:
# Context words chosen to suggest the consumer-electronics sense of 'apple'.
context = get_embeddings_for_words(['electronics', 'latest', 'fast'], emb_nnse)

# Compose target and context, then collect the dimensions that stay non-zero.
res = compose_embeddings_reactive(target, context)
dims = res.nonzero()[1]
# The reported values are the target's original weights, not the composed ones.
explain_dims(dims, target.values.flatten(), emb_nnse)

[('d2104', 'alcatel, lg, motorola, samsung, sony', '0.11278'),
 ('d1724', 'aac, ogg, rm, ripper, converter', '0.06844'),
 ('d872', 'bea, microsoft, enterprise, ria, oracle', '0.05528'),
 ('d1778', 'cool, crazy, gadget, animation, sexy', '0.01466'),
 ('d1478', 'amoeba, rampage, shootout, buster, swat', '0.00589')]

In [48]:
# Context words chosen to suggest the food sense of 'apple'.
context = get_embeddings_for_words(['eat', 'ate', 'delicious'], emb_nnse)

# Compose target and context, then collect the dimensions that stay non-zero.
res = compose_embeddings_reactive(target, context)
dims = res.nonzero()[1]
# The reported values are the target's original weights, not the composed ones.
explain_dims(dims, target.values.flatten(), emb_nnse)

[('d868', 'peach, pear, raspberry, plum, mango', '0.34687'),
 ('d2203', 'godiva, starbucks, chocolate, candy, nestle', '0.05765'),
 ('d2095', 'oreo, crumb, kreme, krispy, shortbread', '0.03297'),
 ('d2239', 'rabe, raab, cheese, recipe, sauce', '0.02305')]