In [60]:
import pandas as pd
from tqdm import tqdm
from pprint import pprint

# 임베딩 로드
- `word2vec`: Word2Vec
- `nnse`: Non-negative Sparse Embedding (NNSE)

In [3]:
# Load the two embedding tables from parquet files. The `head()` previews
# that follow show one row per word and one column per dimension (d1..dN).
word2vec = pd.read_parquet('./data/embeddings/word2vec_300.parquet')
nnse = pd.read_parquet('./data/embeddings/nnse_2500.parquet')

In [4]:
# Preview the first rows of the Word2Vec table.
word2vec.head()

Unnamed: 0_level_0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d291,d292,d293,d294,d295,d296,d297,d298,d299,d300
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
in,0.070312,0.086914,0.087891,0.0625,0.069336,-0.108887,-0.081543,-0.154297,0.020752,0.131836,...,-0.168945,-0.088867,-0.080566,0.064941,0.061279,-0.047363,-0.058838,-0.047607,0.014465,-0.0625
for,-0.01178,-0.047363,0.044678,0.063477,-0.018188,-0.063965,-0.001312,-0.072266,0.064453,0.086426,...,-0.022583,0.003723,-0.08252,0.081543,0.007935,0.000477,0.018433,0.071289,-0.034912,0.02417
that,-0.015747,-0.02832,0.083496,0.050293,-0.110352,0.031738,-0.014221,-0.089844,0.117676,0.118164,...,-0.011292,-0.015625,-0.033447,-0.02063,-0.019409,0.063965,0.020142,0.006866,0.061035,-0.148438
is,0.00705,-0.073242,0.171875,0.022583,-0.132812,0.198242,0.112793,-0.10791,0.071777,0.020874,...,-0.233398,-0.036377,-0.09375,0.182617,0.0271,0.12793,-0.02478,0.01123,0.164062,0.106934
on,0.026733,-0.09082,0.027832,0.204102,0.006226,-0.090332,0.022583,-0.161133,0.132812,0.061035,...,0.026855,-0.027954,0.030884,0.040527,-0.130859,0.083008,0.015747,-0.116699,-0.029419,-0.070801


In [5]:
# Preview the first rows of the NNSE table.
nnse.head()

Unnamed: 0_level_0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d2491,d2492,d2493,d2494,d2495,d2496,d2497,d2498,d2499,d2500
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
expletive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
measles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
proven,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
perverted,0.004916,0.0,0.0,0.0,0.0,0.006557,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.005495,0.0,0.0,0.0,0.0
inconsequential,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 응집도 비교

## 차원 별 대표 단어 확인 (정성적)
* **차원의 대표 단어**: 해당 차원의 값이 큰 단어 순으로 정렬했을 때 top K개의 단어
* 임베딩 차원 개수
  * `word2vec`: 300차원
  * `nnse`: 2,500 차원

In [71]:
def _get_embeddings_for_words(words, embeddings):
    """Return the embedding rows for the given words.

    Words that do not appear in the index of ``embeddings`` are left out of
    the result instead of raising ``KeyError``.

    Args:
        words (list): Words to look up.
        embeddings (pandas.DataFrame): Embedding matrix indexed by word.

    Returns:
        pandas.DataFrame: A copy of the rows of ``embeddings`` whose index
        label is contained in ``words``.
    """
    # Restrict the lookup to labels that actually exist in the index.
    known_words = embeddings.index.intersection(words)
    selected = embeddings.loc[known_words]
    # Hand back a copy so callers cannot mutate the shared embedding table.
    return selected.copy()

def _explain_dim(dim, embeddings, k):
    """Return the representative words of a single embedding dimension.

    The representative words are the ``k`` words with the largest value in
    column ``dim``, ordered from the largest value downwards.

    Args:
        dim (str): Column label of the dimension, e.g. ``'d1'``
            (``'d1'`` .. ``'d2500'`` for nnse).
        embeddings (pandas.DataFrame): Word embeddings, indexed by word with
            one column per dimension.
        k (int): Number of top words to return.

    Returns:
        tuple: ``(dim, words)`` where ``words`` is a list of at most ``k``
        index labels.
    """
    # Sort only the requested column. Sorting the whole frame would reorder
    # every other column too, just to read the index off the result.
    ranked = embeddings[dim].sort_values(ascending=False)
    # Slice the index before converting so only k labels are materialised.
    return (dim, ranked.index[:k].tolist())

def explain_dims(dims, embeddings, k=5):
    """Pretty-print the representative words of several dimensions.

    Each entry of ``dims`` is passed to ``_explain_dim`` and the collected
    ``(dim, words)`` pairs are printed with ``pprint``. Nothing is returned.

    Args:
        dims (list): Dimension labels to inspect, handed one by one to
            ``_explain_dim``.
        embeddings (pandas.DataFrame): Full word-embedding table.
        k (int): Number of representative words per dimension. Defaults to 5.
    """
    results = []
    for dim in dims:
        results.append(_explain_dim(dim, embeddings, k))
    pprint(results)

### word2vec
각 차원별로 일관된 개념이 드러나지 않음

In [72]:
# Print the top words (default k=5) of two word2vec dimensions.
dims_to_inspect = ['d4', 'd6']
explain_dims(dims_to_inspect, word2vec)

[('d4', ['declarer', 'repose', 'toppings', 'dictators', 'communism']),
 ('d6', ['mage', 'unclean', 'drinker', 'alcoholic', 'alibi'])]


### NNSE
각 차원별로 일관된 개념이 드러남

In [73]:
# Print the top words (default k=5) of two NNSE dimensions.
dims_to_inspect = ['d1703', 'd1255']
explain_dims(dims_to_inspect, nnse)

[('d1703', ['pharmacists', 'nurses', 'physicians', 'practitioners', 'doctors']),
 ('d1255',
  ['examine', 'investigate', 'investigating', 'exploring', 'examining'])]


## 차원 별 클러스터 응집도 확인 (정량적)
- 대표 단어 목록(클러스터)이 주어지면, 모든 단어 pair끼리의 `cosine` 유사도를 계산한 후 평균을 냄 (클러스터 질 평가)
- 평균 응집도 결과 (아래에서 계산)
  - word2vec: 0.1544
  - NNSE: 0.6443

In [74]:
from itertools import combinations
from scipy.spatial.distance import cosine

def caculate_similarity(w1, w2, embedding):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        w1: Index label of the first word in ``embedding``.
        w2: Index label of the second word in ``embedding``.
        embedding (pandas.DataFrame): Embedding table indexed by word.

    Returns:
        float: ``1 - distance`` where ``distance`` is the value returned by
        ``scipy.spatial.distance.cosine`` for the two rows.
    """
    vec_a = embedding.loc[w1]
    vec_b = embedding.loc[w2]
    # scipy returns a cosine *distance*; convert it to a similarity.
    distance = cosine(vec_a, vec_b)
    return 1 - distance

def calculate_cohesion(words, embedding):
    """Return the mean pairwise similarity of a cluster of words.

    Every unordered pair of ``words`` is scored with ``caculate_similarity``
    and the scores are averaged.

    Args:
        words (list): Words forming the cluster.
        embedding (pandas.DataFrame): Embedding table indexed by word.

    Returns:
        float: Average similarity over all pairs, or ``0.0`` when the
        cluster holds fewer than two words (no pair can be formed).
    """
    if len(words) < 2:
        return 0.0
    pair_scores = [
        caculate_similarity(first, second, embedding)
        for first, second in combinations(words, 2)
    ]
    return sum(pair_scores) / len(pair_scores)

def get_avg_score(scores):
    """Return the arithmetic mean of ``scores``.

    Args:
        scores (list): Numeric values. Must be non-empty; an empty list
            makes the division raise ``ZeroDivisionError``.

    Returns:
        The sum of ``scores`` divided by their count.
    """
    total = sum(scores)
    count = len(scores)
    return total / count

### word2vec

In [75]:
# Column labels of every word2vec dimension: 'd1' .. 'd300'.
dims = list(map('d{}'.format, range(1, 301)))
dims[0], dims[-1]

('d1', 'd300')

In [76]:
# Score every word2vec dimension: take its top-5 words and measure how
# similar those words are to each other.
embeddings = word2vec
scores = []
for dim in dims:
    _, top_words = _explain_dim(dim, embeddings, k=5)
    scores.append((dim, calculate_cohesion(top_words, embeddings)))

In [82]:
# Average the per-dimension cohesion values. The scoring cell stores its
# (dim, score) pairs in `scores`; the previously referenced
# `word2vec_scores` is never defined, so a top-to-bottom run raised NameError.
avg_score = get_avg_score([score for _, score in scores])
print('평균 응집도: {:.4f}'.format(avg_score))

평균 응집도: 0.1544


### NNSE

In [54]:
# Column labels of every NNSE dimension: 'd1' .. 'd2500'.
dims = list(map('d{}'.format, range(1, 2501)))
dims[0], dims[-1]

('d1', 'd2500')

In [83]:
# Score every NNSE dimension: take its top-5 words and measure how similar
# those words are to each other.
embeddings = nnse
scores = []
for dim in dims:
    _, top_words = _explain_dim(dim, embeddings, k=5)
    scores.append((dim, calculate_cohesion(top_words, embeddings)))

In [84]:
# Average the per-dimension cohesion values collected in `scores`.
avg_score = get_avg_score([score for _, score in scores])
print('평균 응집도: {:.4f}'.format(avg_score))

평균 응집도: 0.6443
