# Load packages, set the target word, and define the result collector

In [1]:
from gensim.models import Word2Vec
import os
import pandas as pd


# Target token whose vector and nearest neighbours are compared across models.
# It is looked up verbatim in each model, so it must use the same
# 'surface/POS' form as the model vocabulary (see the printed neighbours).
WORD = '탄소/Noun'
# Module-level accumulator that get_results() extends with one row per model.
# Columns: model name, word, top similar words, word vector.
FULL_DF = pd.DataFrame(columns = ['모델명', '단어', '상위유사단어', '단어벡터'])

def get_results(output_path):
    """Summarize WORD for every saved Word2Vec model found in a directory.

    Each file in ``output_path`` whose name ends with ``.model`` is loaded (in
    sorted path order). For every model the function prints the file name,
    ``len(model.wv)``, the vector of ``WORD`` and its five most similar words,
    then records one row in the module-level ``FULL_DF``.

    Args:
        output_path: Directory that contains the saved ``*.model`` files.

    Returns:
        None. Results are printed and appended to the global ``FULL_DF``.

    Note:
        The lookups ``model.wv[WORD]`` / ``most_similar(WORD)`` are not guarded;
        they are expected to fail (presumably with KeyError) when ``WORD`` is
        missing from a model's vocabulary.
    """
    global FULL_DF

    columns = ['모델명', '단어', '상위유사단어', '단어벡터']

    # Match on the suffix rather than a substring: array sidecar files that may
    # sit next to a model (e.g. "x.model.wv.vectors.npy") contain ".model" in
    # their name but cannot be passed to Word2Vec.load.
    model_paths = sorted(
        os.path.join(output_path, name)
        for name in os.listdir(output_path)
        if name.endswith('.model')
    )

    rows = []
    for p in model_paths:
        model = Word2Vec.load(p)
        # basename() handles both "/" and the platform separator.
        model_name = os.path.basename(p)
        vector = model.wv[WORD]
        sims = model.wv.most_similar(WORD, topn=5)
        top_sims_words = [word for word, _score in sims]

        print(model_name, ":", len(model.wv))
        print(vector)
        print(top_sims_words)
        print("")

        rows.append({
            '모델명': model_name.split(".")[0],
            '단어': WORD,
            '상위유사단어': ", ".join(top_sims_words),
            # Store the vector as text, components rounded to 8 decimals.
            '단어벡터': ", ".join([str(round(e, 8)) for e in vector.tolist()]),
        })

    # DataFrame.append was removed in pandas 2.0; build all rows first and
    # concatenate once instead of growing the frame inside the loop.
    if rows:
        FULL_DF = pd.concat(
            [FULL_DF, pd.DataFrame(rows, columns=columns)],
            ignore_index=True,
        )
    print(FULL_DF.shape)
    print("results from ", output_path)


# Set seed

- 데이터 수: 990
- 시드 고정

In [2]:
# Collect results for the models saved under the fixed-seed, 990-sample output directory.
output_path = "./outputs_seed1_990"
get_results(output_path)

all_990_seed1.model : 7262
[-0.21501355  0.2669373  -0.32793513  0.07109815 -0.7518032  -0.8021537
  0.5654463  -0.17608576 -0.26945484 -0.554673    0.57860345  0.02802632
  0.06986153 -0.05360684 -0.531066    0.579893    0.28824037  0.8274705
 -0.6469382  -0.3504321   0.04056122  0.49567458  0.23931411 -0.46893662
 -0.13795087  0.5248348   0.5166346   0.69318825 -0.51077867  0.00476885
  0.5873238  -0.40719798  0.04866548 -0.07508707  0.26885834 -0.34047025
  0.75728357 -0.01293586  0.91969186 -0.12008905 -0.35514733  0.00880679
  0.4294355  -0.07531354  1.002809    0.5857314   0.30497193 -0.99702966
  0.6217088  -0.59860957]
['멀다/Adjective', '골재/Noun', '격리/Noun', '감용/Noun', '자발/Noun']

train_30_update_70_990_seed1.model : 7262
[ 0.3684375  -0.3736557   0.02165631 -0.41454932 -0.14418234 -0.07064438
  0.25542328 -0.0935227   0.19597712 -0.4496196   0.31569654 -0.64828205
  0.16590478 -0.25670144 -0.27755073  0.6337057   0.13858724  0.5046689
 -0.83868694 -0.7531964   0.41343173  0.339

- 데이터 수: 3320
- 시드 고정

In [3]:
# Collect results for the models saved under the fixed-seed, 3320-sample output directory.
output_path = "./outputs_seed1_3320"
get_results(output_path)

all_3320_seed1.model : 17986
[-0.16041155  0.23855872 -0.55493844 -0.47462612  0.34123296  0.13563189
 -0.01137527  0.23113863  0.41259104  0.40133646  0.41251954  0.08797286
  0.40029907 -0.32512918 -0.30116737  0.5551749   0.41097632  0.47776797
  0.2434772   0.11936503  0.7927437  -0.05132389  0.6795554  -0.17256601
  1.0563965   0.37161583 -0.2458966  -0.17828856 -0.26526964 -0.07686883
 -1.0772463  -0.32448187 -0.25504467 -0.52509934 -0.83153874 -0.33376673
  0.10473426  0.6952363   0.23181695  0.20059322  0.49812192 -0.44599473
  0.18417273  0.6361174   0.6341749   0.29861695  0.6925677  -0.5846624
  0.10810143  0.11120477]
['풀러렌/Noun', '격리/Noun', '탄소지/Noun', '원광/Noun', '상관분석/Noun']

train_30_update_70_3320_seed1.model : 17986
[-0.27596137  0.7499639  -0.3646765  -0.06694357 -0.9395968  -0.46490133
  0.16081156 -0.09734522  0.03853091 -0.14129539  0.14292957 -0.25707564
  0.07982291  0.09203587  0.28592637  0.89334935  0.8431731   0.10742577
 -0.03505363  0.12475754  0.42302302 -

# No seed 

- 데이터 수: 990
- 랜덤

In [4]:
# Collect results for the models saved under the unseeded (random), 990-sample output directory.
get_results("./outputs_noseed_990")

all_990_noseed.model : 7262
[-0.2609587   0.32850745 -0.3824264  -0.09685676 -0.50311637 -1.1555929
  0.01685771 -0.20502585 -0.27093866 -0.5568868   0.19694622  0.14148402
  0.27729386 -0.22441389 -0.2795831   0.5891221   0.18436287  0.8387486
 -0.8005636  -0.33281502  0.04918722  0.40663272  0.25973195 -0.62311006
  0.13188201  0.44368142  0.47177732  0.47025272 -0.30343682  0.02461812
  0.72861576 -0.54343873 -0.20420477  0.05982861  0.157007   -0.4738067
  0.60381806  0.10469322  0.55346847 -0.40804482 -0.1034812   0.2791814
  0.3424535  -0.10203693  0.96590483  0.54965895  0.3899303  -0.81253
  0.7276483  -0.5840385 ]
['멀다/Adjective', '감용/Noun', '골재/Noun', '시너지/Noun', '흡수/Noun']

train_30_update_70_990_noseed.model : 7262
[ 0.06554076 -0.58604884 -0.1836017  -0.15875612 -0.04303745  0.05290389
  0.5364944   0.0025411   0.25715095 -0.62359536  0.6064984  -0.5991403
 -0.07735937 -0.09074619 -0.16602392  0.6710063  -0.4601954   0.6207025
 -0.6570269  -0.5892167   0.37046453  0.591120

- 데이터 수: 3320
- 랜덤

In [5]:
# Collect results for the models saved under the unseeded (random), 3320-sample output directory.
get_results("./outputs_noseed_3320")

all_3320_noseed.model : 17986
[-0.01652918  0.37617728 -0.28251225 -0.20184346  0.39182976 -0.04084512
 -0.04817214  0.2643307   0.45838472  0.33285108  0.4771449   0.06021675
  0.61087984 -0.05315985 -0.3723126   0.66278195  0.30943352 -0.0231518
  0.24638712  0.21192688  0.61412936 -0.1805209   0.8231063  -0.4545494
  1.1955192   0.01577133 -0.32565665 -0.30803475  0.00178305 -0.08137338
 -0.8968787  -0.3189287  -0.13367148 -0.6788903  -0.63821137 -0.52122474
  0.19077446  0.9348913   0.3799393   0.10696463  0.22108641 -0.5653261
  0.18141179  0.72299665  0.77951986  0.19592501  0.677823   -0.7284746
  0.10624928  0.15070105]
['원광/Noun', '고다/Noun', '나노튜브/Noun', '풀러렌/Noun', '화나다/Verb']

train_30_update_70_3320_noseed.model : 17986
[-3.2395241e-01  6.2532812e-01 -4.4380629e-01 -1.5480158e-01
 -6.6452378e-01 -7.9327798e-01  3.1010067e-01 -1.4417207e-01
  2.7341071e-01 -2.1297278e-01 -2.5940619e-02 -2.0428963e-01
  1.9976461e-01  1.5854344e-01 -9.5120832e-02  1.0058761e+00
  6.1725897e-0

# 결과 출력

In [6]:
def detect_seed(string):
    """Return "X" when the last "_"-separated token is 'noseed', else "O"."""
    suffix = string.rsplit("_", 1)[-1]
    if suffix == 'noseed':
        return "X"
    return "O"


def detect_datacount(string):
    """Return the second-to-last "_"-separated token of a model name.

    Raises IndexError when the name contains no underscore.
    """
    tokens = string.split("_")
    return tokens[-2]


def detect_types(string):
    """Describe the training stages and split ratio encoded in a model name.

    The last two "_"-separated tokens are ignored. The remaining tokens are
    read as alternating (stage, percentage) entries: every stage becomes
    '학습' unless it is exactly 'update' ('추가학습'), and every percentage is
    divided by 10 and truncated to an integer.

    Returns:
        A tuple ``(stage labels joined by " + ", ratios joined by " 대 ")``.

    Raises:
        ValueError: if a token in a percentage position is not an integer.
    """
    tokens = string.split("_")[:-2]
    stage_tokens = tokens[0::2]   # positions 1, 3, 5, ... (1-based)
    share_tokens = tokens[1::2]   # positions 2, 4, 6, ... (1-based)

    train_type = [
        '추가학습' if stage == 'update' else '학습'
        for stage in stage_tokens
    ]
    rate = [str(int(int(share) / 10)) for share in share_tokens]

    return (" + ".join(train_type), " 대 ".join(rate))


# Derive the descriptive columns from each model name:
# seed fixed (O/X), data count, training type, and data split ratio.
FULL_DF['시드고정여부'] = FULL_DF['모델명'].apply(detect_seed)
FULL_DF['데이터수'] = FULL_DF['모델명'].apply(detect_datacount)
FULL_DF['학습유형'] = FULL_DF['모델명'].apply(lambda name: detect_types(name)[0])
FULL_DF['데이터분할비율'] = FULL_DF['모델명'].apply(lambda name: detect_types(name)[1])

# Keep only the report columns (the raw model name is dropped) as an
# independent copy, then write it to a CSV named after the word's surface form.
# "utf-8-sig" writes a BOM at the start of the file.
new_df = FULL_DF.loc[
    :,
    ['시드고정여부', '데이터수', '학습유형', '데이터분할비율', '단어', '상위유사단어', '단어벡터'],
].copy()
new_df.to_csv(
    "결과_{}.csv".format(WORD.partition("/")[0]),
    index=False,
    encoding="utf-8-sig",
)