In [2]:
import fasttext
import fasttext.util

from scipy import spatial
import pandas as pd

def cosine(word1, word2, model=None): #get cosine distance from each pair
    """Return the cosine distance between the embeddings of two words.

    Parameters
    ----------
    word1, word2 : str
        Words whose vectors are looked up with ``get_word_vector``.
    model : object, optional
        Any object exposing ``get_word_vector(word)``. When omitted, the
        notebook-level ``ft`` model bound at call time is used, which is the
        original behaviour of this function.

    Returns
    -------
    float
        ``scipy.spatial.distance.cosine`` of the two vectors, or NaN when
        either word is missing (``None`` / NaN, e.g. an empty CSV cell).
    """
    # Propagate missing data as NaN instead of handing a non-string to the model.
    if pd.isna(word1) or pd.isna(word2):
        return float('nan')
    if model is None:
        model = ft  # fall back to the model loaded in the current notebook cell
    vector1 = model.get_word_vector(word1) #getting word vectors from the model
    vector2 = model.get_word_vector(word2)
    return spatial.distance.cosine(vector1, vector2)

def cosine_sim(word1, word2, model=None): #get cosine similarity from each pair
    """Return the cosine similarity between the embeddings of two words.

    Parameters
    ----------
    word1, word2 : str
        Words whose vectors are looked up with ``get_word_vector``.
    model : object, optional
        Any object exposing ``get_word_vector(word)``. When omitted, the
        notebook-level ``ft`` model bound at call time is used, which is the
        original behaviour of this function.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine`` of the two vectors, or NaN when
        either word is missing (``None`` / NaN, e.g. an empty CSV cell).
    """
    # Propagate missing data as NaN instead of handing a non-string to the model.
    if pd.isna(word1) or pd.isna(word2):
        return float('nan')
    if model is None:
        model = ft  # fall back to the model loaded in the current notebook cell
    vector1 = model.get_word_vector(word1) #getting word vectors from the model
    vector2 = model.get_word_vector(word2)
    # scipy returns a distance; similarity is its complement.
    return 1 - spatial.distance.cosine(vector1, vector2)

In [7]:
# Get fastText models for English
# `ft` is the notebook-level name that cosine() reads, so rebinding it here
# makes the cosine() calls below use the English vectors.
ft = fasttext.load_model('cc.en.300.bin')

# Calculate cosine distance and similarity, and appending to existing data frame with raw lexical co-occurrence
df = pd.read_csv('corpusEMV_filler.csv')
print(df.head(10))

# Embed each cue/word pair once; every row goes through cosine().
word1_distance_E = df.apply(lambda row: cosine(row['cue_E'], row['word1_E']), axis=1)
word2_distance_E = df.apply(lambda row: cosine(row['cue_E'], row['word2_E']), axis=1)

df['word1_match_cosine_E'] = word1_distance_E
df['word2_match_cosine_E'] = word2_distance_E

# cosine_sim() is exactly 1 - cosine(), so derive the similarity columns from
# the distances instead of looking up every word vector a second time.
df['word1_match_cosine_sim_E'] = 1 - word1_distance_E
df['word2_match_cosine_sim_E'] = 1 - word2_distance_E

print(df.head(10))

# Write the augmented frame back to the same csv file (this overwrites the input)
df.to_csv('corpusEMV_filler.csv', index=False)

         triad cue_M  cue_frequency_M word1_M  word1_frequency_M word2_M  \
0        lemon    柠檬              NaN      葡萄                NaN       梨   
1      library   图书馆              NaN      银行                NaN     电影院   
2         kite    风筝              NaN      篮球                NaN      秋千   
3       bridge     桥              NaN      隧道                NaN      公路   
4       infant    婴儿              NaN      男人                NaN      女人   
5    coriander    香菜              NaN       盐                NaN      胡椒   
6  grandfather    祖父              NaN      姐妹                NaN      叔叔   
7       candle    蜡烛              NaN      火炬                NaN      灯笼   
8          fox    狐狸              NaN      老虎                NaN      狮子   
9       branch    树枝              NaN       根                NaN       茎   

   word2_frequency_M word1_match_M  word1_match_frequency_M word2_match_M  \
0                NaN       柠檬 - 葡萄                      NaN        柠檬 - 梨   
1        



In [8]:
# Get fastText models for Mandarin
# `ft` is the notebook-level name that cosine() reads, so rebinding it here
# makes the cosine() calls below use the Mandarin vectors.
ft = fasttext.load_model('cc.zh.300.bin')

# Calculate cosine distance and similarity, and appending to existing data frame with raw lexical co-occurrence
df = pd.read_csv('corpusEMV_filler.csv')
print(df.head(10))

# Embed each cue/word pair once; every row goes through cosine().
word1_distance_M = df.apply(lambda row: cosine(row['cue_M'], row['word1_M']), axis=1)
word2_distance_M = df.apply(lambda row: cosine(row['cue_M'], row['word2_M']), axis=1)

df['word1_match_cosine_M'] = word1_distance_M
df['word2_match_cosine_M'] = word2_distance_M

# cosine_sim() is exactly 1 - cosine(), so derive the similarity columns from
# the distances instead of looking up every word vector a second time.
df['word1_match_cosine_sim_M'] = 1 - word1_distance_M
df['word2_match_cosine_sim_M'] = 1 - word2_distance_M
print(df.head(10))

# Write the augmented frame back to the same csv file (this overwrites the input)
df.to_csv('corpusEMV_filler.csv', index=False)



         triad cue_M  cue_frequency_M word1_M  word1_frequency_M word2_M  \
0        lemon    柠檬              NaN      葡萄                NaN       梨   
1      library   图书馆              NaN      银行                NaN     电影院   
2         kite    风筝              NaN      篮球                NaN      秋千   
3       bridge     桥              NaN      隧道                NaN      公路   
4       infant    婴儿              NaN      男人                NaN      女人   
5    coriander    香菜              NaN       盐                NaN      胡椒   
6  grandfather    祖父              NaN      姐妹                NaN      叔叔   
7       candle    蜡烛              NaN      火炬                NaN      灯笼   
8          fox    狐狸              NaN      老虎                NaN      狮子   
9       branch    树枝              NaN       根                NaN       茎   

   word2_frequency_M word1_match_M  word1_match_frequency_M word2_match_M  \
0                NaN       柠檬 - 葡萄                      NaN        柠檬 - 梨   
1        

In [9]:
# Get fastText models for Vietnamese
# `ft` is the notebook-level name that cosine() reads, so rebinding it here
# makes the cosine() calls below use the Vietnamese vectors.
ft = fasttext.load_model('cc.vi.300.bin')

# Calculate cosine distance and similarity, and appending to existing data frame with raw lexical co-occurrence
df = pd.read_csv('corpusEMV_filler.csv')
print(df.head(10))

# Embed each cue/word pair once; every row goes through cosine().
word1_distance_V = df.apply(lambda row: cosine(row['cue_V'], row['word1_V']), axis=1)
word2_distance_V = df.apply(lambda row: cosine(row['cue_V'], row['word2_V']), axis=1)

df['word1_match_cosine_V'] = word1_distance_V
df['word2_match_cosine_V'] = word2_distance_V

# cosine_sim() is exactly 1 - cosine(), so derive the similarity columns from
# the distances instead of looking up every word vector a second time.
df['word1_match_cosine_sim_V'] = 1 - word1_distance_V
df['word2_match_cosine_sim_V'] = 1 - word2_distance_V

print(df.head(10))

# Write the augmented frame back to the same csv file (this overwrites the input)
df.to_csv('corpusEMV_filler.csv', index=False)

         triad cue_M  cue_frequency_M word1_M  word1_frequency_M word2_M  \
0        lemon    柠檬              NaN      葡萄                NaN       梨   
1      library   图书馆              NaN      银行                NaN     电影院   
2         kite    风筝              NaN      篮球                NaN      秋千   
3       bridge     桥              NaN      隧道                NaN      公路   
4       infant    婴儿              NaN      男人                NaN      女人   
5    coriander    香菜              NaN       盐                NaN      胡椒   
6  grandfather    祖父              NaN      姐妹                NaN      叔叔   
7       candle    蜡烛              NaN      火炬                NaN      灯笼   
8          fox    狐狸              NaN      老虎                NaN      狮子   
9       branch    树枝              NaN       根                NaN       茎   

   word2_frequency_M word1_match_M  word1_match_frequency_M word2_match_M  \
0                NaN       柠檬 - 葡萄                      NaN        柠檬 - 梨   
1        

