**SemEval2020 Task 1 Model Training**

In [5]:
# Approach 1: Train two models for two time points and align them using Orthogonal Procrustes

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models import KeyedVectors

# Training word2vec model on english practice corpus 1
model = Word2Vec(corpus_file="test_data_public/english/corpus1/lemma/ccoha1.txt", vector_size=300, window=10, min_count=1, workers=4, negative=5)
model.save("test_data_public/english/corpus1/lemma/ccoha1.model")

# Saving wordvectors
word_vectors = model.wv
word_vectors.save("test_data_public/english/corpus1/lemma/ccoha1.wv")

wv = KeyedVectors.load("test_data_public/english/corpus1/lemma/ccoha1.wv", mmap='r')

In [6]:
# Train second word2vec model using corpus2 lines
model2 = Word2Vec(corpus_file="test_data_public/english/corpus2/lemma/ccoha2.txt", vector_size=300, window=10, min_count=1, workers=4, negative=5)
model2.save("test_data_public/english/corpus2/lemma/ccoha2.model")

# Saving wordvectors
word_vectors2 = model2.wv
word_vectors2.save("test_data_public/english/corpus2/lemma/ccoha2.wv")

wv2 = KeyedVectors.load("test_data_public/english/corpus2/lemma/ccoha2.wv", mmap='r')

In [7]:
# Load target words from targets.txt
target_words = []
with open("test_data_public/english/targets.txt", "r") as f:
    for line in f:
        target_words.append(line.strip())

print(target_words)

['attack_nn', 'bag_nn', 'ball_nn', 'bit_nn', 'chairman_nn', 'circle_vb', 'contemplation_nn', 'donkey_nn', 'edge_nn', 'face_nn', 'fiction_nn', 'gas_nn', 'graft_nn', 'head_nn', 'land_nn', 'lane_nn', 'lass_nn', 'multitude_nn', 'ounce_nn', 'part_nn', 'pin_vb', 'plane_nn', 'player_nn', 'prop_nn', 'quilt_nn', 'rag_nn', 'record_nn', 'relationship_nn', 'risk_nn', 'savage_nn', 'stab_nn', 'stroke_vb', 'thump_nn', 'tip_vb', 'tree_nn', 'twist_nn', 'word_nn']


In [37]:
# Align wv and wv2 using Orthogonal Procrustes
from scipy.linalg import orthogonal_procrustes
from numpy.linalg import norm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# print(wv.get_normed_vectors(), wv2.get_normed_vectors())

# Get vectors for target words
wv_target_vectors = np.array([wv.get_vector(word) for word in target_words])
wv2_target_vectors = np.array([wv2.get_vector(word) for word in target_words])

wv_target_vectors_mu = wv_target_vectors.mean(axis=0)
wv_target_vectors_centered = wv_target_vectors - wv_target_vectors_mu

wv2_target_vectors_mu = wv2_target_vectors.mean(axis=0)
wv2_target_vectors_centered = wv2_target_vectors - wv2_target_vectors_mu

R, sca = orthogonal_procrustes(wv_target_vectors_centered, wv2_target_vectors_centered)
# print(R, sca)
scale = sca / np.square(norm(wv_target_vectors_centered))

wv2_target_vectors_approx = scale * np.dot(wv_target_vectors_centered, R) + wv2_target_vectors_mu


In [38]:
# Task 1 - Compute target words cosine similarity between aligned vectors and wv2 vectors
task1_similarities = []
for i, word in enumerate(target_words):
    # print(word, cosine_similarity(wv2_target_vectors_approx[i].reshape(1,-1), wv2_target_vectors[i].reshape(1,-1)))
    task1_similarities.append([word, cosine_similarity(wv2_target_vectors_approx[i].reshape(1,-1), wv2_target_vectors[i].reshape(1,-1))[0][0]])

# print(task1_similarities)

# Load truth file
task1_truth = []
with open("test_data_public/english/truth/binary.txt", "r") as f:
    for line in f:
        task1_truth.append(line.strip().split())

# For each list in task1_similarities, check if value is less than 0.9, assign 1 if true, 0 if false
task1_res = []
for i, word in enumerate(task1_similarities):
    if word[1] < 0.9:
        task1_res.append([word[0], 1])
    else:
        task1_res.append([word[0], 0])

# Compare task1_res and task1_truth
task1_correct = 0
for i, word in enumerate(task1_res):
    if word[1] == int(task1_truth[i][1]):
        task1_correct += 1

print("Task 1 accuracy (align): ", task1_correct/len(task1_res))

Task 1 accuracy (align):  0.5945945945945946


In [40]:
# Approach 2: Train a combined model on both corpora (target words in corpus2 are changed to target_word_)

# Store corpus1 and corpus2 lines together
c1_c2_lines = []

with open("test_data_public/english/corpus1/lemma/ccoha1.txt") as file:
    c1_lines = [line.rstrip().split() for line in file]
    c1_c2_lines.extend(c1_lines)

with open("test_data_public/english/corpus2/lemma/ccoha2.txt") as file:
    c2_lines = [line.rstrip().split() for line in file]
    for line in c2_lines:
        for word in line:
            if word in target_words:
                line[line.index(word)] = word + '_'
    c1_c2_lines.extend(c2_lines)

# Train combined model
model_combined = Word2Vec(c1_c2_lines, vector_size=300, window=10, min_count=1, workers=4, negative=5)

# Saving wordvectors
word_vectors_combined = model_combined.wv
word_vectors_combined.save("test_data_public/english/ccoha_combined.wv")

wv_combined = KeyedVectors.load("test_data_public/english/ccoha_combined.wv", mmap='r')

In [41]:
# Task 1 - Compute target words cosine distance in combined word vector
task1_similarities_combined = []
for i, word in enumerate(target_words):
    # print(word, cosine_similarity(wv2_target_vectors_approx[i].reshape(1,-1), wv2_target_vectors[i].reshape(1,-1)))
    task1_similarities_combined.append([word, cosine_similarity(wv_combined.get_vector(word).reshape(1,-1), wv_combined.get_vector(word + '_').reshape(1,-1))[0][0]])

# print(task1_similarities_combined)

# Compare task1_similarities_combined and task1_truth
task1_combined_res = []
for i, word in enumerate(task1_similarities_combined):
    if word[1] < 0.65:
        task1_combined_res.append([word[0], 1])
    else:
        task1_combined_res.append([word[0], 0])

task1_combined_correct = 0
for i, word in enumerate(task1_combined_res):
    if word[1] == int(task1_truth[i][1]):
        task1_combined_correct += 1

print("Task 1 accuracy (combined): ", task1_combined_correct/len(task1_combined_res))

Task 1 accuracy (combined):  0.6756756756756757


In [44]:
#  Task 2 - Compute spearman correlation between target words cosine similarity in wv2 and wv2_aligned\n",
from scipy import stats
# Load task2 truth file
task2_truth = []
# open truth graded file

with open("test_data_public/english/truth/graded.txt", "r") as f:
    for line in f:
        task2_truth.append(line.strip().split()[1])

# task2_ranking = the second item in for every item in task1_similarities
task2_ranking_align = [item[1] for item in task1_similarities]

# task2_ranking_combined = the second item in for every item in task1_similarities_combined
task2_ranking_combined = [item[1] for item in task1_similarities_combined]

# Compute spearman correlation
task2_align_corr = stats.spearmanr(task2_ranking_align, task2_truth)

task2_combined_corr = stats.spearmanr(task2_ranking_combined, task2_truth)

print("Task 2 spearman correlation (align): ", task2_align_corr)
print("Task 2 spearman correlation (combined): ", task2_combined_corr)

Task 2 spearman correlation (align):  SpearmanrResult(correlation=-0.06425988601233414, pvalue=0.7055400503518773)
Task 2 spearman correlation (combined):  SpearmanrResult(correlation=-0.2616634103860174, pvalue=0.11772270339847497)


In [73]:
############################## EVALUATION USING DATSEMSHIFT 3.0 #######################################


# Open semShift.txt file
with open("../semShift.txt") as file:
    # Split the lines of the file by tabs
    lines = [line.rstrip().split('\t') for line in file]

# lines is the first four strings of each line
lines = [line[:4] for line in lines]
# print(lines[:3])

# Load DatSemShift dataset in pandas dataframe
import pandas as pd
df = pd.read_csv('../semShift.csv', header=None)
df.columns = ['id', 'meaning1', 'shift_dir', 'meaning2']
df.head(10)

# clean non-alphabet chars in the string in meaning1 and split the string
df['meaning1'] = df['meaning1'].str.replace('[^a-zA-Z]', ' ').str.split()
# clean non-alphabet chars in the string in meaning2 and split the string
df['meaning2'] = df['meaning2'].str.replace('[^a-zA-Z]', ' ').str.split()

df.head(10)

Unnamed: 0,id,meaning1,shift_dir,meaning2
0,5548,[pope],→,"[ruff, fish]"
1,6550,[hat],→,"[mushroom, cap]"
2,750,"[to, search, to, look, for]",↔,"[to, want]"
3,4864,[heart],→,"[hearts, in, cards]"
4,6858,[country],→,[turkey]
5,509,"[comb, of, a, bird]",—,[comb]
6,973,"[to, stand, up]",→,"[to, revolt, rebel]"
7,1303,"[to, pull, to, draw]",→,"[to, slow, linger]"
8,751,[leaf],→,"[sheet, of, paper]"
9,624,"[to, see, to, look, at]",→,"[to, have, an, appearance]"


In [74]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Remove stop words from word1
df['meaning1'] = df['meaning1'].apply(lambda x: [item for item in x if item not in stop_words])
# Remove stop words from word2
df['meaning2'] = df['meaning2'].apply(lambda x: [item for item in x if item not in stop_words])

df.head(50)

Unnamed: 0,id,meaning1,shift_dir,meaning2
0,5548,[pope],→,"[ruff, fish]"
1,6550,[hat],→,"[mushroom, cap]"
2,750,"[search, look]",↔,[want]
3,4864,[heart],→,"[hearts, cards]"
4,6858,[country],→,[turkey]
5,509,"[comb, bird]",—,[comb]
6,973,[stand],→,"[revolt, rebel]"
7,1303,"[pull, draw]",→,"[slow, linger]"
8,751,[leaf],→,"[sheet, paper]"
9,624,"[see, look]",→,[appearance]


In [15]:
# Evaluation Approach 1: use the first word in meaning1 and meaning2 that are in the vocabulary
# Align the set of traget word vectors
# Compute cosine similarity between the two target words in c1 and c2
# if cosine similarity changed in the direction of shift direction, then the prediction is correct

# make an empty data frame to store the results
df_shift_res = pd.DataFrame(columns=['meaning1', 'shift_dir', 'meaning2'])
num_shifts = 0

# for each row in df
for index, row in df.iterrows():
    meaning1_word = ""
    meaning2_word = ""
    # for each word in meaning1
    for word in row['meaning1']:
        # if word is in the vocabulary
        if word in wv and word in wv2:
            meaning1_word = word
    # for each word in meaning2
    for word in row['meaning2']:
        # if word is in the vocabulary
        if word in wv and word in wv2:
            meaning2_word = word

    # if meaning1_word and meaning2_word are not empty
    if meaning1_word and meaning2_word:
        # if meaning1_word and meaning2_word are not the same word
        if meaning1_word != meaning2_word:
            num_shifts += 1
            # add meaning1_word and meaning2_word and their cosine similarity to shifts_list
            df_shift_res = df_shift_res.append({'meaning1': meaning1_word, 'shift_dir': row['shift_dir'], 'meaning2': meaning2_word}, ignore_index=True)


print("Number of shifts in df_shift_res: ", len(df_shift_res))
df_shift_res.head(10)


Number of shifts in df_shift_res:  680


Unnamed: 0,meaning1,shift_dir,meaning2
0,pope,→,fish
1,hat,→,cap
2,look,↔,want
3,country,→,turkey
4,bird,—,comb
5,stand,→,rebel
6,draw,→,linger
7,leaf,→,paper
8,look,→,appearance
9,cool,→,calm


**Approach 1.1: Separate embedding no alignment**

In [52]:
# compute cosine similarity between meaning1 and meaning2 in df_shift_res
detection_res_single_word = [] 
for i in range(len(df_shift_res)):
    cos_sim_c1 = cosine_similarity(wv.get_vector(df_shift_res.iloc[i]['meaning1']).reshape(1, -1), wv.get_vector(df_shift_res.iloc[i]['meaning2']).reshape(1, -1))[0][0]
    cos_sim_c2 = cosine_similarity(wv2.get_vector(df_shift_res.iloc[i]['meaning1']).reshape(1, -1), wv2.get_vector(df_shift_res.iloc[i]['meaning2']).reshape(1, -1))[0][0]
    detection_res_single_word.append([cos_sim_c1, cos_sim_c2, np.sign(cos_sim_c2 - cos_sim_c1)])

print(detection_res_single_word[:10])

# add the cosine similarity of meaning1 and meaning2 in df_shift_res to df_shift_res
df_shift_res['cos_sim_c1'] = [detection_res_single_word[i][0] for i in range(len(detection_res_single_word))]
df_shift_res['cos_sim_c2'] = [detection_res_single_word[i][1] for i in range(len(detection_res_single_word))]
df_shift_res['detected_dir_1.1'] = [detection_res_single_word[i][2] for i in range(len(detection_res_single_word))]

df_shift_res.head(10)

[[-0.24862945, -0.3373772, -1.0], [0.8995824, 0.81243926, -1.0], [0.1387261, 0.21904372, 1.0], [0.11960769, 0.36179477, 1.0], [0.41633216, 0.28525874, -1.0], [0.17037529, 0.15207587, -1.0], [0.1645097, 0.3066887, 1.0], [0.21008238, 0.34180748, 1.0], [0.20084272, 0.25881878, 1.0], [0.5883417, 0.62778693, 1.0]]


Unnamed: 0,meaning1,shift_dir,meaning2,cos_sim_c1,cos_sim_c2,detected_dir_1.1
0,pope,→,fish,-0.248629,-0.337377,-1.0
1,hat,→,cap,0.899582,0.812439,-1.0
2,look,↔,want,0.138726,0.219044,1.0
3,country,→,turkey,0.119608,0.361795,1.0
4,bird,—,comb,0.416332,0.285259,-1.0
5,stand,→,rebel,0.170375,0.152076,-1.0
6,draw,→,linger,0.16451,0.306689,1.0
7,leaf,→,paper,0.210082,0.341807,1.0
8,look,→,appearance,0.200843,0.258819,1.0
9,cool,→,calm,0.588342,0.627787,1.0


In [54]:
# count # of correctly predicted shifts
correct_shifts = 0
for i in range(len(df_shift_res)):
    if df_shift_res.iloc[i]['detected_dir_1.1'] == 1.0:
        correct_shifts += 1

print("Number of correctly predicted shifts: ", correct_shifts)
print("Number of total shifts: ", len(df_shift_res))
print("Accuracy: ", correct_shifts / len(df_shift_res))

Number of correctly predicted shifts:  404
Number of total shifts:  680
Accuracy:  0.5941176470588235


**Approach 1.2: Separate embedding with alignment**

In [18]:
from scipy.linalg import orthogonal_procrustes
from numpy.linalg import norm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# join all meaning1 and meaning2 words in one list
sem_shift_words = df_shift_res['meaning1'].tolist()
meaning2_words = df_shift_res['meaning2'].tolist()
sem_shift_words.extend(meaning2_words)

# get the vectors of the words in sem_shift_words in wv and wv2 to do orthogonal procrustes alignment
wv_sem_shift_vecs = np.array([wv.get_vector(word) for word in sem_shift_words])
wv2_sem_shift_vecs = np.array([wv2.get_vector(word) for word in sem_shift_words])

wv_sem_shift_vecs_mu = np.mean(wv_sem_shift_vecs, axis=0)
wv_sem_shift_vecs_centered = wv_sem_shift_vecs - wv_sem_shift_vecs_mu

wv2_sem_shift_vecs_mu = np.mean(wv2_sem_shift_vecs, axis=0)
wv2_sem_shift_vecs_centered = wv2_sem_shift_vecs - wv2_sem_shift_vecs_mu

R, sca = orthogonal_procrustes(wv_sem_shift_vecs_centered, wv2_sem_shift_vecs_centered)
scale = sca / np.square(norm(wv_sem_shift_vecs_centered))

wv2_sem_shift_approx = np.dot(wv_sem_shift_vecs_centered, R) * scale + wv2_sem_shift_vecs_mu


In [60]:
detection_res_single_word_aligned = [] 

for i in range(len(df_shift_res)):
    cos_sim_c1 = cosine_similarity(wv2_sem_shift_approx[i].reshape(1,-1), wv2_sem_shift_approx[i+len(df_shift_res)-1].reshape(1,-1))[0][0]
    cos_sim_c2 = cosine_similarity(wv2_sem_shift_vecs[i].reshape(1, -1), wv2_sem_shift_vecs[i+len(df_shift_res)-1].reshape(1, -1))[0][0]
    detection_res_single_word_aligned.append([cos_sim_c1, cos_sim_c2, np.sign(cos_sim_c2 - cos_sim_c1)])

# add the cosine similarity of meaning1 and meaning2 in df_shift_res to df_shift_res
df_shift_res['cos_sim_c1_aligned'] = [detection_res_single_word_aligned[i][0] for i in range(len(detection_res_single_word_aligned))]
df_shift_res['cos_sim_c2_aligned'] = [detection_res_single_word_aligned[i][1] for i in range(len(detection_res_single_word_aligned))]
df_shift_res['detected_dir_1.2'] = [detection_res_single_word_aligned[i][2] for i in range(len(detection_res_single_word_aligned))]

df_shift_res.head(10)

[[0.15776566, 0.045846466, -1.0], [0.5554598, 0.32201913, -1.0], [0.30379307, 0.18748595, -1.0], [0.23062642, 0.14869829, -1.0], [0.56373936, 0.5832027, 1.0], [0.22429632, 0.27559346, 1.0], [0.18340445, 0.05448911, -1.0], [0.4448002, 0.42315954, -1.0], [0.04751588, 0.04989227, 1.0], [0.25186875, 0.09505028, -1.0]]


Unnamed: 0,meaning1,shift_dir,meaning2,cos_sim_c1,cos_sim_c2,detected_dir_1.1,cos_sim_c1_aligned,cos_sim_c2_aligned,detected_dir_1.2
0,pope,→,fish,-0.248629,-0.337377,-1.0,0.157766,0.045846,-1.0
1,hat,→,cap,0.899582,0.812439,-1.0,0.55546,0.322019,-1.0
2,look,↔,want,0.138726,0.219044,1.0,0.303793,0.187486,-1.0
3,country,→,turkey,0.119608,0.361795,1.0,0.230626,0.148698,-1.0
4,bird,—,comb,0.416332,0.285259,-1.0,0.563739,0.583203,1.0
5,stand,→,rebel,0.170375,0.152076,-1.0,0.224296,0.275593,1.0
6,draw,→,linger,0.16451,0.306689,1.0,0.183404,0.054489,-1.0
7,leaf,→,paper,0.210082,0.341807,1.0,0.4448,0.42316,-1.0
8,look,→,appearance,0.200843,0.258819,1.0,0.047516,0.049892,1.0
9,cool,→,calm,0.588342,0.627787,1.0,0.251869,0.09505,-1.0


In [61]:
# count # of correctly predicted shifts
correct_shifts = 0
for i in range(len(df_shift_res)):
    if df_shift_res.iloc[i]['detected_dir_1.2'] == 1.0:
        correct_shifts += 1

print("Number of correctly predicted shifts: ", correct_shifts)
print("Number of total shifts: ", len(df_shift_res))
print("Accuracy: ", correct_shifts / len(df_shift_res))

Number of correctly predicted shifts:  193
Number of total shifts:  680
Accuracy:  0.2838235294117647


**Approach 1.3: Train combined Embedding on both Corpus**

In [82]:
# Store corpus1 and corpus2 lines together
c1_c2_lines = []

with open("test_data_public/english/corpus1/lemma/ccoha1.txt") as file:
    c1_lines = [line.rstrip().split() for line in file]
    c1_c2_lines.extend(c1_lines)

with open("test_data_public/english/corpus2/lemma/ccoha2.txt") as file:
    c2_lines = [line.rstrip().split() for line in file]
    for line in c2_lines:
        for word in line:
            if word in sem_shift_words:
                line[line.index(word)] = word + '_'
    c1_c2_lines.extend(c2_lines)

# Train combined model
model_combined = Word2Vec(c1_c2_lines, vector_size=300, window=10, min_count=1, workers=4, negative=5)

# Saving wordvectors
word_vectors_combined_sem_shift = model_combined.wv
word_vectors_combined_sem_shift.save("test_data_public/english/ccoha_combined_sem_shift.wv")

wv_combined_sem_shift = KeyedVectors.load("test_data_public/english/ccoha_combined_sem_shift.wv", mmap='r')

In [84]:
detection_res_single_word_combined = [] 

for i in range(len(df_shift_res)):
    cos_sim_c1 = cosine_similarity(wv_combined_sem_shift.get_vector(df_shift_res.iloc[i]['meaning1']).reshape(1,-1), wv_combined_sem_shift.get_vector(df_shift_res.iloc[i]['meaning2']).reshape(1,-1))[0][0]
    cos_sim_c2 = cosine_similarity(wv_combined_sem_shift.get_vector(df_shift_res.iloc[i]['meaning1'] + '_').reshape(1, -1), wv_combined_sem_shift.get_vector(df_shift_res.iloc[i]['meaning2'] + '_').reshape(1, -1))[0][0]
    detection_res_single_word_combined.append([cos_sim_c1, cos_sim_c2, np.sign(cos_sim_c2 - cos_sim_c1)])

# add the cosine similarity of meaning1 and meaning2 in df_shift_res to df_shift_res
df_shift_res['cos_sim_c1_combined'] = [detection_res_single_word_combined[i][0] for i in range(len(detection_res_single_word_combined))]
df_shift_res['cos_sim_c2_combined'] = [detection_res_single_word_combined[i][1] for i in range(len(detection_res_single_word_combined))]
df_shift_res['detected_dir_1.3'] = [detection_res_single_word_combined[i][2] for i in range(len(detection_res_single_word_combined))]

df_shift_res.head(10)

# count # of correctly predicted shifts
correct_shifts = 0
for i in range(len(df_shift_res)):
    if df_shift_res.iloc[i]['detected_dir_1.3'] == 1.0:
        correct_shifts += 1

print("Number of correctly predicted shifts: ", correct_shifts)
print("Number of total shifts: ", len(df_shift_res))
print("Accuracy: ", correct_shifts / len(df_shift_res))

Number of correctly predicted shifts:  327
Number of total shifts:  680
Accuracy:  0.4808823529411765


In [95]:
df_shift_res.head(10)

Unnamed: 0,meaning1,shift_dir,meaning2,cos_sim_c1,cos_sim_c2,detected_dir_1.1,cos_sim_c1_aligned,cos_sim_c2_aligned,detected_dir_1.2,cos_sim_c1_combined,cos_sim_c2_combined,detected_dir_1.3
0,pope,→,fish,-0.248629,-0.337377,-1.0,0.157766,0.045846,-1.0,-0.243716,-0.383739,-1.0
1,hat,→,cap,0.899582,0.812439,-1.0,0.55546,0.322019,-1.0,0.903475,0.792746,-1.0
2,look,↔,want,0.138726,0.219044,1.0,0.303793,0.187486,-1.0,0.137415,0.234502,1.0
3,country,→,turkey,0.119608,0.361795,1.0,0.230626,0.148698,-1.0,0.059941,0.419437,1.0
4,bird,—,comb,0.416332,0.285259,-1.0,0.563739,0.583203,1.0,0.383995,0.333703,-1.0
5,stand,→,rebel,0.170375,0.152076,-1.0,0.224296,0.275593,1.0,0.249531,0.217194,-1.0
6,draw,→,linger,0.16451,0.306689,1.0,0.183404,0.054489,-1.0,0.21807,0.284901,1.0
7,leaf,→,paper,0.210082,0.341807,1.0,0.4448,0.42316,-1.0,0.203262,0.342544,1.0
8,look,→,appearance,0.200843,0.258819,1.0,0.047516,0.049892,1.0,0.262808,0.198636,-1.0
9,cool,→,calm,0.588342,0.627787,1.0,0.251869,0.09505,-1.0,0.652084,0.53089,-1.0


**Approach 2.1: Average Embedding in Meaning Gloss**

In [80]:
df_shift_res2 = pd.DataFrame(columns=['meaning1', 'shift_dir', 'meaning2'])
num_shifts = 0

# for each row in df
for index, row in df.iterrows():
    meaning1_words = []
    meaning2_words = []
    meaning1_avgs_wv = []
    meaning2_avgs_wv = []
    meaning1_avgs_wv2 = []
    meaning2_avgs_wv2 = []
    # for each word in meaning1
    for word in row['meaning1']:
        # if word is in the vocabulary
        if word in wv and word in wv2:
            meaning1_words.append(word)
    # for each word in meaning2
    for word in row['meaning2']:
        # if word is in the vocabulary
        if word in wv and word in wv2:
            meaning2_words.append(word)

    # if meaning1_word and meaning2_word are not empty
    if len(meaning1_words) > 0 and len(meaning2_words) > 0:
        # if meaning1_word and meaning2_word are not the same word
        if meaning1_words != meaning2_words:
            num_shifts += 1

            meaning1_avgs_wv = np.mean(np.array([wv.get_vector(word) for word in meaning1_words]), axis=0)
            meaning2_avgs_wv = np.mean(np.array([wv.get_vector(word) for word in meaning2_words]), axis=0)

            meaning1_avgs_wv2 = np.mean(np.array([wv2.get_vector(word) for word in meaning1_words]), axis=0)
            meaning2_avgs_wv2 = np.mean(np.array([wv2.get_vector(word) for word in meaning2_words]), axis=0)

            # calculate cosine similarity between meaning1 and meaning2
            cos_sim_c1 = cosine_similarity(meaning1_avgs_wv.reshape(1,-1), meaning2_avgs_wv.reshape(1,-1))[0][0]

            cos_sim_c2 = cosine_similarity(meaning1_avgs_wv2.reshape(1,-1), meaning2_avgs_wv2.reshape(1,-1))[0][0]

            # if cosine similarity of meaning1 and meaning2 in wv2 is greater than cosine similarity of meaning1 and meaning2 in wv
            if cos_sim_c2 > cos_sim_c1:
                shift_dir = 1.0
            else:
                shift_dir = -1.0

            # add meaning1_words and meaning2_words, their average vectors, and cosine similarity to df_shift_res2
            df_shift_res2 = df_shift_res2.append({'meaning1': meaning1_words, 'shift_dir': row['shift_dir'], 'meaning2': meaning2_words, 
                                                'cos_sim_c1_avg_emb': cos_sim_c1, 'cos_sim_c2_avg_emb': cos_sim_c2, 'detected_dir': shift_dir}, ignore_index=True)


print("Number of shifts in df_shift_res: ", len(df_shift_res2))
df_shift_res2.head(10)


Number of shifts in df_shift_res:  632


Unnamed: 0,meaning1,shift_dir,meaning2,cos_sim_c1_avg_emb,cos_sim_c2_avg_emb,detected_dir
0,[pope],→,"[ruff, fish]",-0.193159,-0.287207,-1.0
1,[hat],→,"[mushroom, cap]",0.886209,0.732821,-1.0
2,"[search, look]",↔,[want],0.193901,0.203343,1.0
3,[country],→,[turkey],0.119608,0.361795,1.0
4,"[comb, bird]",—,[comb],0.703312,0.648283,-1.0
5,[stand],→,"[revolt, rebel]",0.117913,0.211288,1.0
6,"[pull, draw]",→,"[slow, linger]",0.178873,0.391186,1.0
7,[leaf],→,"[sheet, paper]",0.397161,0.533569,1.0
8,"[see, look]",→,[appearance],0.245025,0.263974,1.0
9,[cool],→,[calm],0.588342,0.627787,1.0


In [81]:
# count # of correctly predicted shifts
correct_shifts = 0
for i in range(len(df_shift_res2)):
    if df_shift_res2.iloc[i]['detected_dir'] == 1.0:
        correct_shifts += 1

print("Number of correctly predicted shifts: ", correct_shifts)
print("Number of total shifts: ", len(df_shift_res2))
print("Accuracy: ", correct_shifts / len(df_shift_res2))

Number of correctly predicted shifts:  362
Number of total shifts:  632
Accuracy:  0.5727848101265823


**Approach 2.2: Separate embeddings with alignment**

In [85]:
sem_shift_words_all = []

# for each word in sem_shift_words
for i in range(len(df_shift_res2)):
    sem_shift_words_all.extend(df_shift_res2.iloc[i]['meaning1'])
    sem_shift_words_all.extend(df_shift_res2.iloc[i]['meaning2'])

wv_sem_shift_vecs_all = np.array([wv.get_vector(word) for word in sem_shift_words_all])
wv2_sem_shift_vecs_all = np.array([wv2.get_vector(word) for word in sem_shift_words_all])

wv_sem_shift_vecs_all_mu = np.mean(wv_sem_shift_vecs_all, axis=0)
wv_sem_shift_vecs_all_centered = wv_sem_shift_vecs_all - wv_sem_shift_vecs_all_mu

wv2_sem_shift_vecs_all_mu = np.mean(wv2_sem_shift_vecs_all, axis=0)
wv2_sem_shift_vecs_all_centered = wv2_sem_shift_vecs_all - wv2_sem_shift_vecs_all_mu

R, sca = orthogonal_procrustes(wv_sem_shift_vecs_all_centered, wv2_sem_shift_vecs_all_centered)
scale = sca / np.square(norm(wv_sem_shift_vecs_all_centered))

wv2_sem_shift_all_approx = np.dot(wv_sem_shift_vecs_all_centered, R) * scale + wv2_sem_shift_vecs_all_mu

In [92]:
detection_res_all_words_aligned = [] 
meaning1_index = 0
meaning2_index = 0
# count how many strings are in meaning1
for i in range(len(df_shift_res2)):
    meaning2_index += len(df_shift_res2.iloc[i]['meaning1'])

for i in range(len(df_shift_res2)):
    meaning1_words = df_shift_res2.iloc[i]['meaning1']
    meaning1_wv_c1 = wv2_sem_shift_all_approx[meaning1_index:meaning1_index+len(meaning1_words)]
    meaning1_wv_c2 = wv2_sem_shift_vecs_all[meaning1_index:meaning1_index+len(meaning1_words)]
    meaning1_index += len(meaning1_words)
    meaning1_avgs_wv_c1 = np.mean(meaning1_wv_c1, axis=0)
    meaning1_avgs_wv_c2 = np.mean(meaning1_wv_c2, axis=0)

    meaning2_words = df_shift_res2.iloc[i]['meaning2']
    meaning2_wv_c1 = wv2_sem_shift_all_approx[meaning2_index:meaning2_index+len(meaning2_words)]
    meaning2_wv_c2 = wv2_sem_shift_vecs_all[meaning2_index:meaning2_index+len(meaning2_words)]
    meaning2_index += len(meaning2_words)
    meaning2_avgs_wv_c1 = np.mean(meaning2_wv_c1, axis=0)
    meaning2_avgs_wv_c2 = np.mean(meaning2_wv_c2, axis=0)

    cos_sim_c1 = cosine_similarity(meaning1_avgs_wv_c1.reshape(1,-1), meaning2_avgs_wv_c1.reshape(1,-1))[0][0]
    cos_sim_c2 = cosine_similarity(meaning1_avgs_wv_c2.reshape(1,-1), meaning2_avgs_wv_c2.reshape(1,-1))[0][0]

    detection_res_all_words_aligned.append([cos_sim_c1, cos_sim_c2, np.sign(cos_sim_c2 - cos_sim_c1)])


# add the cosine similarity of meaning1 and meaning2 in df_shift_res to df_shift_res
df_shift_res2['cos_sim_c1_aligned_all'] = [detection_res_all_words_aligned[i][0] for i in range(len(detection_res_all_words_aligned))]
df_shift_res2['cos_sim_c2_aligned_all'] = [detection_res_all_words_aligned[i][1] for i in range(len(detection_res_all_words_aligned))]
df_shift_res2['detected_dir_2.2'] = [detection_res_all_words_aligned[i][2] for i in range(len(detection_res_all_words_aligned))]

df_shift_res2.head(10)

Unnamed: 0,meaning1,shift_dir,meaning2,cos_sim_c1_avg_emb,cos_sim_c2_avg_emb,detected_dir,cos_sim_c1_aligned_all,cos_sim_c2_aligned_all,detected_dir_2.2,cos_sim_c1_combined_all,cos_sim_c2_combined_all
0,[pope],→,"[ruff, fish]",-0.193159,-0.287207,-1.0,0.029504,0.124642,1.0,-0.225196,-0.383828
1,[hat],→,"[mushroom, cap]",0.886209,0.732821,-1.0,0.395676,0.08476,-1.0,0.897048,0.71624
2,"[search, look]",↔,[want],0.193901,0.203343,1.0,0.359364,0.174429,-1.0,0.187692,0.248006
3,[country],→,[turkey],0.119608,0.361795,1.0,0.418272,0.323525,-1.0,0.083844,0.342028
4,"[comb, bird]",—,[comb],0.703312,0.648283,-1.0,0.500387,0.363287,-1.0,0.659231,0.633424
5,[stand],→,"[revolt, rebel]",0.117913,0.211288,1.0,0.56812,0.126827,-1.0,0.219444,0.138628
6,"[pull, draw]",→,"[slow, linger]",0.178873,0.391186,1.0,0.082797,0.120619,1.0,0.195974,0.361573
7,[leaf],→,"[sheet, paper]",0.397161,0.533569,1.0,0.616859,0.647212,1.0,0.384226,0.527084
8,"[see, look]",→,[appearance],0.245025,0.263974,1.0,0.5676,0.523005,-1.0,0.312477,0.203783
9,[cool],→,[calm],0.588342,0.627787,1.0,0.591769,0.377467,-1.0,0.648213,0.606192


In [90]:
# count # of correctly predicted shifts
correct_shifts = 0
for i in range(len(df_shift_res2)):
    if df_shift_res2.iloc[i]['detected_dir_2.2'] == 1.0:
        correct_shifts += 1

print("Number of correctly predicted shifts: ", correct_shifts)
print("Number of total shifts: ", len(df_shift_res2))
print("Accuracy: ", correct_shifts / len(df_shift_res2))

Number of correctly predicted shifts:  152
Number of total shifts:  632
Accuracy:  0.24050632911392406


**Approach 2.3: Combined word2vec embedding for all shift words**

In [88]:
# Store corpus1 and corpus2 lines together
c1_c2_lines = []

with open("test_data_public/english/corpus1/lemma/ccoha1.txt") as file:
    c1_lines = [line.rstrip().split() for line in file]
    c1_c2_lines.extend(c1_lines)

with open("test_data_public/english/corpus2/lemma/ccoha2.txt") as file:
    c2_lines = [line.rstrip().split() for line in file]
    for line in c2_lines:
        for word in line:
            if word in sem_shift_words_all:
                line[line.index(word)] = word + '_'
    c1_c2_lines.extend(c2_lines)

# Train combined model
model_combined = Word2Vec(c1_c2_lines, vector_size=300, window=10, min_count=1, workers=4, negative=5)

# Saving wordvectors
word_vectors_combined_sem_shift_all = model_combined.wv
word_vectors_combined_sem_shift_all.save("test_data_public/english/ccoha_combined_sem_shift_all.wv")

wv_combined_sem_shift_all = KeyedVectors.load("test_data_public/english/ccoha_combined_sem_shift_all.wv", mmap='r')

In [93]:
detection_res_all_words_combined = [] 

for i in range(len(df_shift_res2)):
    meaning1_words = df_shift_res2.iloc[i]['meaning1']
    meaning2_words = df_shift_res2.iloc[i]['meaning2']

    meaning1_avgs_wv = np.mean(np.array([wv_combined_sem_shift_all.get_vector(word) for word in meaning1_words]), axis=0)
    meaning2_avgs_wv = np.mean(np.array([wv_combined_sem_shift_all.get_vector(word) for word in meaning2_words]), axis=0)

    meaning1_avgs_wv2 = np.mean(np.array([wv_combined_sem_shift_all.get_vector(word + "_") for word in meaning1_words]), axis=0)
    meaning2_avgs_wv2 = np.mean(np.array([wv_combined_sem_shift_all.get_vector(word + "_") for word in meaning2_words]), axis=0)

    # calculate cosine similarity between meaning1 and meaning2
    cos_sim_c1 = cosine_similarity(meaning1_avgs_wv.reshape(1,-1), meaning2_avgs_wv.reshape(1,-1))[0][0]
    cos_sim_c2 = cosine_similarity(meaning1_avgs_wv2.reshape(1,-1), meaning2_avgs_wv2.reshape(1,-1))[0][0]

    detection_res_all_words_combined.append([cos_sim_c1, cos_sim_c2, np.sign(cos_sim_c2 - cos_sim_c1)])

df_shift_res2['cos_sim_c1_combined_all'] = [detection_res_all_words_combined[i][0] for i in range(len(detection_res_all_words_combined))]
df_shift_res2['cos_sim_c2_combined_all'] = [detection_res_all_words_combined[i][1] for i in range(len(detection_res_all_words_combined))]
df_shift_res2['detected_dir_2.3'] = [detection_res_all_words_combined[i][2] for i in range(len(detection_res_all_words_combined))]

df_shift_res2.head(10)

Unnamed: 0,meaning1,shift_dir,meaning2,cos_sim_c1_avg_emb,cos_sim_c2_avg_emb,detected_dir,cos_sim_c1_aligned_all,cos_sim_c2_aligned_all,detected_dir_2.2,cos_sim_c1_combined_all,cos_sim_c2_combined_all,detected_dir_2.3
0,[pope],→,"[ruff, fish]",-0.193159,-0.287207,-1.0,0.029504,0.124642,1.0,-0.225196,-0.383828,-1.0
1,[hat],→,"[mushroom, cap]",0.886209,0.732821,-1.0,0.395676,0.08476,-1.0,0.897048,0.71624,-1.0
2,"[search, look]",↔,[want],0.193901,0.203343,1.0,0.359364,0.174429,-1.0,0.187692,0.248006,1.0
3,[country],→,[turkey],0.119608,0.361795,1.0,0.418272,0.323525,-1.0,0.083844,0.342028,1.0
4,"[comb, bird]",—,[comb],0.703312,0.648283,-1.0,0.500387,0.363287,-1.0,0.659231,0.633424,-1.0
5,[stand],→,"[revolt, rebel]",0.117913,0.211288,1.0,0.56812,0.126827,-1.0,0.219444,0.138628,-1.0
6,"[pull, draw]",→,"[slow, linger]",0.178873,0.391186,1.0,0.082797,0.120619,1.0,0.195974,0.361573,1.0
7,[leaf],→,"[sheet, paper]",0.397161,0.533569,1.0,0.616859,0.647212,1.0,0.384226,0.527084,1.0
8,"[see, look]",→,[appearance],0.245025,0.263974,1.0,0.5676,0.523005,-1.0,0.312477,0.203783,-1.0
9,[cool],→,[calm],0.588342,0.627787,1.0,0.591769,0.377467,-1.0,0.648213,0.606192,-1.0


In [94]:
# count # of correctly predicted shifts
correct_shifts = 0
for i in range(len(df_shift_res2)):
    if df_shift_res2.iloc[i]['detected_dir_2.3'] == 1.0:
        correct_shifts += 1

print("Number of correctly predicted shifts: ", correct_shifts)
print("Number of total shifts: ", len(df_shift_res2))
print("Accuracy: ", correct_shifts / len(df_shift_res2))

Number of correctly predicted shifts:  298
Number of total shifts:  632
Accuracy:  0.47151898734177217
