**SemEval2020 Task 1 Model Training**

In [29]:
# Approach 1: Train two models for two time points and align them using Orthogonal Procrustes

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Training word2vec model on english practice corpus 1
model = Word2Vec(corpus_file="trial_data_public/corpora/english/corpus1/corpus1.txt", vector_size=300, window=10, min_count=1, workers=4, negative=5)
# model.save("eng_prac_corp1.model")

In [30]:
from gensim.models import KeyedVectors

# Saving wordvectors
word_vectors = model.wv
word_vectors.save("eng_prac_corp1.wordvectors")

wv = KeyedVectors.load("eng_prac_corp1.wordvectors", mmap='r')


In [31]:
print(model.wv.most_similar('walk', topn=10))

[('after', 0.9997694492340088), ('into', 0.9997608661651611), ('over', 0.9997478127479553), ('through', 0.9997445344924927), ('stand', 0.9997408390045166), ('eye', 0.9997376203536987), ('first', 0.9997310042381287), ('while', 0.9997270107269287), ('both', 0.9997267127037048), ('light', 0.9997245073318481)]


In [32]:
# Store target words
target_words = ['walk', 'distance', 'small', 'god']

with open("trial_data_public/corpora/english/corpus2/corpus2.txt") as file:
    lines = [line.rstrip().split() for line in file]

In [33]:
# Train second word2vec model using corpus2 lines
model2 = Word2Vec(lines, vector_size=300, window=10, min_count=1, workers=4, negative=5)
# Saving wordvectors
word_vectors2 = model2.wv
word_vectors2.save("eng_prac_corp2.wordvectors")

wv2 = KeyedVectors.load("eng_prac_corp2.wordvectors", mmap='r')

In [35]:
print(model2.wv.most_similar('walk', topn=10))

[('out', 0.999792218208313), ('into', 0.999790370464325), ('off', 0.9997903108596802), ('their', 0.9997901916503906), ('her', 0.9997893571853638), ('and', 0.9997891783714294), ("'s", 0.9997891187667847), ('make', 0.9997890591621399), ('down', 0.9997889995574951), ('from', 0.9997884035110474)]


In [50]:
# Align wv and wv2 using Orthogonal Procrustes
from scipy.linalg import orthogonal_procrustes
from numpy.linalg import norm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# print(wv.get_normed_vectors(), wv2.get_normed_vectors())

# Get vectors for target words
wv_target_vectors = np.array([wv.get_vector(word) for word in target_words])
wv2_target_vectors = np.array([wv2.get_vector(word) for word in target_words])

wv_target_vectors_mu = wv_target_vectors.mean(axis=0)
wv_target_vectors_centered = wv_target_vectors - wv_target_vectors_mu

wv2_target_vectors_mu = wv2_target_vectors.mean(axis=0)
wv2_target_vectors_centered = wv2_target_vectors - wv2_target_vectors_mu

R, sca = orthogonal_procrustes(wv_target_vectors_centered, wv2_target_vectors_centered)
# print(R, sca)
scale = sca / np.square(norm(wv_target_vectors_centered))

wv2_target_vectors_approx = scale * np.dot(wv_target_vectors_centered, R) + wv2_target_vectors_mu

# Now we have the aligned vectors for target words in wv2_target_vectors_approx
# We can use these vectors to find the most similar words in wv2
for i, word in enumerate(target_words):
    print(word, cosine_similarity(wv2_target_vectors_approx[i].reshape(1,-1), wv2_target_vectors[i].reshape(1,-1)))



walk [[0.9994904]]
distance [[0.9996628]]
small [[0.999903]]
god [[0.9994504]]


In [28]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models import KeyedVectors

# Approach 2: Train a combined model on both corpora (target words in corpus2 are changed to target_word_)
target_words = ['walk', 'distance', 'small', 'god', 'gay']

# Store corpus1 and corpus2 lines together
c1_c2_lines = []

with open("trial_data_public/corpora/english/corpus1/corpus1.txt") as file:
    c1_lines = [line.rstrip().split() for line in file]
    c1_c2_lines.extend(c1_lines)

with open("trial_data_public/corpora/english/corpus2/corpus2.txt") as file:
    c2_lines = [line.rstrip().split() for line in file]
    for line in c2_lines:
        for word in line:
            if word in target_words:
                line[line.index(word)] = word + '_'
    c1_c2_lines.extend(c2_lines)

for line in c1_c2_lines:
    if 'gay_' in line:
        print('gay_', line)
    if 'gay' in line:
        print('gay', line)

# Train combined model
model_combined = Word2Vec(c1_c2_lines, vector_size=300, window=10, min_count=1, workers=4, negative=5)
# Saving wordvectors
word_vectors_combined = model_combined.wv
word_vectors_combined.save("eng_prac_combined.wordvectors")

wv_combined = KeyedVectors.load("eng_prac_combined.wordvectors", mmap='r')

print(wv_combined.distance('gay', 'gay_'))

gay ['the', 'same', 'mansion', 'rouse', 'by', 'the', 'same', 'cheer', 'note', 'but', 'he', 'smile', 'not', 'upon', 'the', 'joyous', 'throng', 'as', 'they', 'gather', 'around', 'the', 'spot', 'occupy', 'by', 'congo', 'and', 'his', 'canine', 'favourite', 'nor', 'yet', 'upon', 'those', 'of', 'the', 'gay', 'youth', 'who', 'ride', 'up', 'and', 'touch', 'their', 'beaver', 'respectfully', 'to', 'the', 'smiling', 'maiden', 'as', 'they', 'singly', 'or', 'in', 'pair', 'canter', 'away', 'over', 'the', 'bridge', 'in', 'pursuit', 'of', 'their', 'day', "'s", 'sport']
gay ['what', 'poor', 'little', 'blind', 'alice', 'discern', 'in', 'him', 'and', 'love', 'the', 'gay', 'young', 'lady', 'can', 'not', 'see']
gay ['it', 'be', 'early', 'in', 'the', 'month', 'of', 'july', 'the', 'earth', 'gay', 'in', 'its', 'green', 'pomp', 'of', 'foliage', 'its', 'rich', 'flush', 'of', 'bloom', 'the', 'heaven', 'dazzlingly', 'blue', 'the', 'air', 'mild', 'and', 'balmy', 'the', 'wild', 'landscape', 'diversify', 'with', 'it

**Model Evaluation using DatSemShift 3.0**

In [13]:
############################## EVALUATION USING DATSEMSHIFT 3.0 #######################################

# Open semShift.txt file
with open("../semShift.txt") as file:
    # Split the lines of the file by tabs
    lines = [line.rstrip().split('\t') for line in file]

# lines is the first four strings of each line
lines = [line[:4] for line in lines]
print(lines[:3])

[['5548', 'pope', '→', 'ruff (fish)'], ['6550', '<hat>', '→', 'mushroom cap'], ['0750', 'to search, to look for', '↔', 'to want']]


In [15]:
# Load DatSemShift dataset in pandas dataframe
import pandas as pd
df = pd.read_csv('../semShift.csv', header=None)
df.columns = ['id', 'word1', 'shift_dir', 'word2']
df.head()



Unnamed: 0,id,word1,shift_dir,word2
0,5548,pope,→,ruff (fish)
1,6550,<hat>,→,mushroom cap
2,750,"to search, to look for",↔,to want
3,4864,heart,→,hearts (in cards)
4,6858,<country>,→,turkey
