**SemEval2020 Task 1 Model Training**

In [1]:
# Approach 1: Train two models for two time points and align them using Orthogonal Procrustes

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models import KeyedVectors

# Fit word2vec on the lemmatised English corpus 1, then persist both the full
# model and its keyed vectors next to the corpus file.
corpus1_txt = "test_data_public/english/corpus1/lemma/ccoha1.txt"
corpus1_model_path = "test_data_public/english/corpus1/lemma/ccoha1.model"
corpus1_wv_path = "test_data_public/english/corpus1/lemma/ccoha1.wv"

# Hyperparameters used for this corpus.
w2v_params = dict(vector_size=300, window=10, min_count=1, workers=4, negative=5)

model = Word2Vec(corpus_file=corpus1_txt, **w2v_params)
model.save(corpus1_model_path)

# Keep only the word vectors for downstream lookups.
word_vectors = model.wv
word_vectors.save(corpus1_wv_path)

# Reload the saved vectors memory-mapped in read-only mode.
wv = KeyedVectors.load(corpus1_wv_path, mmap='r')

In [2]:
# Fit a second word2vec model on the lemmatised English corpus 2 with the same
# hyperparameters as the corpus 1 model, and persist model + keyed vectors.
corpus2_txt = "test_data_public/english/corpus2/lemma/ccoha2.txt"
corpus2_model_path = "test_data_public/english/corpus2/lemma/ccoha2.model"
corpus2_wv_path = "test_data_public/english/corpus2/lemma/ccoha2.wv"

w2v_params = dict(vector_size=300, window=10, min_count=1, workers=4, negative=5)

model2 = Word2Vec(corpus_file=corpus2_txt, **w2v_params)
model2.save(corpus2_model_path)

# Keep only the word vectors for downstream lookups.
word_vectors2 = model2.wv
word_vectors2.save(corpus2_wv_path)

# Reload the saved vectors memory-mapped in read-only mode.
wv2 = KeyedVectors.load(corpus2_wv_path, mmap='r')

In [3]:
# Load target words from targets.txt (one word per line).
# Blank lines are skipped so that a trailing newline or an empty row never
# produces an empty-string "word", which would fail later vector lookups.
target_words = []
with open("test_data_public/english/targets.txt", "r") as f:
    for line in f:
        word = line.strip()
        if word:
            target_words.append(word)

print(target_words)

['attack_nn', 'bag_nn', 'ball_nn', 'bit_nn', 'chairman_nn', 'circle_vb', 'contemplation_nn', 'donkey_nn', 'edge_nn', 'face_nn', 'fiction_nn', 'gas_nn', 'graft_nn', 'head_nn', 'land_nn', 'lane_nn', 'lass_nn', 'multitude_nn', 'ounce_nn', 'part_nn', 'pin_vb', 'plane_nn', 'player_nn', 'prop_nn', 'quilt_nn', 'rag_nn', 'record_nn', 'relationship_nn', 'risk_nn', 'savage_nn', 'stab_nn', 'stroke_vb', 'thump_nn', 'tip_vb', 'tree_nn', 'twist_nn', 'word_nn']


In [None]:
# Align wv and wv2 using Orthogonal Procrustes
from scipy.linalg import orthogonal_procrustes
from numpy.linalg import norm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _target_matrix(keyed_vectors):
    """Return an array with one row per entry of ``target_words``, in list order."""
    return np.array([keyed_vectors.get_vector(w) for w in target_words])


# Row i of each matrix is the embedding of target_words[i] in that model.
wv_target_vectors = _target_matrix(wv)
wv2_target_vectors = _target_matrix(wv2)

# Centre each point set on its own mean before fitting the rotation.
wv_target_vectors_mu = np.mean(wv_target_vectors, axis=0)
wv2_target_vectors_mu = np.mean(wv2_target_vectors, axis=0)
wv_target_vectors_centered = wv_target_vectors - wv_target_vectors_mu
wv2_target_vectors_centered = wv2_target_vectors - wv2_target_vectors_mu

# NOTE(review): the rotation is fitted on the target words only, i.e. on the
# same words whose change is measured afterwards - confirm this is intended
# rather than fitting on a wider shared vocabulary.
R, sca = orthogonal_procrustes(wv_target_vectors_centered, wv2_target_vectors_centered)

# Scaling factor: the value returned by orthogonal_procrustes divided by the
# squared norm of the centred corpus-1 matrix.
centered_norm = norm(wv_target_vectors_centered)
scale = sca / np.square(centered_norm)

# Map the corpus-1 target vectors into the corpus-2 space:
# rotate, rescale, then shift by the corpus-2 mean.
rotated = wv_target_vectors_centered.dot(R)
wv2_target_vectors_approx = scale * rotated + wv2_target_vectors_mu


In [48]:
# Task 1 - Compute target words cosine similarity between aligned vectors and wv2 vectors,
# threshold it into a 0/1 prediction and score the predictions against the truth file.

# A similarity strictly below this value is labelled 1, otherwise 0.
TASK1_ALIGN_THRESHOLD = 0.9

task1_similarities = []
for i, word in enumerate(target_words):
    aligned_vec = wv2_target_vectors_approx[i].reshape(1, -1)
    corpus2_vec = wv2_target_vectors[i].reshape(1, -1)
    task1_similarities.append([word, cosine_similarity(aligned_vec, corpus2_vec)[0][0]])

# Load truth file: each non-blank line is split on whitespace into
# [word, label, ...]; blank lines are skipped so they cannot break the lookup below.
task1_truth = []
with open("test_data_public/english/truth/binary.txt", "r") as f:
    for line in f:
        fields = line.strip().split()
        if fields:
            task1_truth.append(fields)

# Index the truth labels by word so scoring does not depend on the truth file
# being in the same order (or of the same length) as target_words.
# A target word missing from the truth file raises KeyError instead of being
# silently compared against another word's label.
task1_truth_by_word = {fields[0]: int(fields[1]) for fields in task1_truth}

# Threshold every similarity into a binary prediction.
task1_res = [
    [word, 1 if similarity < TASK1_ALIGN_THRESHOLD else 0]
    for word, similarity in task1_similarities
]

# Count predictions that agree with the truth label of the same word.
task1_correct = sum(
    1 for word, label in task1_res if label == task1_truth_by_word[word]
)

print("Task 1 accuracy (align): ", task1_correct/len(task1_res))

Task 1 accuracy (align):  0.5945945945945946


In [17]:
# Approach 2: Train a combined model on both corpora (target words in corpus2 are changed to target_word_)

# Set membership keeps the per-token check constant-time.
target_word_set = set(target_words)

# Corpus 1 is used as-is: one whitespace-tokenised sentence per line.
with open("test_data_public/english/corpus1/lemma/ccoha1.txt") as file:
    c1_lines = [line.rstrip().split() for line in file]

# Corpus 2: every occurrence of a target word gets a trailing underscore so the
# combined model learns a separate vector for its corpus-2 usage. Each line is
# rebuilt in a single pass instead of being mutated while it is iterated.
with open("test_data_public/english/corpus2/lemma/ccoha2.txt") as file:
    c2_lines = [
        [token + '_' if token in target_word_set else token
         for token in line.rstrip().split()]
        for line in file
    ]

# Store corpus1 and corpus2 lines together
c1_c2_lines = c1_lines + c2_lines

# Train combined model
model_combined = Word2Vec(c1_c2_lines, vector_size=300, window=10, min_count=1, workers=4, negative=5)

# Saving wordvectors
word_vectors_combined = model_combined.wv
word_vectors_combined.save("test_data_public/english/ccoha_combined.wv")

# Reload the saved vectors memory-mapped in read-only mode.
wv_combined = KeyedVectors.load("test_data_public/english/ccoha_combined.wv", mmap='r')

In [38]:
# Task 1 - Compute, in the combined model, the cosine similarity between each
# target word and its underscore-suffixed corpus-2 variant, threshold it into a
# 0/1 prediction and score the predictions against task1_truth.

# A similarity strictly below this value is labelled 1, otherwise 0.
TASK1_COMBINED_THRESHOLD = 0.65

task1_similarities_combined = []
for word in target_words:
    corpus1_vec = wv_combined.get_vector(word).reshape(1, -1)
    corpus2_vec = wv_combined.get_vector(word + '_').reshape(1, -1)
    task1_similarities_combined.append([word, cosine_similarity(corpus1_vec, corpus2_vec)[0][0]])

# Threshold every similarity into a binary prediction.
task1_combined_res = [
    [word, 1 if similarity < TASK1_COMBINED_THRESHOLD else 0]
    for word, similarity in task1_similarities_combined
]

# Index the truth rows ([word, label, ...]) by word so scoring does not depend
# on the truth file sharing the order of target_words; empty rows are ignored.
# A target word without a truth label raises KeyError rather than being scored
# against another word's label.
task1_combined_truth_by_word = {row[0]: int(row[1]) for row in task1_truth if row}

# Count predictions that agree with the truth label of the same word.
task1_combined_correct = sum(
    1 for word, label in task1_combined_res
    if label == task1_combined_truth_by_word[word]
)

print("Task 1 accuracy (combined): ", task1_combined_correct/len(task1_combined_res))

Task 1 accuracy (combined):  0.6756756756756757


In [None]:
############################## EVALUATION USING DATSEMSHIFT 3.0 #######################################


import pandas as pd

# Open semShift.txt file and keep the first four tab-separated fields of each line.
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open("../semShift.txt", encoding="utf-8") as file:
    lines = [line.rstrip().split('\t')[:4] for line in file]

# Load DatSemShift dataset in pandas dataframe (the CSV has no header row).
df = pd.read_csv('../semShift.csv', header=None)
df.columns = ['id', 'word1', 'shift_dir', 'word2']

# Replace every non-alphabetic character with a space, then split on whitespace
# so word1 / word2 become lists of tokens.
# regex=True is required: with the pandas >= 2.0 default (regex=False) the
# pattern would be treated as a literal string and nothing would be cleaned.
for column in ('word1', 'word2'):
    df[column] = df[column].str.replace('[^a-zA-Z]', ' ', regex=True).str.split()

df.head(10)

  df['word1'] = df['word1'].str.replace('[^a-zA-Z]', ' ').str.split()
  df['word2'] = df['word2'].str.replace('[^a-zA-Z]', ' ').str.split()


Unnamed: 0,id,word1,shift_dir,word2
0,5548,[pope],→,"[ruff, fish]"
1,6550,[hat],→,"[mushroom, cap]"
2,750,"[to, search, to, look, for]",↔,"[to, want]"
3,4864,[heart],→,"[hearts, in, cards]"
4,6858,[country],→,[turkey]
5,509,"[comb, of, a, bird]",—,[comb]
6,973,"[to, stand, up]",→,"[to, revolt, rebel]"
7,1303,"[to, pull, to, draw]",→,"[to, slow, linger]"
8,751,[leaf],→,"[sheet, of, paper]"
9,624,"[to, see, to, look, at]",→,"[to, have, an, appearance]"


In [None]:
from nltk.corpus import stopwords

# English stop-word list from NLTK, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, preserving their order."""
    return [token for token in tokens if token not in stop_words]


# Filter the token lists of both meaning columns.
for column in ('word1', 'word2'):
    df[column] = df[column].apply(_drop_stop_words)

df.head(10)

Unnamed: 0,id,word1,shift_dir,word2
0,5548,[pope],→,"[ruff, fish]"
1,6550,[hat],→,"[mushroom, cap]"
2,750,"[search, look]",↔,[want]
3,4864,[heart],→,"[hearts, cards]"
4,6858,[country],→,[turkey]
5,509,"[comb, bird]",—,[comb]
6,973,[stand],→,"[revolt, rebel]"
7,1303,"[pull, draw]",→,"[slow, linger]"
8,751,[leaf],→,"[sheet, paper]"
9,624,"[see, look]",→,[appearance]
