In [1]:
import os
import pickle

import gensim
import gensim.downloader as api
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

from nltk.tokenize import RegexpTokenizer

import scipy.spatial.distance as scipy_distances

In [2]:
language = 'english'
epochs = 10

In [3]:
def get_sentences(corpus: int, language: str = 'english'):
    pickle_path = f'..\\data\\semantic-change\\cbow\\{language}'
    sentences_filepath = os.path.join(pickle_path, f'corpus{corpus}-sentences.pickle')
    if os.path.exists(sentences_filepath):
        with open(sentences_filepath, 'rb') as sentences_file:
            sentences = pickle.load(sentences_file)
            return sentences
        
    corpus_path = os.path.join(
        '..\\data\\semantic-change\\eval',
        language,
        f'corpus{corpus}',
        'lemma')

    text_filenames = list(filter(
        lambda x: x.endswith('.txt'),
        os.listdir(corpus_path)))

    sentences = []
    tokenizer = RegexpTokenizer(r'\w+')

    for text_filename in text_filenames:
        file_path = os.path.join(corpus_path, text_filename)
        with open(file_path, 'r', encoding='utf-8') as textfile:
            file_text_lines = textfile.readlines()
            sentences_tokenized = [tokenizer.tokenize(x.lower()) for x in file_text_lines]
            sentences.extend(sentences_tokenized)
        
    with open(sentences_filepath, 'wb') as sentences_file:
        pickle.dump(sentences, sentences_file)
        
    return sentences

In [4]:
def get_original_model_vector_keys(language: str = 'english'):
    pickle_path = f'..\\data\\semantic-change\\cbow\\{language}'
    vocab_keys_path = os.path.join(pickle_path, 'google-news-vocab-keys.pickle')
    if os.path.exists(vocab_keys_path):
        with open(vocab_keys_path, 'rb') as vocab_keys_file:
            vocab_keys = pickle.load(vocab_keys_file)
            return vocab_keys
    
    original_model = KeyedVectors.load_word2vec_format("..\\data\\semantic-change\\cbow\\english\\GoogleNews-vectors-negative300.bin", binary = True)
    vocab_keys = [list(original_model.vocab.keys())]
    with open(vocab_keys_path, 'wb') as vocab_keys_file:
        pickle.dump(vocab_keys, vocab_keys_file)

    return vocab_keys

In [5]:
original_vocab_keys = get_original_model_vector_keys()

In [6]:
def get_target_words():
    targets_path = f'..\\data\\semantic-change\\eval\\{language}\\targets.txt'
    with open(targets_path, 'r', encoding='utf-8') as targets_file:
        targets = targets_file.readlines()
        if language == 'english':
            targets = [x[:-4] for x in targets]
        else:
            targets = [x[:-1] for x in targets]
    
    return targets

targets = get_target_words()

In [7]:
original_model_path = "..\\data\\semantic-change\\cbow\\english\\GoogleNews-vectors-negative300.bin"
def load_target_vectors_for_corpus(
    corpus: int,
    language:str = 'english'):
    target_vectors_path = f'..\\data\\semantic-change\\cbow\\{language}\\target-vectors-{corpus}.pickle' 
    if os.path.exists(target_vectors_path):
        with open(target_vectors_path, 'rb') as target_vectors_file:
            target_vectors = pickle.load(target_vectors_file)
            return target_vectors
    
    sentences_to_use = get_sentences(corpus=corpus)
    print(sentences_to_use[:5])
#     model_path = f'..\\data\\semantic-change\\cbow\\{language}\\word2vec-corpus{corpus}.bin'
    model = Word2Vec(size=300, min_count=1)
    model.build_vocab(sentences_to_use)
    print(model.wv['attack'])
    model_examples = model.corpus_count
    print(f'Using {model_examples} total examples')
    
    print('Loading original model...')
    model.build_vocab(original_vocab_keys, update=True)
    model.intersect_word2vec_format(original_model_path, binary=True)
    
#     if not os.path.exists(model_path):
    print('Starting to train model...')
    model.train(sentences_to_use, total_examples=model_examples, epochs=model.epochs)
    print('Training model finished')#. Saving model to disk...')
#     model.wv.save_word2vec_format(model_path, binary=True)
#     print('Saving finished')
#     else:
#         print('Loading model...')
#         model.wv = KeyedVectors.load_word2vec_format(model_path, binary=True)
#         print('Loaded model')
        
    target_vectors = {}
    for target in targets:
        target_vectors[target] = model.wv[target]
        
    with open(target_vectors_path, 'wb') as target_vectors_file:
        pickle.dump(target_vectors, target_vectors_file)
    
    return target_vectors

In [8]:
target_vectors1 = load_target_vectors_for_corpus(corpus=1)

[['he', 'may', 'find', 'himself', 'unexpectedly', 'throw', 'into', 'new', 'and', 'strange', 'position', 'where', 'after', 'look', 'carefully', 'around', 'him', 'he', 'fear', 'to', 'move', 'lest', 'by', 'a', 'mistake', 'step', 'he', 'should', 'do', 'an', 'injury', 'to', 'the', 'cause', 'which', 'he', 'have', 'it', 'in', 'his', 'heart', 'to', 'serve'], ['first', 'and', 'foremost', 'he', 'draw', 'forth', 'from', 'a', 'case', 'in', 'the', 'corner', 'of', 'his', 'lodging', 'a', 'brass', 'buttoned', 'blue', 'coat', 'of', 'a', 'popular', 'cut', 'and', 'fall', 'to', 'beat', 'it', 'over', 'the', 'shoulder', 'and', 'down', 'the', 'back', 'with', 'a', 'yard', 'stick', 'as', 'if', 'he', 'have', 'under', 'his', 'hand', 'the', 'body', 'and', 'person', 'of', 'his', 'dire', 'enemy', 'in', 'the', 'world', 'then', 'he', 'twist', 'the', 'right', 'arm', 'up', 'and', 'dash', 'at', 'the', 'place', 'where', 'the', 'rib', 'might', 'have', 'be', 'then', 'he', 'fall', 'upon', 'the', 'breast', 'and', 'pumelled',

In [9]:
target_vectors2 = load_target_vectors_for_corpus(corpus=2)

[['oui', 'yes', 'i', 'mumble', 'the', 'linguistic', 'transition', 'now', 'in', 'limbo'], ['let', 'be', 'say', 'you', 'be', 'contentedly', 'cook', 'hot', 'dog', 'on', 'one', 'of', 'the', 'park', 's', 'rusted', 'barbecue', 'grill', 'at', 'high', 'noon', 'on', 'this', 'oven', 'hot', 'day'], ['certain', 'kind', 'of', 'lighting', 'refuse', 'to', 'lie', 'about', 'the', 'relentless', 'ineluctableprocess', 'of', 'ageing'], ['within', 'the', 'first', '50', 'yard', 'diduryk', 's', 'man', 'come', 'under', 'heavy', 'fire'], ['second', 'there', 'be', 'significant', 'change', 'in', 'power', 'relation', 'within', 'the', 'community', 'and', 'between', 'the', 'tribal', 'and', 'federal', 'government']]
[ 1.16241479e-03  1.42612867e-03  4.94861160e-04  1.33493997e-03
  7.25217746e-04  3.35817895e-04  1.64562475e-03  5.58590691e-04
 -6.51073002e-04 -1.04540889e-03  5.11674560e-04  7.60363590e-04
 -7.10135035e-04 -9.30519309e-04  4.32367844e-04  2.25653319e-04
 -4.44546924e-04 -9.43739375e-04  1.42442177e-

In [10]:
print(target_vectors1['lass'])
print('---')
print(target_vectors2['lass'])

[-0.0324707   0.05541992 -0.05444336  0.02087402  0.27539062 -0.03417969
 -0.26367188  0.0246582   0.04931641  0.22070312  0.11669922 -0.17285156
 -0.171875   -0.1875     -0.12451172  0.30078125 -0.05249023 -0.03271484
 -0.03979492 -0.06494141 -0.01507568  0.2265625  -0.15332031 -0.34179688
  0.38085938 -0.23046875 -0.14941406  0.24511719 -0.02636719 -0.22460938
  0.09130859  0.23828125  0.17382812  0.19824219 -0.22070312 -0.10791016
 -0.00842285  0.10693359  0.11523438  0.22363281 -0.0078125  -0.08886719
  0.34179688  0.15136719  0.453125   -0.13867188  0.09521484  0.20605469
  0.07910156  0.07958984  0.0625      0.11767578  0.24609375  0.09570312
 -0.16015625 -0.10009766 -0.15429688 -0.296875   -0.24121094 -0.04638672
 -0.05664062 -0.03076172 -0.08203125 -0.07617188 -0.23144531  0.14160156
  0.06689453  0.17773438  0.26171875  0.20996094  0.21679688  0.16015625
 -0.04321289  0.10351562 -0.22949219  0.02832031 -0.13378906 -0.23535156
 -0.1328125  -0.00891113 -0.13183594  0.10253906  0

In [11]:
result_dict = {}
for word in targets:
    cosine_distance = scipy_distances.cosine(target_vectors1[word], target_vectors2[word])
    result_dict[word] = cosine_distance
    print(f'{cosine_distance}')

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
