In [1]:
from sklearn.linear_model import LinearRegression
import numpy as np
import bz2
import gensim.models.keyedvectors as kv
import collections
import scipy.spatial.distance as sd

In [2]:
drama_model = kv.KeyedVectors.load_word2vec_format(bz2.open("../models/drama-17412-reduced.w2v.bz2"))

In [3]:
poetry_model = kv.KeyedVectors.load_word2vec_format(bz2.open("../models/poetry-56817-reduced.w2v.bz2"))

In [4]:
# extract common vocabulary
common_vocab = [word for word in drama_model.vocab if word in poetry_model.vocab]
common_vectors_drama = drama_model[common_vocab]
common_vectors_poetry = poetry_model[common_vocab]
print("common vocabulary: {0}".format(len(common_vocab)))

common vocabulary: 59239


In [5]:
lin_model = LinearRegression()
lin_model.fit(common_vectors_drama, 
              common_vectors_poetry)

LinearRegression()

In [6]:
aligned_vectors = collections.OrderedDict()
for idx, word in enumerate(common_vocab):
        wv = lin_model.predict(common_vectors_poetry[idx].reshape(1, -1))
        aligned_vectors[word] = wv.reshape(-1)

In [7]:
common_vocab = list(aligned_vectors.keys())
aligned_vectors = np.array(list(aligned_vectors.values()))

In [8]:
def neighbors(word,model,k=100):
    neighbors=list()
    idx = common_vocab.index(word)
    vec = model[idx]
    scores = sd.cdist([vec], model, "cosine")[0]
    sorted_ids = np.argsort(scores)
    for i in range(1, k + 1):
        neighbors.append((common_vocab[sorted_ids[i]], scores[sorted_ids[i]]))
    return neighbors

In [9]:
def pairwise_distance(word1,word2,model):
    idx1 = common_vocab.index(word1)
    idx2 = common_vocab.index(word2)
    dist = sd.cdist([model[idx1]], [model[idx2]], "cosine")[0]
    return dist[0]

In [10]:
def fitted_neighbors(word,k=10):
    n0 = neighbors(word,common_vectors_drama,k)
    n1 = neighbors(word,common_vectors_poetry,k)
    for n,m in enumerate([n0,n1]):
        print("model: {0}".format(n))
        for w,v in m:
            print(w,v)
        print("\n")
    print("fitted model:")
    
    for w,v in neighbors(word,aligned_vectors,k):
        if w in [x[0] for x in n0] and w in [x[0] for x in n1]:
            print(w,v,"both")
        elif w in [x[0] for x in n0]:
            print(w,v,"m0")
        elif w in [x[0] for x in n1]:
            print(w,v,"m1")
        else:
            print(w,v)

In [11]:
fitted_neighbors("blossom")

model: 0
bloom 0.5044440073448707
fragrance 0.5677726187657668
flower 0.5797588856030544
bud 0.6049989790459576
blossomed 0.6076058432176916
blooming 0.6373277095070187
chow 0.6603967268099505
budding 0.6604322307464214
crestless 0.663437859019554
bough 0.6781377936233154


model: 1
bud 0.24118634940661
flower 0.3021936429890745
bloom 0.3079461268713415
fragrance 0.44945727074790653
blossomed 0.45477104058695694
budding 0.4674032405124695
leaf 0.48904413398097824
fragrant 0.5170225794672134
dew 0.5348886788240482
april 0.5598084674488641


fitted model:
bud 0.2368840866762928 both
bloom 0.24873244719099474 both
flower 0.28641759050669025 both
fragrance 0.38196101129357996 both
fragrant 0.38419757980748204 m1
blossomed 0.39885495441981467 both
budding 0.4337430210077733 both
bough 0.449413250941182 m0
floweret 0.44993472963197456
gladden 0.45416357427156406
