In [1]:
import re
import numpy as np
import pandas as pd
import pickle
import string
from tqdm import tqdm
from collections import Counter
from itertools import chain
from nltk.corpus import brown
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

PATH = 'data/'

# Create Vocabulary

In [2]:
sentences = [[word.lower() for word in sent
              if word not in string.punctuation]
             for sent in brown.sents()]
words = list(chain(*sentences))

In [3]:
score_df = pd.read_csv(PATH+'human_judged.csv')

In [4]:
table_1_clean = pd.concat(
    [score_df['word_1'], score_df['word_2']]
).drop_duplicates().tolist()

In [5]:
W_counter = Counter(words).most_common(5000)
W = [i[0] for i in W_counter]
W += list(set(table_1_clean) - set(W))
W_freq = W_freq = np.hstack([
    np.array([i[1] for i in W_counter]),
    np.ones(len(W)-5000)
])

In [6]:
mapping = dict(zip(W, range(len(W))))
inverse_mapping = dict(zip(range(len(W)), W))

W_mapped = [mapping[w] for w in W]

In [7]:
# five most/least popular words
[i for i in W[:5]], [i for i in W[-5:]]

(['the', 'of', 'and', 'to', 'a'],
 ['asylum', 'magician', 'pillow', 'implement', 'cemetery'])

In [8]:
# delete all oov words
sentences_clear = [[word for word in sent if word in W] for sent in sentences]

# Create Word-Context pairs

In [9]:
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

batch_size = 512
num_batches = len(sentences_clear)//batch_size+int(len(sentences_clear)%batch_size!=0)
big_sum = Counter([])
for batch in tqdm(chunks(sentences_clear, batch_size), total=num_batches):
    tdf=[]
    for sent in batch:
        for i in range(len(sent)-1):
            tdf.append((sent[i], sent[i+1]))
    big_sum += Counter(tdf)
big_sum

100%|██████████| 112/112 [00:01<00:00, 61.41it/s]


Counter({('the', 'county'): 36,
         ('county', 'grand'): 1,
         ('grand', 'jury'): 10,
         ('jury', 'said'): 8,
         ('said', 'friday'): 5,
         ('friday', 'an'): 1,
         ('an', 'investigation'): 7,
         ('investigation', 'of'): 15,
         ('of', 'recent'): 6,
         ('recent', 'primary'): 1,
         ('primary', 'election'): 2,
         ('election', 'produced'): 1,
         ('produced', '``'): 1,
         ('``', 'no'): 109,
         ('no', 'evidence'): 15,
         ('evidence', "''"): 3,
         ("''", 'that'): 25,
         ('that', 'any'): 32,
         ('any', 'took'): 1,
         ('took', 'place'): 18,
         ('the', 'jury'): 38,
         ('jury', 'further'): 1,
         ('further', 'said'): 2,
         ('said', 'in'): 33,
         ('in', 'that'): 183,
         ('that', 'the'): 1440,
         ('the', 'city'): 158,
         ('city', 'executive'): 1,
         ('executive', 'committee'): 5,
         ('committee', 'which'): 4,
         ('which', 'ha

In [10]:
pairs = big_sum.most_common()

In [11]:
values_arr, word_arr, context_arr = [], [], []
for p in pairs:
    values_arr.append(p[1])
    word_arr.append(mapping[p[0][0]])
    context_arr.append(mapping[p[0][1]])

# Matrix operations
### (Word-Context Vector Model, PPMI, PCA)

In [12]:
n_a_b = csr_matrix((values_arr,(word_arr, context_arr)), dtype=np.int32)

In [13]:
total_words = W_freq.sum()
p_a = W_freq.reshape(-1,1) / total_words
p_b = W_freq.reshape(1,-1) / total_words
p_a_b = n_a_b / n_a_b.sum()
res = np.squeeze(np.log2((p_a_b /(p_a * p_b))))
res = np.where(res > 0, res, 0)

  res = np.squeeze(np.log2((p_a_b /(p_a * p_b))))


In [14]:
pca_dict = dict()
for i in [10,100,300]:
    pca = PCA(n_components=i)
    pca_dict[i] = pca.fit_transform(res)

# Evaluation

In [15]:
score_df['word_1_num'] = score_df['word_1'].map(mapping)
score_df['word_2_num'] = score_df['word_2'].map(mapping)
score_df.head(3)

Unnamed: 0,word_1,word_2,human_judged,word_1_num,word_2_num
0,cord,smile,0.02,5000,1927
1,rooster,voyage,0.04,5003,5018
2,noon,string,0.04,4065,5017


In [16]:
def compute_cos_sim(df, ind1, ind2):
    return cosine_similarity(df[ind1].reshape(1,-1), df[ind2].reshape(1,-1))[0][0]

In [17]:
score_df['pure_bigrams'] = score_df.apply(
    lambda x: compute_cos_sim(n_a_b, x['word_1_num'], x['word_2_num']),
    axis=1
)

score_df['ppmi'] = score_df.apply(
    lambda x: compute_cos_sim(res, x['word_1_num'], x['word_2_num']),
    axis=1
)

for i in [10,100,300]:
    score_df[f'pca_{i}'] = score_df.apply(
        lambda x: compute_cos_sim(pca_dict[i], x['word_1_num'], x['word_2_num']),
        axis=1
    )

In [18]:
score_df.to_csv(PATH+'score_df.csv', index=False)

In [19]:
corr = score_df.drop(['word_1_num','word_2_num'], axis=1).corr(method='pearson').round(3)
corr_with_humans = corr['human_judged'].drop('human_judged').sort_values(ascending=False)
corr_with_humans

pca_100         0.407
pca_300         0.347
pca_10          0.180
ppmi            0.093
pure_bigrams    0.074
Name: human_judged, dtype: float64

In [26]:
# save PCA300
with open(PATH+'lsa.txt', 'w') as f:
    f.write(f"{pca_dict[300].shape[0]} {pca_dict[300].shape[1]}\n")
    for i in range(pca_dict[300].shape[0]):
        f.write(inverse_mapping[i])
        for i in pca_dict[300][i]:
            f.write(' ' + str(i))
        f.write('\n')