In [1]:
import os
import time
import tqdm
import operator
import datetime
import pickle as pkl
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [55]:
## Make sure the following folders have been included in the working director:
## Best_Trained: which contains the best word embedding learned from the best logistic regression and neural net models
## Best_Pretrained: which contains the word embedding learned of pre-trained embeddings
## pickle: which will include the following two pickle files: 1) 10000_id2token.pkl 2) 10000_token2id.pkl
## Fine_Tune_Weight: which will include the following files: 1) government_model.pt 2) fiction_model.pt 3) telephone_model.pt
## 4) slate_model.pt 5) travel_model.pt

In [3]:
import load_data

In [4]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
## Set up emvironment
CURR_PATH = os.getcwd()
WEIGHT_PATH = '/Best_Trained/'
PRETRAINED_WEIGHT_PATH = '/Best_Pretrained/'
FINR_WEIGHT_PATH = '/Fine_Tune_Weight/'
VOCAB_SIZE = 10000

### for 3.1

In [6]:
## Load trained weight
Best_lr = torch.load(CURR_PATH + WEIGHT_PATH + '10000_50_MUL_log-reg.pt')
Best_nn = torch.load(CURR_PATH + WEIGHT_PATH + '10000_500_MUL_neural-net.pt')

embedding_lr = Best_lr['encoder.embed.weight']
embedding_nn = Best_nn['encoder.embed.weight']

In [7]:
## laod token_index loop up tables
id2token = pkl.load(open('pickle/'+str(VOCAB_SIZE)+'_id2token.pkl', 'rb'))
token2id = pkl.load(open('pickle/'+str(VOCAB_SIZE)+'_token2id.pkl', 'rb'))

In [8]:
## Calculate similarity matrix
simi_lr = cosine_similarity(embedding_lr)
simi_nn = cosine_similarity(embedding_nn)

  np.sqrt(norms, norms)
  np.sqrt(norms, norms)


In [126]:
def expand(simi_matrix, k=10):
    '''
    Expand the similarity matrix to a sorted array with similarity socres
    '''
    res = []
    for i in range(len(simi_matrix)-1):
        for j in range(i+1,len(simi_matrix[0])):
            if simi_matrix[i][j] != 0 and simi_matrix[i][j] <= 0.99:
                res.append(simi_matrix[i][j])
        if i % 500 == 0:
            print('finished {}'.format(i))
    print('Done expanding')
    return sorted(res)

def generate_word_pair(matrix, threshold, id2_token):
    '''
    Generate top 10 similar tokens with the 
    '''
    res = []
    for i in range(len(matrix)-1):
        for j in range(i+1, len(matrix[0])):
            if matrix[i][j] >= threshold:
                res.append([id2_token[i], id2_token[j]])
        if i % 500 == 0:
            print('finished {}'.format(i))
    print('Done token pair generation')
    return res

In [127]:
## Generate sorted array of similarity scores
sorted_lr = expand(simi_lr)
sorted_nn = expand(simi_nn)

finished 0
finished 500
finished 1000
finished 1500
finished 2000
finished 2500
finished 3000
finished 3500
finished 4000
finished 4500
finished 5000
finished 5500
finished 6000
finished 6500
finished 7000
finished 7500
finished 8000
finished 8500
finished 9000
finished 9500
Done expanding
finished 0
finished 500
finished 1000
finished 1500
finished 2000
finished 2500
finished 3000
finished 3500
finished 4000
finished 4500
finished 5000
finished 5500
finished 6000
finished 6500
finished 7000
finished 7500
finished 8000
finished 8500
finished 9000
finished 9500
Done expanding


In [128]:
## Generate top 10 most similar token pairs
res_lr = generate_word_pair(simi_lr, sorted_lr[-10], id2token)
res_nn = generate_word_pair(simi_nn, sorted_nn[-10], id2token)

finished 0
finished 500
finished 1000
finished 1500
finished 2000
finished 2500
finished 3000
finished 3500
finished 4000
finished 4500
finished 5000
finished 5500
finished 6000
finished 6500
finished 7000
finished 7500
finished 8000
finished 8500
finished 9000
finished 9500
Done token pair generation
finished 0
finished 500
finished 1000
finished 1500
finished 2000
finished 2500
finished 3000
finished 3500
finished 4000
finished 4500
finished 5000
finished 5500
finished 6000
finished 6500
finished 7000
finished 7500
finished 8000
finished 8500
finished 9000
finished 9500
Done token pair generation


In [130]:
## top 10 similar token pairs for linear regression
res_lr

[['under', 'wakeboard'],
 ['worker', 'entertains'],
 ['Nobody', 'shower'],
 ['sad', 'joyously'],
 ['swims', 'cooler'],
 ['great', 'Island'],
 ['bars', 'sanded'],
 ['snowcapped', 'intimate'],
 ['gliding', 'beaming'],
 ['sponge', 'mountaintops']]

In [131]:
## top 10 similar token pairs for neural network
res_nn

[['sleeping', 'Nobody'],
 ['sleeping', 'sleeps'],
 ['sleeping', 'movies'],
 ['no', 'Nobody'],
 ['lunch', 'hour'],
 ['birthday', 'joyously'],
 ['happily', 'joyously'],
 ['cats', 'sleep'],
 ['conference', 'Hispanic'],
 ['siblings', 'joyously']]

### for 3.4

In [135]:
## Load pre-trained weight
Best_pretrained = torch.load(CURR_PATH + PRETRAINED_WEIGHT_PATH + '10000_pretrained_DIRECT_neural-net.pt')
embedding_pre = Best_pretrained['encoder.embed.weight']

In [138]:
## Calculate similarity matrix
pretrained_simi = cosine_similarity(embedding_pre)

  np.sqrt(norms, norms)


In [158]:
## Generate sorted array of similarities
sorted_pre = expand(pretrained_simi)

finished 0
finished 500
finished 1000
finished 1500
finished 2000
finished 2500
finished 3000
finished 3500
finished 4000
finished 4500
finished 5000
finished 5500
finished 6000
finished 6500
finished 7000
finished 7500
finished 8000
finished 8500
finished 9000
finished 9500
Done expanding


In [166]:
## Generate top 10 similar token pairs for pre-trained embedding
res_pretrained = generate_word_pair(pretrained_simi, sorted_pre[-10], id2token)

finished 0
finished 500
finished 1000
finished 1500
finished 2000
finished 2500
finished 3000
finished 3500
finished 4000
finished 4500
finished 5000
finished 5500
finished 6000
finished 6500
finished 7000
finished 7500
finished 8000
finished 8500
finished 9000
finished 9500
Done token pair generation


In [167]:
## Present the result for pretrained embedding
res_pretrained

[['13', '14'],
 ['14', '15'],
 ['14', '16'],
 ['14', '19'],
 ['19', '23'],
 ['19', '21'],
 ['22', '23'],
 ['22', '21'],
 ['53', '56'],
 ['23', '21']]

### for 3.3

In [21]:
## Load fine tuned embedding
gov_embedding = torch.load(CURR_PATH + FINR_WEIGHT_PATH + 'government_model.pt')
fic_embedding = torch.load(CURR_PATH + FINR_WEIGHT_PATH + 'fiction_model.pt')
slate_embedding = torch.load(CURR_PATH + FINR_WEIGHT_PATH + 'slate_model.pt')
tele_embedding = torch.load(CURR_PATH + FINR_WEIGHT_PATH + 'telephone_model.pt')
tra_embedding = torch.load(CURR_PATH + FINR_WEIGHT_PATH + 'travel_model.pt')

gov_embedding = gov_embedding['encoder.embed.weight'].numpy().tolist()
fic_embedding = fic_embedding['encoder.embed.weight'].numpy().tolist()
slate_embedding = slate_embedding['encoder.embed.weight'].numpy().tolist()
tele_embedding = tele_embedding['encoder.embed.weight'].numpy().tolist()
tra_embedding = tra_embedding['encoder.embed.weight'].numpy().tolist()

In [22]:
def cosine_simi(vec1, vec2):
    return np.dot(vec1, vec2) / (np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2)))

In [49]:
## calculate similarity matrix
def calculate_simi(arr1, arr2):
    res = {}
    for i in range(len(arr1)):
        res[i] = cosine_simi(arr1[i], arr2[i])
    return res

def top_ten_change(simi, id2token, k=12):
    res = {}
    for i in range(len(simi)):
        res[i] = simi[i]
    sorted_res = sorted(res.items(), key=lambda kv: kv[1])
    result = []
    counter = 0
    for key in sorted_res:
        if counter < k:
            result.append(id2token[key[0]])
            counter += 1
        else:
            break
    return result

In [24]:
gov_simi = calculate_simi(gov_embedding, embedding_nn)
fic_simi = calculate_simi(fic_embedding, embedding_nn)
slate_simi = calculate_simi(slate_embedding, embedding_nn)
tele_simi = calculate_simi(tele_embedding, embedding_nn)
tra_simi = calculate_simi(tra_embedding, embedding_nn)

  


In [50]:
## Top change for gov
top_ten_change(gov_simi, id2token)

['<pad>',
 'required',
 'ignored',
 'executive',
 'success',
 'deal',
 'senior',
 'provides',
 'just',
 'did',
 'largest',
 'same']

In [51]:
## Top change for fic
top_ten_change(fic_simi, id2token)

['<pad>',
 'people',
 'mist',
 'hit',
 'paid',
 'contents',
 'reach',
 'during',
 'general',
 'changing',
 'hearing',
 'many']

In [52]:
## Top change for slate
top_ten_change(slate_simi, id2token)

['<pad>',
 'full',
 'operation',
 'image',
 'trying',
 'era',
 'seemingly',
 'dreams',
 'determine',
 'players',
 '%',
 'garden']

In [53]:
top_ten_change(tele_simi, id2token)

['<pad>',
 'peace',
 'aim',
 'expert',
 'hated',
 'got',
 'price',
 'never',
 'meet',
 'Donald',
 'entire',
 'hand']

In [54]:
top_ten_change(tra_simi, id2token)

['<pad>',
 'nothing',
 'never',
 'most',
 'tourists',
 'their',
 'wo',
 'Many',
 'any',
 'anywhere',
 'houses',
 'make']