In [2]:
%matplotlib inline
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

import Vector_Similarity


# Alternative corpora kept for reference; only one TRAIN_SET_PATH is active.
# TRAIN_SET_PATH = "20ng-no-stop.txt"
# TRAIN_SET_PATH = "r52-all-terms.txt"
# NOTE(review): TRAIN_SET_PATH is not referenced by any cell visible in this
# notebook excerpt - confirm whether it is still needed.
TRAIN_SET_PATH = "dataset/r8-no-stop.txt"

# Pre-trained GloVe vector files (relative paths).
GLOVE_6B_50D_PATH = "dataset/glove.6B.50d.txt"
# NOTE(review): the constant is named "840B" but the file it points to is the
# 6B / 300-d release - confirm which one is intended before relying on the name.
GLOVE_840B_300D_PATH = "dataset/glove.6B.300d.txt"
# Encoding used to decode the word column when a GloVe file is read as bytes.
encoding="utf-8"

In [3]:

from read_folds import FoldsTraining

# Split the data into two folds: fold 0 is used as the "train" query set,
# fold 1 as the held-out "test" query set.
folds_training = FoldsTraining(nb_folds=2)
queries_list_train, queries_list_test = (
    folds_training.queries[0],
    folds_training.queries[1],
)
paragraphs_dict = folds_training.paragraphs_dict

# Show one sample query record.
# Judging by the printed sample it is (query id, heading text, list of terms).
print(queries_list_train[20])


('enwiki:Heavy%20water/Effect%20on%20biological%20systems/Toxicity%20in%20humans', 'Heavy water Effect on biological systems Toxicity in humans', ['gist', 'consequence', 'effectuate', 'human being', 'human', 'organization', 'essence', 'toxic', 'water', 'water supply', 'weewee', 'set up', 'pee', 'body of water', 'scheme', 'impression', 'biolog', 'issue', 'heavi', 'system of rules', 'irrigate', 'piss', 'effect', 'homo', 'burden', 'event', 'upshot', 'result', 'water system', 'arrangement', 'organisation', 'man', 'force', 'core', 'outcome', 'system', 'urine', 'H2O', 'piddle'])


In [4]:
def _as_object_array(sequences):
    """Pack a list of token lists into a 1-D object array, one list per slot.

    ``np.array(list_of_lists)`` is not safe here: for ragged input recent
    NumPy releases raise ``ValueError`` instead of building an object array,
    and for equal-length input it silently builds a 2-D array of strings.
    Filling a pre-allocated object array element by element always yields
    shape ``(len(sequences),)``.
    """
    packed = np.empty(len(sequences), dtype=object)
    for idx, tokens in enumerate(sequences):
        packed[idx] = tokens
    return packed


# X: one token list per training query (element 2 of each query record).
X = _as_object_array([query[2] for query in queries_list_train])
# y: every paragraph body in the collection, in paragraphs_dict order.
y = _as_object_array(list(paragraphs_dict.values()))

print(X[0], y[0])
print("total x =", len(X), "y =", len(y))


['etymolog', 'chocol'] ['one', 'oldest', 'western', 'philosophi', 'human', 'right', 'product', 'natur', 'law', 'stem', 'differ', 'philosoph', 'religi', 'ground', 'theori', 'hold', 'human', 'right', 'codifi', 'moral', 'behavior', 'human', 'social', 'product', 'develop', 'process', 'biolog', 'social', 'evolut', 'associ', 'hume', 'human', 'right', 'also', 'describ', 'sociolog', 'pattern', 'rule', 'set', 'sociolog', 'theori', 'law', 'work', 'weber', 'approach', 'includ', 'notion', 'individu', 'societi', 'accept', 'rule', 'legitim', 'author', 'exchang', 'secur', 'econom', 'advantag', 'rawl', 'social', 'contract', 'two', 'theori', 'domin', 'contemporari', 'human', 'right', 'discuss', 'interest', 'theori', 'theori', 'interest', 'theori', 'argu', 'princip', 'function', 'human', 'right', 'protect', 'promot', 'certain', 'essenti', 'human', 'interest', 'theori', 'attempt', 'establish', 'valid', 'human', 'right', 'base', 'uniqu', 'human', 'capac', 'freedom']
total x = 388 y = 1815


In [5]:
# Reading a GloVe file may take a while and keeps every vector in RAM.
# The whole file is loaded because paragraphs as well as queries are embedded
# later; pass `vocabulary=all_words` if you only want the vectors of words
# that occur in the training queries.

import struct


def load_glove_vectors(path, vocabulary=None, text_encoding=encoding):
    """Read a GloVe text file into a ``{word: float32 vector}`` dict.

    Parameters
    ----------
    path : str
        File with one ``word v1 v2 ... vd`` record per line.
    vocabulary : collection of str, optional
        When given, only words contained in it are kept. ``None`` (default)
        keeps every word in the file.
    text_encoding : str
        Encoding used to decode the word column (the file is read as bytes).

    Returns
    -------
    dict
        Maps each kept word to a 1-D ``np.float32`` array.
    """
    vectors = {}
    with open(path, "rb") as infile:
        for line in infile:
            parts = line.split()
            if not parts:
                # Tolerate blank lines instead of failing on parts[0].
                continue
            word = parts[0].decode(text_encoding)
            if vocabulary is not None and word not in vocabulary:
                continue
            vectors[word] = np.array(parts[1:], dtype=np.float32)
    return vectors


# Words that occur in the training queries (usable as `vocabulary=` filter).
all_words = {w for words in X for w in words}
glove_small = load_glove_vectors(GLOVE_6B_50D_PATH)

# 300-d variant, disabled by default because of its memory footprint:
# glove_big = load_glove_vectors(GLOVE_840B_300D_PATH)
            

In [6]:

# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    """Embed token lists as the mean of their IDF-weighted word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its embedding vector. Must support ``len``, iteration
        over keys, ``in`` and item lookup (a plain dict does).

    Attributes
    ----------
    dim : int
        Length of the embedding vectors (0 when ``word2vec`` is empty).
    word2weight : defaultdict or None
        Word -> IDF weight, set by :meth:`fit`; ``None`` until then.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec) > 0:
            # Read the dimensionality from the mapping that was passed in,
            # not from a notebook-level global.
            first_word = next(iter(word2vec))
            self.dim = len(word2vec[first_word])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Learn IDF weights from ``X``, an iterable of token lists.

        ``y`` is ignored; it is accepted for scikit-learn style call sites.
        Returns ``self`` so the call can be chained.
        """
        # Documents are already tokenised, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one row per token list in ``X``.

        Each row is the mean of ``vector * weight`` over the tokens that have
        an embedding; tokens without an embedding are skipped. A token list
        with no embeddable token maps to a zero vector of length ``dim``.
        """
        return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])


In [7]:
# Fit one set of IDF weights on queries and paragraphs together, so both
# are embedded with the same weighting.
tfidf = TfidfEmbeddingVectorizer(glove_small)
tfidf.fit(np.append(X, y), y=None)

# Embed the training queries and the paragraphs with the shared vectorizer.
x_vec_X, y_vec_X = tfidf.transform(X), tfidf.transform(y)


In [8]:
import operator

topn = 20

# Paragraph embeddings do not depend on the query, so compute them once
# instead of once per (query, paragraph) pair.
paragraph_vectors = {
    par_id: tfidf.transform([par_text])[0]
    for par_id, par_text in paragraphs_dict.items()
}

# query id -> list of [paragraph id, score], best first, at most `topn` long.
final_scores_tsss = {}
final_scores_cosine = {}
score_of = operator.itemgetter(1)

for progress, query in enumerate(queries_list_test, start=1):
    query_id, q = query[0], query[2]
    q_vec = tfidf.transform([q])[0]

    scores_tsss = {}
    scores_cosine = {}
    for par_id, par_text_vec in paragraph_vectors.items():
        scores_tsss[par_id] = Vector_Similarity.TS_SS(q_vec, par_text_vec)
        scores_cosine[par_id] = Vector_Similarity.Cosine(q_vec, par_text_vec)

    # NOTE(review): TS-SS is usually described as a dissimilarity (smaller =
    # closer); confirm that ranking it in descending order is intended.
    # NOTE(review): the captured output shows a runtime warning from the
    # cosine division, presumably for all-zero vectors; NaN scores make the
    # sort order unreliable - verify and filter if needed.
    sorted_scores_tsss = sorted(scores_tsss.items(), key=score_of, reverse=True)[:topn]
    sorted_scores_cosine = sorted(scores_cosine.items(), key=score_of, reverse=True)[:topn]

    for par_id, score in sorted_scores_tsss:
        final_scores_tsss.setdefault(query_id, []).append([par_id, score])

    # Use the cosine ranking here (previously the TS-SS list was iterated
    # again, so both result sets were identical).
    for par_id, score in sorted_scores_cosine:
        final_scores_cosine.setdefault(query_id, []).append([par_id, score])

    print("progress:", "%.3f" % round(progress/len(queries_list_test), 3))



  result = InnerProduct(vec1, vec2) / (VectorSize(vec1) * VectorSize(vec2))


progress: 0.005


progress: 0.009


progress: 0.014


progress: 0.019


progress: 0.023


progress: 0.028


progress: 0.033


progress: 0.037


progress: 0.042


progress: 0.047


progress: 0.051


progress: 0.056


progress: 0.061


progress: 0.065


progress: 0.070


progress: 0.075


progress: 0.079


progress: 0.084


progress: 0.089


progress: 0.093


progress: 0.098


progress: 0.103


progress: 0.107


progress: 0.112


progress: 0.117


progress: 0.121


progress: 0.126


progress: 0.131


progress: 0.136


progress: 0.140


progress: 0.145


progress: 0.150


progress: 0.154


progress: 0.159


progress: 0.164


progress: 0.168


progress: 0.173


progress: 0.178


progress: 0.182


progress: 0.187


progress: 0.192


progress: 0.196


progress: 0.201


progress: 0.206


progress: 0.210


progress: 0.215


progress: 0.220


progress: 0.224


progress: 0.229


progress: 0.234


progress: 0.238


progress: 0.243


progress: 0.248


progress: 0.252


progress: 0.257


progress: 0.262


progress: 0.266


progress: 0.271


progress: 0.276


progress: 0.280


progress: 0.285


progress: 0.290


progress: 0.294


progress: 0.299


progress: 0.304


progress: 0.308


progress: 0.313


progress: 0.318


progress: 0.322


progress: 0.327


progress: 0.332


progress: 0.336


progress: 0.341


progress: 0.346


progress: 0.350


progress: 0.355


progress: 0.360


progress: 0.364


progress: 0.369


progress: 0.374


progress: 0.379


progress: 0.383


progress: 0.388


progress: 0.393


progress: 0.397


progress: 0.402


progress: 0.407


progress: 0.411


progress: 0.416


progress: 0.421


progress: 0.425


progress: 0.430


progress: 0.435


progress: 0.439


progress: 0.444


progress: 0.449


progress: 0.453


progress: 0.458


progress: 0.463


progress: 0.467


progress: 0.472


progress: 0.477


progress: 0.481


progress: 0.486


progress: 0.491


progress: 0.495


progress: 0.500


progress: 0.505


progress: 0.509


progress: 0.514


progress: 0.519


progress: 0.523


progress: 0.528


progress: 0.533


progress: 0.537


progress: 0.542


progress: 0.547


progress: 0.551


progress: 0.556


progress: 0.561


progress: 0.565


progress: 0.570


progress: 0.575


progress: 0.579


progress: 0.584


progress: 0.589


progress: 0.593


progress: 0.598


progress: 0.603


progress: 0.607


progress: 0.612


progress: 0.617


progress: 0.621


progress: 0.626


progress: 0.631


progress: 0.636


progress: 0.640


progress: 0.645


progress: 0.650


progress: 0.654


progress: 0.659


progress: 0.664


progress: 0.668


progress: 0.673


progress: 0.678


progress: 0.682


progress: 0.687


progress: 0.692


progress: 0.696


progress: 0.701


progress: 0.706


progress: 0.710


progress: 0.715


progress: 0.720


progress: 0.724


progress: 0.729


progress: 0.734


progress: 0.738


progress: 0.743


progress: 0.748


progress: 0.752


progress: 0.757


progress: 0.762


progress: 0.766


progress: 0.771


progress: 0.776


progress: 0.780


progress: 0.785


progress: 0.790


progress: 0.794


progress: 0.799


progress: 0.804


progress: 0.808


progress: 0.813


progress: 0.818


progress: 0.822


progress: 0.827


progress: 0.832


progress: 0.836


progress: 0.841


progress: 0.846


progress: 0.850


progress: 0.855


progress: 0.860


progress: 0.864


progress: 0.869


progress: 0.874


progress: 0.879


progress: 0.883


progress: 0.888


progress: 0.893


progress: 0.897


progress: 0.902


progress: 0.907


progress: 0.911


progress: 0.916


progress: 0.921


progress: 0.925


progress: 0.930


progress: 0.935


progress: 0.939


progress: 0.944


progress: 0.949


progress: 0.953


progress: 0.958


progress: 0.963


progress: 0.967


progress: 0.972


progress: 0.977


progress: 0.981


progress: 0.986


progress: 0.991


progress: 0.995


progress: 1.000


In [9]:
from trec_car.format_runs import *

# Flatten the per-query TS-SS result lists into RankingEntry rows.
# Each inner item is [paragraph id, score]; rank is its 1-based position
# within the query's list.
output_entries_tsss = [
    RankingEntry(query_id, par_id, rank, score)
    for query_id, paragraphs in final_scores_tsss.items()
    for rank, (par_id, score) in enumerate(paragraphs, start=1)
]

print(len(output_entries_tsss))

4280


In [10]:
# Flatten the per-query cosine result lists into RankingEntry rows;
# the rank is the 1-based position inside each query's list.
output_entries_cosine = []
for query_id, ranked in final_scores_cosine.items():
    for position, pair in enumerate(ranked, start=1):
        output_entries_cosine.append(
            RankingEntry(query_id, pair[0], position, pair[1])
        )

In [11]:
def save_scores_to_file(output_entries, filename="test.out", exp_name='test'):
    """Write ranking entries to ``filename`` via ``format_run``.

    Parameters
    ----------
    output_entries : iterable
        Ranking entries to write; materialised into a list before being
        handed to ``format_run``.
    filename : str
        Output path; the file is created or overwritten, encoded as UTF-8.
    exp_name : str
        Experiment name passed through to ``format_run`` (default ``'test'``,
        the value that used to be hard-coded).
    """
    # The `with` block closes the file, so no explicit close() is needed.
    with open(filename, mode='w', encoding='UTF-8') as f:
        format_run(f, list(output_entries), exp_name=exp_name)

In [15]:
# Spot-check a single entry before writing the run files.
o = output_entries_tsss[11]
print(o.query_id, o.paragraph_id, o.rank, o.score)

# Write one run file per similarity measure.
for entries, run_path in (
    (output_entries_tsss, "word2vec_tsss_synonym_top20.out"),
    (output_entries_cosine, "word2vec_cosine_synonym_top20.out"),
):
    save_scores_to_file(entries, filename=run_path)

enwiki:Dismissal%20(employment)/Reasons 04a2363ff6e6c4066c2c7553bb9961e199ed2fac 12 1855.8868142557226
