In [1]:
import numpy as np
from collections import defaultdict
import math
import random
import pickle
import argparse
import copy

import sys
sys.path.append('../')
from utils import commons
from utils import store
from utils import vector_utils

print_every = 500000
print_status = True

In [2]:
def compute_idf(data, min_count):
    """
    IDF is used to weight the term vectors.
    """
    if print_status:
        print('Computing IDF')
    counts = defaultdict(float)
    for i in range(len(data)):
        if print_status and i % print_every == 0:
            print('Counting ' + str(i))
        line = set(data[i].split())
        for feature in line:
            counts[feature]+=1
    delete = [feature for feature in counts if counts[feature] <= min_count]
    for feature in delete:
        del counts[feature]
    for feature in counts:
        counts[feature]=math.sqrt(len(data)/counts[feature])
    return counts

In [3]:
def initialize_vectors(features, idf, dim, seeds):
    """
    This creates the initial random projection for each feature. You create initial
    vector with dimensionality dim. Dim should be in the range 500-1000. You then
    select n (n is determined by seeds) elements and set the value to 1 or -1
    randomly. This performs the random projection.
    """
    vectors = {}

    for i in range(len(features)):
        if print_status and i % print_every == 0:
            print('Initializing ' + str(i))
        feature = features[i]
        vector=np.zeros(dim)
        sample=random.sample(range(0,dim),seeds) # Grab the n random elements for random projection
        for index in sample:
            vector[index]=random.choice([-1.0,1.0]) # Set each element to +1 or -1 for random projection
        #vector=vector * idf[feature] # Weight based on IDF
        vectors[feature]=vector
    return vectors

In [33]:
def train_vectors(data, vectors):
    """
    For each feature in each line, add the feature to all other features. Conceptually,
    each co-occurance of two features moves the two features closer together.
    """
    trained_vectors=copy.deepcopy(vectors)
    for i in range(len(data)):
        if i == 1000000:
            print('breaking')
            break
        if print_status and i % print_every == 0:
            print('Processed ' + str(i))
        line = data[i].split()
        line=[feature for feature in line if feature in vectors]

        for feature_1 in line:
            for feature_2 in line:
                if feature_1 != feature_2:
                    trained_vectors[feature_1]+=vectors[feature_2] # This is it for the training! Simple addition.
    for feature in trained_vectors:
        trained_vectors[feature] = vector_utils.normalize_vector(trained_vectors[feature])
    return trained_vectors

In [5]:
from config_files.ri_config import  config
in_file = config['in_file']
out_file = config['out_dir']

In [6]:
min_count=10

In [7]:
data=commons.get_data(in_file)

In [12]:
idf=compute_idf(data, min_count)

Computing IDF
Counting 0
Counting 100000
Counting 200000
Counting 300000
Counting 400000
Counting 500000
Counting 600000
Counting 700000
Counting 800000
Counting 900000
Counting 1000000
Counting 1100000
Counting 1200000
Counting 1300000
Counting 1400000
Counting 1500000
Counting 1600000
Counting 1700000
Counting 1800000
Counting 1900000
Counting 2000000
Counting 2100000
Counting 2200000
Counting 2300000
Counting 2400000
Counting 2500000
Counting 2600000
Counting 2700000
Counting 2800000
Counting 2900000
Counting 3000000
Counting 3100000
Counting 3200000
Counting 3300000
Counting 3400000
Counting 3500000
Counting 3600000
Counting 3700000
Counting 3800000
Counting 3900000
Counting 4000000
Counting 4100000
Counting 4200000
Counting 4300000
Counting 4400000
Counting 4500000
Counting 4600000
Counting 4700000
Counting 4800000
Counting 4900000
Counting 5000000
Counting 5100000
Counting 5200000
Counting 5300000
Counting 5400000
Counting 5500000
Counting 5600000
Counting 5700000
Counting 580000

In [14]:
print(idf['fish'])

18.208521695749212


In [15]:
dim=500
seeds=20

In [16]:
vectors = initialize_vectors(list(idf.keys()), idf, dim, seeds)

Initializing 0
Initializing 100000
Initializing 200000


In [34]:
vectors_trained = {}
for i in range(2):
    vectors_trained = train_vectors(data, vectors)

Processed 0
Processed 100000
Processed 200000
Processed 300000
Processed 400000
Processed 500000
Processed 600000
Processed 700000
Processed 800000
Processed 900000
breaking
Processed 0
Processed 100000
Processed 200000
Processed 300000
Processed 400000
Processed 500000
Processed 600000
Processed 700000
Processed 800000
Processed 900000
breaking


In [35]:
vectors_trained['the']

array([-3.02313305e-02, -1.19897765e-02, -1.31941017e-01,  4.73763786e-04,
       -1.41510148e-02, -9.21394563e-03,  7.25964712e-03, -2.35215387e-03,
        2.38621652e-02,  1.70660264e-02,  2.07623728e-02,  5.18631249e-03,
       -1.69212602e-02,  4.79852941e-03,  4.72399448e-03, -1.56535254e-02,
       -3.18767760e-02,  1.05546587e-02, -3.21544049e-03, -2.47320445e-04,
       -2.23870328e-02,  1.57695398e-02,  8.60036018e-03,  8.30634095e-03,
        8.07403736e-03,  1.12970779e-02,  7.80034584e-03, -3.95191701e-02,
       -5.17250430e-02, -1.68432457e-02, -2.96539137e-02,  1.21863693e-02,
        1.19618397e-01,  5.61026423e-03, -3.07113208e-03, -8.76636979e-03,
       -2.97755137e-03,  9.25716490e-03, -1.43763593e-02,  5.81244252e-03,
       -2.19939389e-02, -5.35937270e-04, -3.56980188e-03,  2.37494471e-02,
       -1.77524070e-02,  2.99211040e-03,  2.98497739e-02, -1.09322779e-02,
       -6.41110267e-03,  4.27907407e-02,  6.32319907e-03,  5.99456779e-03,
        1.41588895e-02,  

In [39]:
from scipy.spatial.distance import cosine

def cosine_similarity(v1, v2):
    if np.linalg.norm(v1) == 0 or np.linalg.norm(v2) == 0:
        return 0.
    return 1. - cosine(v1, v2)

In [67]:
res = {}
query = 'mice'
for term in vectors_trained:
    score = cosine_similarity(vectors_trained[query],vectors_trained[term])
    res[term]=score

In [68]:
import operator
sorted_res = sorted(res.items(), key=operator.itemgetter(1),reverse=True)


In [69]:
for i in range(50):
    print(sorted_res[i])

('mice', 1.0)
('rats', 0.985224819984608)
('marked', 0.9796734367511785)
('animals', 0.9780356496141382)
('dogs', 0.9765347160034336)
('tissues', 0.9759405797746441)
('rat', 0.9751669923815269)
('contrast', 0.9751006783861932)
('mouse', 0.9749031251094362)
('decrease', 0.9743469667159469)
('alterations', 0.9740631385764345)
('addition', 0.9739759311587995)
('resulted', 0.973092467483888)
('murine', 0.9727150343991262)
('changes', 0.9725157720759887)
('monkeys', 0.9724332741275352)
('cultures', 0.9724178739513326)
('impaired', 0.9716866063353543)
('summary', 0.9709196465304253)
('adult', 0.9706670443023335)
('levels', 0.970598518451225)
('reductions', 0.9705437122176466)
('cells', 0.9703180785831913)
('hepatocytes', 0.9702784046782994)
('pigs', 0.9699979832784323)
('livers', 0.9699361218786154)
('elevated', 0.9693629053046202)
('horses', 0.9688823905457323)
('normal', 0.9688734796617067)
('brains', 0.9686748086535459)
('vitro', 0.9686313736523582)
('situ', 0.9684188950519449)
('cats', 0

In [70]:
import pickle

def pickle_dict(dict, out_dir, file_name):
    if not out_dir.endswith('/'):
        out_dir+='/'
    with open(out_dir+file_name, 'wb') as out:
        pickle.dump(dict, out, protocol=pickle.HIGHEST_PROTOCOL)

In [72]:
pickle_dict(vectors_trained,'./','first_pass')