In [2]:
import numpy as np
from collections import defaultdict
import math
import random
import pickle
import argparse
import copy

import sys
sys.path.append('../')
from utils import commons
from utils import vector_utils

print_every = 500000
print_status = True

In [2]:
def compute_idf(data, min_count):
    """
    IDF is used to weight the term vectors.
    """
    if print_status:
        print('Computing IDF')
    counts = defaultdict(float)
    for i in range(len(data)):
        if print_status and i % print_every == 0:
            print('Counting ' + str(i))
        line = set(data[i].split())
        for feature in line:
            counts[feature]+=1
    delete = [feature for feature in counts if counts[feature] <= min_count]
    for feature in delete:
        del counts[feature]
    for feature in counts:
        counts[feature]=math.sqrt(len(data)/counts[feature])
    return counts

In [3]:
def initialize_vectors(features, idf, dim, seeds):
    """
    This creates the initial random projection for each feature. You create initial
    vector with dimensionality dim. Dim should be in the range 500-1000. You then
    select n (n is determined by seeds) elements and set the value to 1 or -1
    randomly. This performs the random projection.
    """
    vectors = {}

    for i in range(len(features)):
        if print_status and i % print_every == 0:
            print('Initializing ' + str(i))
        feature = features[i]
        vector=np.zeros(dim)
        sample=random.sample(range(0,dim),seeds) # Grab the n random elements for random projection
        for index in sample:
            vector[index]=random.choice([-1.0,1.0]) # Set each element to +1 or -1 for random projection
        #vector=vector * idf[feature] # Weight based on IDF
        vectors[feature]=vector
    return vectors

In [33]:
def train_vectors(data, vectors):
    """
    For each feature in each line, add the feature to all other features. Conceptually,
    each co-occurance of two features moves the two features closer together.
    """
    trained_vectors=copy.deepcopy(vectors)
    for i in range(len(data)):
        if i == 1000000:
            print('breaking')
            break
        if print_status and i % print_every == 0:
            print('Processed ' + str(i))
        line = data[i].split()
        line=[feature for feature in line if feature in vectors]

        for feature_1 in line:
            for feature_2 in line:
                if feature_1 != feature_2:
                    trained_vectors[feature_1]+=vectors[feature_2] # This is it for the training! Simple addition.
    for feature in trained_vectors:
        trained_vectors[feature] = vector_utils.normalize_vector(trained_vectors[feature])
    return trained_vectors

In [5]:
from config_files.ri_config import  config
in_file = config['in_file']
out_file = config['out_dir']

In [6]:
min_count=10

In [7]:
data=commons.get_data(in_file)

In [12]:
idf=compute_idf(data, min_count)

Computing IDF
Counting 0
Counting 100000
Counting 200000
Counting 300000
Counting 400000
Counting 500000
Counting 600000
Counting 700000
Counting 800000
Counting 900000
Counting 1000000
Counting 1100000
Counting 1200000
Counting 1300000
Counting 1400000
Counting 1500000
Counting 1600000
Counting 1700000
Counting 1800000
Counting 1900000
Counting 2000000
Counting 2100000
Counting 2200000
Counting 2300000
Counting 2400000
Counting 2500000
Counting 2600000
Counting 2700000
Counting 2800000
Counting 2900000
Counting 3000000
Counting 3100000
Counting 3200000
Counting 3300000
Counting 3400000
Counting 3500000
Counting 3600000
Counting 3700000
Counting 3800000
Counting 3900000
Counting 4000000
Counting 4100000
Counting 4200000
Counting 4300000
Counting 4400000
Counting 4500000
Counting 4600000
Counting 4700000
Counting 4800000
Counting 4900000
Counting 5000000
Counting 5100000
Counting 5200000
Counting 5300000
Counting 5400000
Counting 5500000
Counting 5600000
Counting 5700000
Counting 580000

In [14]:
print(idf['fish'])

18.208521695749212


In [15]:
dim=500
seeds=20

In [16]:
vectors = initialize_vectors(list(idf.keys()), idf, dim, seeds)

Initializing 0
Initializing 100000
Initializing 200000


In [34]:
vectors_trained = {}
for i in range(2):
    vectors_trained = train_vectors(data, vectors)

Processed 0
Processed 100000
Processed 200000
Processed 300000
Processed 400000
Processed 500000
Processed 600000
Processed 700000
Processed 800000
Processed 900000
breaking
Processed 0
Processed 100000
Processed 200000
Processed 300000
Processed 400000
Processed 500000
Processed 600000
Processed 700000
Processed 800000
Processed 900000
breaking


In [35]:
vectors_trained['the']

array([-3.02313305e-02, -1.19897765e-02, -1.31941017e-01,  4.73763786e-04,
       -1.41510148e-02, -9.21394563e-03,  7.25964712e-03, -2.35215387e-03,
        2.38621652e-02,  1.70660264e-02,  2.07623728e-02,  5.18631249e-03,
       -1.69212602e-02,  4.79852941e-03,  4.72399448e-03, -1.56535254e-02,
       -3.18767760e-02,  1.05546587e-02, -3.21544049e-03, -2.47320445e-04,
       -2.23870328e-02,  1.57695398e-02,  8.60036018e-03,  8.30634095e-03,
        8.07403736e-03,  1.12970779e-02,  7.80034584e-03, -3.95191701e-02,
       -5.17250430e-02, -1.68432457e-02, -2.96539137e-02,  1.21863693e-02,
        1.19618397e-01,  5.61026423e-03, -3.07113208e-03, -8.76636979e-03,
       -2.97755137e-03,  9.25716490e-03, -1.43763593e-02,  5.81244252e-03,
       -2.19939389e-02, -5.35937270e-04, -3.56980188e-03,  2.37494471e-02,
       -1.77524070e-02,  2.99211040e-03,  2.98497739e-02, -1.09322779e-02,
       -6.41110267e-03,  4.27907407e-02,  6.32319907e-03,  5.99456779e-03,
        1.41588895e-02,  

In [12]:
from scipy.spatial.distance import cosine

def cosine_similarity(v1, v2):
    if np.linalg.norm(v1) == 0 or np.linalg.norm(v2) == 0:
        return 0.
    return 1. - cosine(v1, v2)

In [67]:
res = {}
query = 'mice'
for term in vectors_trained:
    score = cosine_similarity(vectors_trained[query],vectors_trained[term])
    res[term]=score

In [68]:
import operator
sorted_res = sorted(res.items(), key=operator.itemgetter(1),reverse=True)


In [69]:
for i in range(50):
    print(sorted_res[i])

('mice', 1.0)
('rats', 0.985224819984608)
('marked', 0.9796734367511785)
('animals', 0.9780356496141382)
('dogs', 0.9765347160034336)
('tissues', 0.9759405797746441)
('rat', 0.9751669923815269)
('contrast', 0.9751006783861932)
('mouse', 0.9749031251094362)
('decrease', 0.9743469667159469)
('alterations', 0.9740631385764345)
('addition', 0.9739759311587995)
('resulted', 0.973092467483888)
('murine', 0.9727150343991262)
('changes', 0.9725157720759887)
('monkeys', 0.9724332741275352)
('cultures', 0.9724178739513326)
('impaired', 0.9716866063353543)
('summary', 0.9709196465304253)
('adult', 0.9706670443023335)
('levels', 0.970598518451225)
('reductions', 0.9705437122176466)
('cells', 0.9703180785831913)
('hepatocytes', 0.9702784046782994)
('pigs', 0.9699979832784323)
('livers', 0.9699361218786154)
('elevated', 0.9693629053046202)
('horses', 0.9688823905457323)
('normal', 0.9688734796617067)
('brains', 0.9686748086535459)
('vitro', 0.9686313736523582)
('situ', 0.9684188950519449)
('cats', 0

In [70]:
import pickle

def pickle_dict(dict, out_dir, file_name):
    if not out_dir.endswith('/'):
        out_dir+='/'
    with open(out_dir+file_name, 'wb') as out:
        pickle.dump(dict, out, protocol=pickle.HIGHEST_PROTOCOL)

In [72]:
pickle_dict(vectors_trained,'./','first_pass')

In [2]:
window_size=10

In [1]:
sent = 'LDH was induced by implantation of autologous nucleus pulposus (NP), harvest from the tail, in lumbar 4/5 spinal nerve roots of rat. Von Frey filaments and radiant heat tests were performed to determine mechanical and thermal pain threshold respectively. Basso, Beattie, and Bresnahan (BBB) scale was assessed to test the locomotor function. The protein level of p-SFKs, t-SFKs, p-p38, t-p38 in spinal cord was examined by western blotting analysis. Cellular location of p-p38 was determined by immunochemistry staining. Spinal TNF-α, IL-1β and IL-6 levels were detected by ELISA.'

In [25]:
def create_context_training(line, widow_size):
    contexts = []
    line = line.split()
    for i in range(len(line)):
        target = line[i]
        start = 0
        end = len(line)
        if i-window_size > 0:
            start=i-window_size
        if i+window_size+1<len(line):
            end=i+window_size+1
        before = line[start:i]
        after = line[i+1:end]
        print(target)
        print(before)
        print(after)
        print('\n\n')
        contexts.append([target,before + after])
    return contexts

In [26]:
create_context_training(sent,window_size)

LDH
[]
['was', 'induced', 'by', 'implantation', 'of', 'autologous', 'nucleus', 'pulposus', '(NP),', 'harvest']



was
['LDH']
['induced', 'by', 'implantation', 'of', 'autologous', 'nucleus', 'pulposus', '(NP),', 'harvest', 'from']



induced
['LDH', 'was']
['by', 'implantation', 'of', 'autologous', 'nucleus', 'pulposus', '(NP),', 'harvest', 'from', 'the']



by
['LDH', 'was', 'induced']
['implantation', 'of', 'autologous', 'nucleus', 'pulposus', '(NP),', 'harvest', 'from', 'the', 'tail,']



implantation
['LDH', 'was', 'induced', 'by']
['of', 'autologous', 'nucleus', 'pulposus', '(NP),', 'harvest', 'from', 'the', 'tail,', 'in']



of
['LDH', 'was', 'induced', 'by', 'implantation']
['autologous', 'nucleus', 'pulposus', '(NP),', 'harvest', 'from', 'the', 'tail,', 'in', 'lumbar']



autologous
['LDH', 'was', 'induced', 'by', 'implantation', 'of']
['nucleus', 'pulposus', '(NP),', 'harvest', 'from', 'the', 'tail,', 'in', 'lumbar', '4/5']



nucleus
['LDH', 'was', 'induced', 'by', 'implantat

[['LDH',
  ['was',
   'induced',
   'by',
   'implantation',
   'of',
   'autologous',
   'nucleus',
   'pulposus',
   '(NP),',
   'harvest']],
 ['was',
  ['LDH',
   'induced',
   'by',
   'implantation',
   'of',
   'autologous',
   'nucleus',
   'pulposus',
   '(NP),',
   'harvest',
   'from']],
 ['induced',
  ['LDH',
   'was',
   'by',
   'implantation',
   'of',
   'autologous',
   'nucleus',
   'pulposus',
   '(NP),',
   'harvest',
   'from',
   'the']],
 ['by',
  ['LDH',
   'was',
   'induced',
   'implantation',
   'of',
   'autologous',
   'nucleus',
   'pulposus',
   '(NP),',
   'harvest',
   'from',
   'the',
   'tail,']],
 ['implantation',
  ['LDH',
   'was',
   'induced',
   'by',
   'of',
   'autologous',
   'nucleus',
   'pulposus',
   '(NP),',
   'harvest',
   'from',
   'the',
   'tail,',
   'in']],
 ['of',
  ['LDH',
   'was',
   'induced',
   'by',
   'implantation',
   'autologous',
   'nucleus',
   'pulposus',
   '(NP),',
   'harvest',
   'from',
   'the',
   'tail,'

In [29]:
import sys
sys.path.append('../')

from utils import commons

files = commons.get_files_in_dir('/Users/joshuacgoodwin/Documents/github_projects/data/docs_tkns_json/')
print(files)

['18', '9', '0', '11', '7', '16', '6', '17', '1', '10', '19', '8', '4', '15', '3', '12', '2', '13', '5', '14']


In [37]:
plist = []

import json

for file in files:
    print(file)
    print(len(plist))
    data = commons.get_data('/Users/joshuacgoodwin/Documents/github_projects/data/docs_tkns_json/'+file)
    for doc in data:
        doc=json.loads(doc)
        tkns=''
        sent_tkns = doc['sent_tkns']
        title_tkns = doc['title_tkns']
                
        if isinstance(sent_tkns,list):
            tkns = ' '.join(sent_tkns)
            plist.append(title_tkns + ' ' + tkns)
        elif isinstance(sent_tkns,str):
            plist.append(title_tkns + ' ' + sent_tkns)
            
#         if not isinstance(sent_tkns,list):
#             print('here')
            #plist.append(title_tkns + ' ' + sent_tkns)

            
        
#         tkns = ' '.join
        
#         #sent_tkns
        
#     print(data[0])
#     break

18
0
9
198469
0
396912
11
595395
7
793797
16
992290
6
1190721
17
1389147
1
1587578
10
1785993
19
1984428
8
2182872
4
2381299
15
2579664
3
2778028
12
2976449
2
3174891
13
3373341
5
3571815
14
3770304


In [32]:
if isinstance('', list):
    print('yes')

In [40]:
import codecs

def print_list(plist,out,mode='w'):
    if len(plist) < 10000000:
        f_out = codecs.open(out, mode, encoding='utf-8')
        f_out.write('\n'.join(plist))
        f_out.close()
    else:
        f_out = codecs.open(out, mode, encoding='utf-8') # Nuke the file if needed
        f_out.close()
        f_out = codecs.open(out, 'a', encoding='utf-8')
        plist = list_splitter(plist,100)
        for el in plist:
            pstring = '\n'.join(el) + '\n'
            f_out.write(pstring)
        f_out.close()

In [41]:
print_list(plist,'/Users/joshuacgoodwin/Documents/github_projects/data/sentences.txt')

In [2]:
import sys
sys.path.append('../')
from utils import commons
from data.config_files import ri_config as config


In [3]:
print(config.config)

{'in_file': 'F:\\github_projects\\data\\embeddings\\medline_sentences\\sentences_shuf.txt', 'out_dir': 'F:\\github_projects\\data\\embeddings\\medline_sentences\\models\\', 'file_name': 'ri_index', 'seeds': 20, 'dim': 500, 'min_count': 25, 'print_status': True, 'print_every': 500000, 'window_size': None, 'sample': 500000}


In [None]:
config

In [3]:
import pickle

In [9]:
#vectors_trained = pickle.loads('/Users/joshuacgoodwin/Documents/github_projects/data/models/ri_index')

with open('/Users/joshuacgoodwin/Documents/github_projects/data/models/ri_index', "rb") as input_file:
    vectors_trained= pickle.load(input_file)

In [11]:
print(len(vectors_trained))

27202


In [50]:
res = {}
query = 'neuron'
for term in vectors_trained:
    score = cosine_similarity(vectors_trained[query],vectors_trained[term])
    res[term]=score

In [51]:
import operator
sorted_res = sorted(res.items(), key=operator.itemgetter(1),reverse=True)


In [52]:
for i in range(50):
    print(sorted_res[i])

('neuron', 1.0)
('nucleus', 0.9989356031978658)
('neuronal', 0.998606313615176)
('neurons', 0.9985424754574547)
('modulation', 0.9984715652361867)
('modulatory', 0.9984223586711597)
('underlies', 0.9983697996924052)
('brains', 0.9982431104869345)
('neocortex', 0.9981930430302333)
('axonal', 0.9980418256117097)
('plasticity', 0.9979864713718714)
('nuclei', 0.9979784751885309)
('pns', 0.9979621067343971)
('dissociated', 0.9979361488980795)
('motoneuron', 0.9979341388466362)
('disrupted', 0.9978706821470995)
('soma', 0.9978403424122316)
('synaptogenesis', 0.9977419326969141)
('bulb', 0.9977240333373057)
('neurotoxic', 0.9975949560327732)
('neurogenesis', 0.9975749104663344)
('purkinje', 0.9975010907238585)
('neurotransmitter', 0.9974548859861846)
('neurofilament', 0.9974529020385536)
('disruption', 0.9973969268332917)
('selectively', 0.9973863341917518)
('motoneurons', 0.9973673660104214)
('doi', 0.9973446456458005)
('myelin', 0.9972976473164312)
('ganglion', 0.997287733507636)
('immunore

In [9]:
import numpy as np

test = {}

for i in range(10):
    test[str(i)]=np.zeros(25)

In [10]:
def tester_1(vecs):
    for name in vecs:
        vec = vecs[name]
        vec[0]=1.0
        

In [11]:
tester_1(test)

In [12]:
for el in test:
    print(test[el])

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
