In [None]:
import gensim
from gensim.models import Word2Vec
import sklearn
from sklearn.decomposition import PCA as PCA
import matplotlib.pyplot as plt
import time
import re
from gensim.models.phrases import Phrases,Phraser
from spacy.lang.en.stop_words import STOP_WORDS
import numpy as np 
from gensim.utils import simple_preprocess as pre_process
from nltk import word_tokenize 
from nltk.util import ngrams
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
import fse
from fse.models import Average
from fse import IndexedList
from gensim.models import FastText
from collections import Counter
from fse.models import SIF
from fse.models import uSIF

In [52]:
# Fetch the 'punkt' resource through nltk's downloader; the call's return
# value is the cell output.
import nltk

punkt_resource = 'punkt'
nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /Users/harshitg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
### Reading the combined text file as a list of queries ####
# One list element per line of the file; each element keeps its trailing "\n".
# The context manager closes the file handle as soon as the lines are read,
# rather than leaving it open until the object happens to be garbage collected.
# NOTE(review): the path is an absolute, machine-specific location — consider
# moving it to a configurable data directory.
with open("/Users/harshitg/github/autocomplete/autocomplete/everything_combined.txt") as corpus_file:
    queries = corpus_file.readlines()

In [3]:
# Tally how many times each entry of `queries` occurs.
counter = Counter()
counter.update(queries)

In [6]:
# Position of the first entry in `queries` equal to a known query line
# (the raw lines still carry their trailing "\n").
known_query = "liver treated with cytoxan adriamycin taxol and herceptin\n"
queries.index(known_query)

313285

In [19]:
# Peek at the query stored at position 6 of `queries`.
probe_position = 6
queries[probe_position]

'the sigmoid colon is extensively encased .\n'

In [12]:
#### Splitting each query into a list of strings (a form that can be fed into a word embedding algorithm) ###
# The list is updated in place (slice assignment), so `queries` remains the same
# list object. Entries that are not raw strings are passed through unchanged,
# so re-running this cell does not hand already-tokenized lists to
# `word_tokenize` a second time.
queries[:] = [
    word_tokenize(entry) if isinstance(entry, str) else entry
    for entry in queries
]

In [13]:
#### Learning a Word2Vec vector representation for the words of the tokenized queries ###
# NOTE: despite the `ft` prefix, `ft2` holds the Word2Vec model.
ft2 = Word2Vec(
    sentences=queries,
    window=5,
    min_count=1,
)

In [6]:
#### Learning a FastText vector representation for the words of the tokenized queries ###
# Same window / min_count settings as the Word2Vec model.
ft = FastText(
    sentences=queries,
    window=5,
    min_count=1,
)

In [25]:
### SIF sentence-embedding model built on top of the FastText word embeddings ####
model1 = SIF(
    model=ft,
    workers=2,
)

In [22]:
### SIF sentence-embedding model built on top of the Word2Vec word embeddings ####
model2 = SIF(
    model=ft2,
    workers=2,
)

In [17]:
#### Wrapping the tokenized queries in an fse IndexedList ####
# `index_list.items` is what the similarity look-ups below pass as `indexable`.
tokenized_queries = queries
index_list = IndexedList(tokenized_queries)

In [26]:
#### Fitting the FastText-backed SIF model on the indexed queries ####
# The value returned by train() is the cell output.
sif_fasttext_corpus = IndexedList(queries)
model1.train(sif_fasttext_corpus)

(419081, 4110171)

In [27]:
#### Fitting the Word2Vec-backed SIF model on the indexed queries ####
# The value returned by train() is the cell output.
sif_word2vec_corpus = IndexedList(queries)
model2.train(sif_word2vec_corpus)

(419081, 4110171)

In [46]:
#### Most similar sentences to query 6 according to the FastText-backed SIF model ###
# Only the first five entries of the result are displayed.
sif_fasttext_neighbours = model1.sv.most_similar(6, indexable=index_list.items)
sif_fasttext_neighbours[:5]

[(['the', 'sigmoid', 'colon', 'is', 'the'], 7837, 0.9927172660827637),
 (['the', 'colon', 'is', 'apparently'], 70216, 0.9447572827339172),
 (['the', 'colon', 'is'], 42505, 0.941917896270752),
 (['colon',
   ',',
   'and',
   'is',
   'contiguous',
   'with',
   'the',
   'uterus',
   'and',
   'adnexa',
   '.'],
  5,
  0.9235566854476929),
 (['the', 'colon', 'is', 'markedly'], 72403, 0.9215347766876221)]

In [47]:
#### Most similar sentences to query 6 according to the Word2Vec-backed SIF model ###
# Only the first five entries of the result are displayed.
sif_word2vec_neighbours = model2.sv.most_similar(6, indexable=index_list.items)
sif_word2vec_neighbours[:5]

[(['the', 'sigmoid', 'colon', 'is', 'the'], 7837, 0.9980358481407166),
 (['sigmoid', 'colon', 'with', 'diverticuli', '.'], 12033, 0.9860890507698059),
 (['sigmoid', 'colon', 'and'], 8730, 0.985425591468811),
 (['sigmoid', 'colon', 'and'], 31667, 0.985425591468811),
 (['sigmoid', 'colon', 'and'], 76409, 0.985425591468811)]

In [98]:
#### Persisting the sentence vectors of the chosen model (Word2Vec-backed SIF) ####
# Called on the sentence-vector object itself rather than through the class.
model2.sv.save("/Users/harshitg/github/autocomplete/autocomplete/sentence2vec_everything_combined_sif.model")

In [14]:
### Sentence embeddings via fse's `Average` model on the Word2Vec word vectors ####
# NOTE(review): the original note called this the "Deep Averaging Network"
# methodology — confirm that description against the fse documentation.
model3 = Average(model=ft2)

In [15]:
#### Fitting the Average model on the indexed queries ####
# The value returned by train() is the cell output.
average_corpus = IndexedList(queries)
model3.train(average_corpus)

(419081, 4110171)

In [18]:
#### Most similar sentences to query 6 according to the Average model ###
# Only the first five entries of the result are displayed.
average_neighbours = model3.sv.most_similar(6, indexable=index_list.items)
average_neighbours[:5]

[(['the', 'sigmoid', 'colon', 'is', 'the'], 7837, 0.9610152244567871),
 (['the', 'colon', 'is'], 42505, 0.9583863019943237),
 (['the', 'colon', 'is', 'markedly'], 72403, 0.9570503830909729),
 (['the', 'colon', 'is', 'apparently'], 70216, 0.9565603733062744),
 (['the', 'bladder', 'is', 'collapsed', '.'], 63191, 0.95305997133255)]

In [103]:
#### Persisting the sentence vectors of the Average model ####
# Called on the sentence-vector object itself rather than through the class.
model3.sv.save("/Users/harshitg/github/autocomplete/autocomplete/sentence2vec_everything_combined_average.model")

In [19]:
#### uSIF sentence-embedding model built on top of the Word2Vec word embeddings ####
model4_word2vec = uSIF(
    model=ft2,
    workers=2,
)

In [20]:
#### Fitting the uSIF model on the indexed queries ####
# The value returned by train() is the cell output.
usif_corpus = IndexedList(queries)
model4_word2vec.train(usif_corpus)

(419081, 4110171)

In [22]:
#### Most similar sentences to query 6 according to the Word2Vec-backed uSIF model ###
# Only the first five entries of the result are displayed.
usif_neighbours = model4_word2vec.sv.most_similar(6, indexable=index_list.items)
usif_neighbours[:5]

[(['the', 'sigmoid', 'colon', 'is', 'the'], 7837, 0.9983557462692261),
 (['sigmoid', 'colon', 'with', 'diverticuli', '.'], 12033, 0.98822021484375),
 (['sigmoid', 'colon', 'and'], 31667, 0.9872925281524658),
 (['sigmoid', 'colon', 'and'], 8730, 0.9872925281524658),
 (['sigmoid', 'colon', 'and'], 76409, 0.9872925281524658)]

In [53]:
#### Saving the sentence vectors of the Word2Vec-backed uSIF model ####
# The uSIF model trained above is bound to `model4_word2vec`. This cell used to
# reference `model3_word2vec`, a name that is not defined in any of the cells
# above, so it raised NameError when the notebook was run top to bottom.
model4_word2vec.sv.save("/Users/harshitg/github/autocomplete/autocomplete/sentence2vec_everything_combined_word2vec_usif.model")