In [49]:
import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [63]:
# constants 
PATH = './abstracts.txt'
NUMBER_OF_ABSTRACTS = 20000
FIND_SIMILAR_TO = "To investigate the myelinotoxicity of cerebropsinal fluid in multiple sclerosis, we used an in vivo model of the myelinated central-nervous-system tract of tadpoles for quantitative double-blind tests of 46 cerebrospinal-fluid samples. Groups of xenopus tadpoles were injected with cerebrospinal fluid near the optic nerve. Forty-eight hours later, whole mounts of optic nerves were prepared, and a differential interference microscope was used to count myelin lesions. Cerebrospinal-fluid samples from 60 per cent of the patients with an acute attack of definite multiple sclerosis had myelinotoxic activity. This activity correlated best with the severity and duration of the disease, rather than with gamma-globulin or total protein concentrations. Activity was negative in 85 per cent of cerebrospinal-fluid samples from a control group with other neurologic diseases. This assay is a useful method for investigating myelinotoxic factors of cerebrospinal fluid in patients with multiple sclerosis, but was not helpful diagnostically."

In [52]:
# get abstracts
data = !head -{NUMBER_OF_ABSTRACTS*3} {PATH}
raw_documents = list(filter(lambda x: x != '', data))
print(f'Number of abstracts: {len(raw_documents)}')
print('\n')
print(f'Example abstract:\n{raw_documents[1]}')

Number of abstracts: 20000


Example abstract:
To investigate the myelinotoxicity of cerebropsinal fluid in multiple sclerosis, we used an in vivo model of the myelinated central-nervous-system tract of tadpoles for quantitative double-blind tests of 46 cerebrospinal-fluid samples. Groups of xenopus tadpoles were injected with cerebrospinal fluid near the optic nerve. Forty-eight hours later, whole mounts of optic nerves were prepared, and a differential interference microscope was used to count myelin lesions. Cerebrospinal-fluid samples from 60 per cent of the patients with an acute attack of definite multiple sclerosis had myelinotoxic activity. This activity correlated best with the severity and duration of the disease, rather than with gamma-globulin or total protein concentrations. Activity was negative in 85 per cent of cerebrospinal-fluid samples from a control group with other neurologic diseases. This assay is a useful method for investigating myelinotoxic factors of cerebros

In [53]:
# stop words include english dic and punctation
stop = stopwords.words('english') + list(string.punctuation) + list("a")

In [54]:
gen_docs = []
for text in raw_documents:
    gen_docs.append([w for w in word_tokenize(text.lower()) if w not in stop])

In [64]:
dictionary = gensim.corpora.Dictionary(gen_docs)
print("Number of words in dictionary:",len(dictionary))
print("\n")
print("Most common words:")
for i in range(20):
    print(i, dictionary[i])

Number of words in dictionary: 89671


Most common words:
0 allow
1 chamber
2 cleared
3 constructed
4 conveys
5 cytological
6 cytology
7 electric
8 error
9 exfoliative
10 filter
11 filters
12 heart
13 judgement
14 light-microscopical
15 material
16 mechanism
17 new
18 noticed
19 papanicolaou


In [56]:
# Tutorial: https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python
# convert to bag of words, 
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

In [57]:
# create Tfid model
tf_idf = gensim.models.TfidfModel(corpus)

In [58]:
# compute similarities
sims = gensim.similarities.Similarity(None,tf_idf[corpus], num_features=len(dictionary))

In [59]:
# tokenize the query
query_doc = [w for w in word_tokenize(FIND_SIMILAR_TO.lower()) if w not in stop]

# convert query to bag of words
query_doc_bow = dictionary.doc2bow(query_doc)

# convert query to tfid
query_doc_tf_idf = tf_idf[query_doc_bow]

# view the terms with highest tfid value
terms = [(x, y) for (x, y) in zip(query_doc,query_doc_tf_idf)]
sorted(terms, key=lambda x: -x[1][1])

[('tadpoles', (42, 0.45383976103701629)),
 ('60', (75, 0.30255984069134417)),
 ('rather', (92, 0.28924030486958785)),
 ('duration', (90, 0.24592340927681602)),
 ('lesions', (72, 0.1862006732067478)),
 ('near', (55, 0.17505454772708953)),
 ('severity', (89, 0.17496862786927542)),
 ('multiple', (82, 0.1737229486877136)),
 ('tract', (41, 0.16645779319330634)),
 ('central-nervous-system', (40, 0.16266486878709835)),
 ('per', (76, 0.16266486878709835)),
 ('model', (38, 0.13964537013266173)),
 ('myelinated', (39, 0.12484488870068033)),
 ('samples', (48, 0.12053555338545606)),
 ('myelin', (71, 0.12053555338545606)),
 ('optic', (56, 0.11931832930126197)),
 ('per', (100, 0.10979480549711486)),
 ('definite', (81, 0.10680017234837591)),
 ('prepared', (65, 0.10088712432528638)),
 ('samples', (74, 0.10088712432528638)),
 ('myelinotoxic', (84, 0.096932481215636038)),
 ('later', (60, 0.09461384503807943)),
 ('used', (36, 0.088938977555125462)),
 ('optic', (63, 0.0887554559960715)),
 ('46', (46, 0.086

In [60]:
values = list(sims[query_doc_tf_idf])

for sim in sorted(sims[query_doc_tf_idf])[::-1][:5]:
    print(sim)
    print(raw_documents[values.index(sim)])
    print("\n")

1.0
To investigate the myelinotoxicity of cerebropsinal fluid in multiple sclerosis, we used an in vivo model of the myelinated central-nervous-system tract of tadpoles for quantitative double-blind tests of 46 cerebrospinal-fluid samples. Groups of xenopus tadpoles were injected with cerebrospinal fluid near the optic nerve. Forty-eight hours later, whole mounts of optic nerves were prepared, and a differential interference microscope was used to count myelin lesions. Cerebrospinal-fluid samples from 60 per cent of the patients with an acute attack of definite multiple sclerosis had myelinotoxic activity. This activity correlated best with the severity and duration of the disease, rather than with gamma-globulin or total protein concentrations. Activity was negative in 85 per cent of cerebrospinal-fluid samples from a control group with other neurologic diseases. This assay is a useful method for investigating myelinotoxic factors of cerebrospinal fluid in patients with multiple scler