- http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/#.W6HVCHWWZgo
- http://kavita-ganesan.com/how-to-incorporate-phrases-into-word2vec-a-text-mining-approach/#.W6HaQHWWZgp

https://medium.com/the-official-integrate-ai-blog/what-you-need-to-know-about-natural-language-processing-2c8240e6c38e

bag-of-words : ignore the fact that language is sequential and just treat the words in a document as a collection without any specific order.
  It throws away all the complex interactions and dependencies that occur between words, it also had some undeniable benefits.
  It is effective if you want to classify documents or cluster them based on the distribution of words.

word2vec : run a lot of text through a shallow neural network(one hidden layer) and learn to predict the other words nearby.
  Then take the weights from the hidden layer, dense vectors for representing each word -> similar words end up with comparable vector representations

attention : models not only had the ability to remember important information from one time step to another, but they could not take the whole output of the RNN and only focus on those parts that were most relevant to the task at hand.
  Attention accomplishes this by creating a probability distribution over the words, directing the model to look at the words that are most salient.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
#retrieve table titles from csv
data = pd.read_csv('titles.csv', delimiter='\t', error_bad_lines=True, header=None)
data.columns = ['id', 'title']
data.title = data.title.str.strip()
documents = data
documents['title'].replace('', np.nan, inplace=True)
documents = documents.astype(str)
print('checking if text is missing')
print(documents.isna().any())
# documents.dropna(subset=['title'], inplace=True)
documents.shape

checking if text is missing
id       False
title    False
dtype: bool


(45352, 2)

In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import strip_numeric
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [4]:
stemmer = SnowballStemmer('english')
STOP_WORDS = list(gensim.parsing.preprocessing.STOPWORDS)
STOP_WORDS.extend(['table', 'legend'])

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
#     deacc=True removes punctuations
    for token in gensim.utils.simple_preprocess(text, deacc=True):
        if token not in STOP_WORDS and len(token)>1:
#             result.append(lemmatize_stemming(strip_numeric(token)))
            result.append(lemmatize_stemming(token))
    return result

In [7]:
#process the text, save the results as processed_docs
processed_docs = pd.DataFrame()
processed_docs = pd.concat([documents.id, documents.title.map(preprocess)], axis=1)
processed_docs[:5]

In [10]:
sentences = [list(s) for s in processed_docs.title]

In [39]:
%%time
model = gensim.models.Word2Vec(sentences, size=150, window=10, min_count=10, workers=10)

CPU times: user 4.31 s, sys: 28 ms, total: 4.34 s
Wall time: 1.18 s


In [40]:
#remove unnecessary memory
model.init_sims(replace=True)

In [49]:
from pprint import pprint
# model.wv.similarity('adver', 'event')
print('===== drug =====')
pprint(sorted(model.wv.most_similar("drug"), key=lambda x: x[1], reverse=True))
print('===== advers =====')
pprint(sorted(model.wv.most_similar("advers"), key=lambda x: x[1], reverse=True))
print('===== outcom =====')
pprint(sorted(model.wv.most_similar("outcom"), key=lambda x: x[1], reverse=True))

===== drug =====
[('discontinu', 0.8875111937522888),
 ('reason', 0.8092610239982605),
 ('reaction', 0.7970107197761536),
 ('line', 0.7958920001983643),
 ('emerg', 0.7885339856147766),
 ('receiv', 0.7883462905883789),
 ('take', 0.7868720293045044),
 ('chemotherapi', 0.7824380993843079),
 ('therapi', 0.7770647406578064),
 ('toxic', 0.7706995606422424)]
===== advers =====
[('occur', 0.9315366744995117),
 ('emerg', 0.9285376667976379),
 ('prefer', 0.8685345649719238),
 ('terminolog', 0.8670566082000732),
 ('special', 0.8470296859741211),
 ('possibl', 0.8344466686248779),
 ('common', 0.8139076828956604),
 ('discontinu', 0.8113181591033936),
 ('bevacizumab', 0.8110620975494385),
 ('safeti', 0.8056946992874146)]
===== outcom =====
[('endpoint', 0.8128015398979187),
 ('secondari', 0.7591896057128906),
 ('tertiari', 0.7004872560501099),
 ('primari', 0.7003695368766785),
 ('readmiss', 0.6654028296470642),
 ('prespecifi', 0.6596863269805908),
 ('month', 0.6581047177314758),
 ('exploratori', 0.64

In [44]:
# most_similar 메서드는 positive 인수와 negative 인수를 사용하여 다음과 같은 단어간 관계도 찾을 수 있다.
model.wv.most_similar(positive=['advers', 'event'], negative=['occur'], topn=1)

[('major', 0.8935413360595703)]

### phase2ve

In [50]:
#min_count = ignore all words and bigrams with total collected count lower than this value
#threshold = represent a score threshold for forming the phrases(higher means fewer phrases)
bigram = gensim.models.Phrases(processed_docs, min_count=1, threshold=1)
trigram = gensim.models.Phrases(bigram[processed_docs], threshold=1)

#sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [51]:
import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])

def make_bigram(text):
    processed_text = [[word for word in simple_preprocess(str(doc)) if word not in STOP_WORDS] for doc in text]
    return [bigram_mod[doc] for doc in processed_text]

def make_trigram(text):
    processed_text = [[word for word in simple_preprocess(str(doc)) if word not in STOP_WORDS] for doc in text]
    return trigram_mod[[bigram_mod[doc] for doc in processed_text]]

def explain_make_trigram(text):
    conversion = {}
    for doc in text:
        pro_doc = simple_preprocess(str(doc))
        if doc!= pro_doc:
            print(doc)
            print(pro_doc)
            conversion[doc]=pro_doc
    return conversion

def n_gram_lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    text_out = []
    for sent in text:
        doc = nlp(' '.join(sent))
        text_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return text_out

def explain_n_gram_lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    conversions = {}
    for sent in text:
        doc = nlp(' '.join(sent))
        for token in doc:
            if len(token.lemma_) <4 and str(token)!=str(token.lemma_):
                conversions[token] = token.lemma_
#                 print('%s : %s' (token, token.lemma_))
    return conversions

In [52]:
processed_bigram = n_gram_lemmatization(make_bigram(processed_docs.title))
processed_trigram = n_gram_lemmatization(make_trigram(processed_docs.title))
print(str(len(processed_bigram)))
print(str(len(processed_trigram)))
processed_docs['bigram'] = processed_bigram
processed_docs['trigram'] = processed_trigram
processed_docs.head(10)

45352
45352


Unnamed: 0,id,title,bigram,trigram
0,4105,"[baselin, characterist]","[baselin, characterist]","[baselin, characterist]"
1,4106,"[analysi, efficaci]","[analysi, efficaci]","[analysi, efficaci]"
2,4107,"[comparison, postop, carbohydr, antigen, level...","[comparison, postop, carbohydr, antigen, level...","[comparison, postop, carbohydr, antigen, level..."
3,4108,"[pattern, diseas, relaps]","[pattern, disea, relap]","[pattern, disea, relap]"
4,4109,"[grade, advers, event, gemcitabin, gemcitabin,...","[grade, adver, event, gemcitabin, gemcitabin, ...","[grade, adver, event, gemcitabin, gemcitabin, ..."
5,4111,"[baselin, characterist]","[baselin, characterist]","[baselin, characterist]"
6,4112,"[treatment, zoledron, acid]","[treatment, zoledron, acid]","[treatment, zoledron, acid]"
7,4113,"[treatment, docetaxel]","[treatment, docetaxel]","[treatment, docetaxel]"
8,4114,"[treatment, relaps, discret, treat, clinician]","[treatment, relap, discret, treat, clinician]","[treatment, relap, discret, treat, clinician]"
9,4115,"[worst, advers, event, grade, report, entir, t...","[bad, adver, event, grade, report, entir, time...","[bad, adver, event, grade, report, entir, time..."


In [53]:
bi_sentences = [list(s) for s in processed_bigram]
tri_sentences = [list(s) for s in processed_trigram]

In [54]:
%%time
bi_model = gensim.models.Word2Vec(bi_sentences, size=150, window=10, min_count=10, workers=10)
tri_model = gensim.models.Word2Vec(tri_sentences, size=150, window=10, min_count=10, workers=10)

bi_model.init_sims(replace=True)
tri_model.init_sims(replace=True)

CPU times: user 8.15 s, sys: 48 ms, total: 8.2 s
Wall time: 2.43 s


In [62]:
print(str(bi_model.corpus_count))
bi_sentences[:100]

45352


[['baselin', 'characterist'],
 ['analysi', 'efficaci'],
 ['comparison',
  'postop',
  'carbohydr',
  'antigen',
  'level',
  'surviv',
  'espac',
  'conoko',
  'jaspac',
  'trial'],
 ['pattern', 'disea', 'relap'],
 ['grade', 'adver', 'event', 'gemcitabin', 'gemcitabin', 'capecitabin'],
 ['baselin', 'characterist'],
 ['treatment', 'zoledron', 'acid'],
 ['treatment', 'docetaxel'],
 ['treatment', 'relap', 'discret', 'treat', 'clinician'],
 ['bad', 'adver', 'event', 'grade', 'report', 'entir', 'time', 'trial'],
 ['baselin', 'characterist'],
 ['chemotherapi', 'deliveri', 'trial', 'drug', 'discontinu'],
 ['adver', 'event'],
 ['effect',
  'adjust',
  'person',
  'characterist',
  'indic',
  'health',
  'associ',
  'caus',
  'mortal',
  'woman',
  'report',
  'happi'],
 ['characterist', 'studi', 'cohort'],
 ['screen',
  'effect',
  'select',
  'univer',
  'screen',
  'infant',
  'small',
  'sever',
  'small',
  'gestat',
  'age'],
 ['diagnost',
  'effect',
  'select',
  'univer',
  'screen',
 

In [65]:
from pprint import pprint
# model.wv.similarity('adver', 'event')
print('===== drug =====')
pprint(sorted(bi_model.wv.most_similar("drug"), key=lambda x: x[1], reverse=True))
print('===== advers =====')
pprint(sorted(bi_model.wv.most_similar("adver"), key=lambda x: x[1], reverse=True))
print('===== outcom =====')
pprint(sorted(bi_model.wv.most_similar("outcom"), key=lambda x: x[1], reverse=True))

===== drug =====
[('discontinu', 0.9070476293563843),
 ('reason', 0.8343720436096191),
 ('emerg', 0.7960875630378723),
 ('chemotherapi', 0.795301616191864),
 ('start', 0.7870227694511414),
 ('toxic', 0.7796581387519836),
 ('receiv', 0.7774386405944824),
 ('reaction', 0.7565493583679199),
 ('regimen', 0.7543553113937378),
 ('aspirin', 0.7504972815513611)]
===== advers =====
[('occur', 0.9387513995170593),
 ('emerg', 0.9147385358810425),
 ('prefer', 0.8795109987258911),
 ('terminolog', 0.857047975063324),
 ('possibl', 0.8406365513801575),
 ('irrespect', 0.8279935717582703),
 ('common', 0.8276834487915039),
 ('bevacizumab', 0.8269638419151306),
 ('special', 0.8195157647132874),
 ('thrombot', 0.8128104209899902)]
===== outcom =====
[('endpoint', 0.8096795082092285),
 ('secondari', 0.6966773271560669),
 ('month', 0.6419578194618225),
 ('thirti', 0.6384162306785583),
 ('specifi', 0.6313784718513489),
 ('tertiari', 0.6309155225753784),
 ('predefin', 0.6207742094993591),
 ('landmark', 0.619747