### Evaluate testing dataset by cosine similarity 
- between corresponding parts (higher the better)
- between 10,000 random parts (lower the better)

- https://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html
- http://www.creativmark.com/blog/similarity-measure-of-textual-documents_p12.html

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
#retrieve table titles from csv
data = pd.read_csv('titles.csv', delimiter='\t', error_bad_lines=True, header=None)
data.columns = ['id', 'title']
data.title = data.title.str.strip()
documents = data
documents['title'].replace('', np.nan, inplace=True)
documents = documents.astype(str)
print('checking if text is missing')
print(documents.isna().any())
# documents.dropna(subset=['title'], inplace=True)
documents.shape

checking if text is missing
id       False
title    False
dtype: bool


(45352, 2)

In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import strip_numeric
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [4]:
stemmer = SnowballStemmer('english')
STOP_WORDS = list(gensim.parsing.preprocessing.STOPWORDS)
STOP_WORDS.extend(['table', 'legend'])

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
#     deacc=True removes punctuations
    for token in gensim.utils.simple_preprocess(text, deacc=True):
        if token not in STOP_WORDS and len(token)>1:
#             result.append(lemmatize_stemming(strip_numeric(token)))
            result.append(lemmatize_stemming(token))
    return result

In [5]:
#process the text, save the results as processed_docs
processed_docs = pd.DataFrame()
processed_docs = pd.concat([documents.id, documents.title.map(preprocess)], axis=1)
processed_docs[:5]

Unnamed: 0,id,title
0,4105,"[baselin, characterist]"
1,4106,"[analysi, efficaci]"
2,4107,"[comparison, postop, carbohydr, antigen, level..."
3,4108,"[pattern, diseas, relaps]"
4,4109,"[grade, advers, event, gemcitabin, gemcitabin,..."


In [6]:
#min_count = ignore all words and bigrams with total collected count lower than this value
#threshold = represent a score threshold for forming the phrases(higher means fewer phrases)
bigram = gensim.models.Phrases(processed_docs, min_count=1, threshold=1)
trigram = gensim.models.Phrases(bigram[processed_docs], threshold=1)

#sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])

def make_bigram(text):
    processed_text = [[word for word in simple_preprocess(str(doc)) if word not in STOP_WORDS] for doc in text]
    return [bigram_mod[doc] for doc in processed_text]

def make_trigram(text):
    processed_text = [[word for word in simple_preprocess(str(doc)) if word not in STOP_WORDS] for doc in text]
    return trigram_mod[[bigram_mod[doc] for doc in processed_text]]

def explain_make_trigram(text):
    conversion = {}
    for doc in text:
        pro_doc = simple_preprocess(str(doc))
        if doc!= pro_doc:
            print(doc)
            print(pro_doc)
            conversion[doc]=pro_doc
    return conversion

def n_gram_lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    text_out = []
    for sent in text:
        doc = nlp(' '.join(sent))
        text_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return text_out

def explain_n_gram_lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    conversions = {}
    for sent in text:
        doc = nlp(' '.join(sent))
        for token in doc:
            if len(token.lemma_) <4 and str(token)!=str(token.lemma_):
                conversions[token] = token.lemma_
#                 print('%s : %s' (token, token.lemma_))
    return conversions

In [8]:
processed_bigram = n_gram_lemmatization(make_bigram(processed_docs.title))
processed_trigram = n_gram_lemmatization(make_trigram(processed_docs.title))
print(str(len(processed_bigram)))
print(str(len(processed_trigram)))
processed_docs['bigram'] = processed_bigram
processed_docs['trigram'] = processed_trigram
processed_docs.head(10)

45352
45352


Unnamed: 0,id,title,bigram,trigram
0,4105,"[baselin, characterist]","[baselin, characterist]","[baselin, characterist]"
1,4106,"[analysi, efficaci]","[analysi, efficaci]","[analysi, efficaci]"
2,4107,"[comparison, postop, carbohydr, antigen, level...","[comparison, postop, carbohydr, antigen, level...","[comparison, postop, carbohydr, antigen, level..."
3,4108,"[pattern, diseas, relaps]","[pattern, disea, relap]","[pattern, disea, relap]"
4,4109,"[grade, advers, event, gemcitabin, gemcitabin,...","[grade, adver, event, gemcitabin, gemcitabin, ...","[grade, adver, event, gemcitabin, gemcitabin, ..."
5,4111,"[baselin, characterist]","[baselin, characterist]","[baselin, characterist]"
6,4112,"[treatment, zoledron, acid]","[treatment, zoledron, acid]","[treatment, zoledron, acid]"
7,4113,"[treatment, docetaxel]","[treatment, docetaxel]","[treatment, docetaxel]"
8,4114,"[treatment, relaps, discret, treat, clinician]","[treatment, relap, discret, treat, clinician]","[treatment, relap, discret, treat, clinician]"
9,4115,"[worst, advers, event, grade, report, entir, t...","[bad, adver, event, grade, report, entir, time...","[bad, adver, event, grade, report, entir, time..."


In [9]:
dictionary_made_by = processed_trigram
dictionary_made_by_str = 'trigram'

dictionary = gensim.corpora.Dictionary(dictionary_made_by)

In [10]:
# vectorize
# Bag-of-words representation of the documents
bow_corpus = [dictionary.doc2bow(doc) for doc in dictionary_made_by]

bow_doc_100 = bow_corpus[100]
for i in range(len(bow_doc_100)):
    print("Word {} (\"{}\") appears {} times.".format(bow_doc_100[i][0],
                                                     dictionary[bow_doc_100[i][0]],
                                                     bow_doc_100[i][1]))

processed_docs['bow_corpus'] = bow_corpus
processed_docs.head(10)

Word 227 ("clinic") appears 1 times.
Word 237 ("arteri") appears 1 times.
Word 238 ("coronari") appears 1 times.
Word 269 ("calcif") appears 1 times.
Word 270 ("correl") appears 1 times.
Word 271 ("score") appears 1 times.


Unnamed: 0,id,title,bigram,trigram,bow_corpus
0,4105,"[baselin, characterist]","[baselin, characterist]","[baselin, characterist]","[(0, 1), (1, 1)]"
1,4106,"[analysi, efficaci]","[analysi, efficaci]","[analysi, efficaci]","[(2, 1), (3, 1)]"
2,4107,"[comparison, postop, carbohydr, antigen, level...","[comparison, postop, carbohydr, antigen, level...","[comparison, postop, carbohydr, antigen, level...","[(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1..."
3,4108,"[pattern, diseas, relaps]","[pattern, disea, relap]","[pattern, disea, relap]","[(14, 1), (15, 1), (16, 1)]"
4,4109,"[grade, advers, event, gemcitabin, gemcitabin,...","[grade, adver, event, gemcitabin, gemcitabin, ...","[grade, adver, event, gemcitabin, gemcitabin, ...","[(17, 1), (18, 1), (19, 1), (20, 2), (21, 1)]"
5,4111,"[baselin, characterist]","[baselin, characterist]","[baselin, characterist]","[(0, 1), (1, 1)]"
6,4112,"[treatment, zoledron, acid]","[treatment, zoledron, acid]","[treatment, zoledron, acid]","[(22, 1), (23, 1), (24, 1)]"
7,4113,"[treatment, docetaxel]","[treatment, docetaxel]","[treatment, docetaxel]","[(23, 1), (25, 1)]"
8,4114,"[treatment, relaps, discret, treat, clinician]","[treatment, relap, discret, treat, clinician]","[treatment, relap, discret, treat, clinician]","[(16, 1), (23, 1), (26, 1), (27, 1), (28, 1)]"
9,4115,"[worst, advers, event, grade, report, entir, t...","[bad, adver, event, grade, report, entir, time...","[bad, adver, event, grade, report, entir, time...","[(13, 1), (17, 1), (19, 1), (21, 1), (29, 1), ..."


In [11]:
# from pathlib import Path
from pprint import pprint

# dir_to_check = Path(directory+'/data/')
# if not dir_to_check.is_dir():
#     os.makedirs(directory+'/data/')
mallet_path = './mallet-2.0.8/bin/mallet'

directory ='./cosine_similarity/'
optimal_topic_num = 8
lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path=mallet_path, \
                                              corpus=bow_corpus, \
                                              num_topics=optimal_topic_num, \
                                              id2word=dictionary,\
                                            iterations=100,\
                                             prefix=directory)

#show topics
pprint(lda_mallet.show_topics(formatted=False))
# lda_mallet.load_word_topics()

[(0,
  [('event', 0.0866143533664295),
   ('treatment', 0.07293358651140446),
   ('popul', 0.04930109125849329),
   ('adver', 0.04927821372194642),
   ('patient', 0.029008716341424354),
   ('treat', 0.027018370661847133),
   ('outcom', 0.02482212715334813),
   ('group', 0.02058978289217817),
   ('primari', 0.02033812999016266),
   ('characterist', 0.01885109011461646)]),
 (1,
  [('coronari', 0.04859830492688833),
   ('arteri', 0.03412498835801434),
   ('leav', 0.03332401974480768),
   ('heart', 0.03310049362019186),
   ('ventricular', 0.031573065101983794),
   ('myocardi', 0.025016298779919902),
   ('infarct', 0.02224084939927354),
   ('pressur', 0.020024215330166715),
   ('blood', 0.018198751979137562),
   ('end', 0.015181149296824067)]),
 (2,
  [('year', 0.05378393351800554),
   ('age', 0.04660387811634349),
   ('relat', 0.026371191135734072),
   ('cancer', 0.02047645429362881),
   ('number', 0.0188808864265928),
   ('incid', 0.018459833795013852),
   ('woman', 0.01637673130193906),


In [None]:
topics = lda_mallet.show_topics(formatted=False)

In [None]:
import random

def format_topics_sentences(model, \
                            model_type='mallet', \
                            corpus=processed_docs, \
                            texts=dictionary_made_by):
    sent_topic_df = pd.DataFrame()
    if model_type=='tfidf':
        target_corpus = corpus.tfidf_corpus
    else:
        target_corpus = corpus.bow_corpus
    
    for i, row in enumerate(model[target_corpus]):
        origin_info = processed_docs.iloc[i]
        text_vec = texts[i]
        #get main topic in each document
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j==0: #dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
#                 print(pd.Series([origin_info.id,\
#                                 int(topic_num), \
#                                 round(prop_topic, 4), \
#                                 topic_keywords, \
#                                origin_info.title_y, \
#                                text_vec]))
                sent_topic_df = sent_topic_df.append(pd.Series([origin_info.id,\
                                                                int(topic_num), \
                                                                round(prop_topic, 4), \
                                                                topic_keywords, \
                                                               origin_info.title_y, \
                                                               text_vec]), ignore_index=True)
            else:
                break
    sent_topic_df.columns = ['id', 'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Origin_Text', 'Text_Vec']

    return (sent_topic_df)

In [None]:
processed_docs = processed_docs.merge(documents, on='id')

df_topic_sents_keywords_mallet = format_topics_sentences(model=lda_mallet, \
                                                  corpus=processed_docs, \
                                                  texts=dictionary_made_by, \
                                                 model_type='mallet')
df_topic_sents_keywords_mallet.head(20)

In [None]:
pro_bow = processed_docs[['id', 'bow_corpus']]
df_topic_sents_keywords_mallet = df_topic_sents_keywords_mallet.merge(pro_bow, on='id')
df_topic_sents_keywords_mallet.head(5)

In [None]:
def intra_inter(model, test_docs, num_pairs=10000):
    for indx, grp in df_topic_sents_keywords_mallet.groupby('Dominant_Topic'):
        # split each test document into two halves and compute topics for each half
        grp_size = len(grp)
        half_size = int(grp_size/2)
        part1 = grp.Text_Vec[:half_size]
        part2 = grp.Text_Vec[half_size:]
    
        # print computed similarities (uses cossim)
        print("average cosine similarity between corresponding parts (higher is better):")
        print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))

#     random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
#     print("average cosine similarity between 10,000 random parts (lower is better):")    
#     print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))

In [None]:
print("LDA results:")
intra_inter(lda_mallet, bow_corpus)

In [None]:
index = gensim.similarities.MatrixSimilarity(lda_mallet[bow_corpus])
index.save("simIndex.index")

vec_bow = dictionary.doc2bow(processed_docs)
vec_lda = lda_mallet[vec_bow]

sims = index[vec_lda]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)