In [6]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import math

**Create dataframe that matches paper abstract with subjects **

In [7]:
paper_id = []
abstract = []

f = open('aminer_2015.txt','r',encoding = 'utf8')
f.readline()
for i, line in enumerate(f):
        if (i+2) % 250000 == 0:
            print('file '+file+': ',round((i+2)/1000000*100,1),"%")
        json_line = json.loads(line)
        if 'year' in json_line and 'keywords' in json_line and \
        'abstract' in json_line and 'lang' in json_line and \
        'references' in json_line and 'issn' in json_line:
            
            if json_line['lang'] == 'en' :
                
                ## store paper info, later use to get the subject of the paper
                paper_id.append(json_line['id'])
                abstract.append(json_line['abstract'])

                
f.close()

In [8]:
df= pd.DataFrame()
df['id'] = paper_id
df['abstract'] = abstract
df.set_index('id')

Unnamed: 0_level_0,abstract
id,Unnamed: 1_level_1
53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...
53e99796b7602d9701f613ec,A graphical model is a probability distributio...
53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...
53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...
53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...
53e997a2b7602d9701f75a62,Contents.
53e997a2b7602d9701f75a78,cover
53e997a6b7602d9701f7c67e,The above comments on our previous paper are t...
53e997b2b7602d9701f91202,Machine learning's focus on ill-defined proble...
53e997b5b7602d9701f969d2,\n Die numerische Mathematik (kurz: Numerik) i...


In [9]:
df.head()

Unnamed: 0,id,abstract
0,53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...
1,53e99796b7602d9701f613ec,A graphical model is a probability distributio...
2,53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...
3,53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...
4,53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...


In [10]:
subject = pd.read_csv('paper_subject_match.csv',index_col = 'id')

In [11]:
subject.head()

Unnamed: 0_level_0,paper_subject
id,Unnamed: 1_level_1
53e99784b7602d9701f3e13e,13.0
53e99784b7602d9701f3e4f2,13.0
53e9978db7602d9701f4f415,13.0
53e99792b7602d9701f56a86,27.0
53e99792b7602d9701f5b087,


In [12]:
tm = pd.merge(df, subject, on = ['id'])

In [13]:
tm.head()

Unnamed: 0,id,abstract,paper_subject
0,53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...,
1,53e99796b7602d9701f613ec,A graphical model is a probability distributio...,
2,53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...,10.0
3,53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...,
4,53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...,29.0


In [14]:
tm.isnull().sum()

id                  0
abstract            0
paper_subject    6180
dtype: int64

In [15]:
len(tm)

95421

In [16]:
tm = tm.dropna()

In [17]:
tm = tm.drop(columns = ['id'])

In [18]:
tm['paper_subject'] = tm['paper_subject'].apply(np.int64)

**Concatenate all abstracts having the same subject(group by subject)**

In [19]:
tm_all = tm.groupby('paper_subject').agg({'abstract':'-'.join})

In [20]:
tm_all.reset_index(level=0, inplace=True)

In [21]:
tm_all.head()

Unnamed: 0,paper_subject,abstract
0,10,An inter-governmental body is encouraging the ...
1,11,The total variation (TV) regularization method...
2,12,Technologies for automated detection of neonat...
3,13,Skeletal stem cells (SSCs) reside in the postn...
4,14,Despite the extensive application of Monte Car...


**Processing the abstract:**
- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation
- Words that have fewer than 3 characters are removed
- All stopwords are removed
- lemmatized — words in third person to first person, verbs in past and future tenses to present
- Stemmed — words are reduced to their root form

In [22]:
#import sys
#!{sys.executable} -m pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2015)

In [23]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/yihuan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v')) #lemmatize as verb, default is noun

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

An example of processing the words in the abstract:

In [28]:
stemmer = SnowballStemmer('english') #Create a new instance of a language specific subclass
doc_sample = tm_all[tm_all['paper_subject'] == 10].values[0][1]
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words[:10])
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample[:100]))


['An', 'inter-governmental', 'body', 'is', 'encouraging', 'the', 'replacement', 'of', 'currency', 'with']


 tokenized and lemmatized document: 
['inter', 'government', 'bodi', 'encourag', 'replac', 'currenc', 'object', 'discoura']


The real processing of all the abstract, **takes time!**

In [27]:
processed_docs = tm_all['abstract'].map(preprocess)

In [51]:
processed_docs[11] #words processed in Economics, Econometrics and Finance

['consid',
 'problem',
 'estim',
 'local',
 'sensor',
 'paramet',
 'local',
 'paramet',
 'sensor',
 'observ',
 'relat',
 'linear',
 'stochast',
 'model',
 'studi',
 'gaussian',
 'product',
 'algorithm',
 'wireless',
 'network',
 'gspawn',
 'procedur',
 'compar',
 'popular',
 'diffus',
 'strategi',
 'perform',
 'network',
 'paramet',
 'estim',
 'communic',
 'cost',
 'sensor',
 'increas',
 'increas',
 'network',
 'densiti',
 'gspawn',
 'allow',
 'sensor',
 'broadcast',
 'messag',
 'size',
 'depend',
 'network',
 'size',
 'densiti',
 'make',
 'suitabl',
 'applic',
 'wireless',
 'sensor',
 'network',
 'gspawn',
 'converg',
 'mean',
 'mean',
 'squar',
 'stabil',
 'technic',
 'suffici',
 'condit',
 'applic',
 'gspawn',
 'network',
 'local',
 'problem',
 'line',
 'sight',
 'environ',
 'numer',
 'result',
 'suggest',
 'gspawn',
 'converg',
 'faster',
 'general',
 'diffus',
 'method',
 'lower',
 'communic',
 'cost',
 'sensor',
 'compar',
 'root',
 'mean',
 'squar',
 'error',
 'express',
 'clone

In [31]:
dictionary = gensim.corpora.Dictionary(processed_docs) 
#Dictionary encapsulates the mapping between normalized words and their integer ids.

In [41]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break

0 aaet
1 aakr
2 aapoaii
3 ababa
4 abandon
5 abas
6 abcc
7 abdomen
8 abdomin
9 abduct
10 aberr
11 abil
12 abiogen
13 abiot
14 abl
15 ablat
16 abmd
17 abnorm
18 abolish
19 abort
20 abound
21 abras
22 abrog
23 abrupt
24 abscis
25 absenc
26 absent
27 absentia
28 absolut
29 absorb
30 absorpt


In [None]:
# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# How should we limit?

In [33]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# for each subject, create a dictionary reporting words and how many times those words appear

In [52]:
bow_corpus[11] #（id in dict, how many times of appearance

[(8, 4),
 (10, 6),
 (11, 184),
 (13, 2),
 (14, 172),
 (15, 12),
 (17, 20),
 (21, 11),
 (23, 10),
 (25, 42),
 (26, 6),
 (28, 35),
 (29, 105),
 (30, 154),
 (32, 9),
 (34, 17),
 (35, 1),
 (38, 1),
 (39, 7),
 (41, 86),
 (42, 52),
 (43, 13),
 (44, 214),
 (45, 4),
 (47, 20),
 (48, 16),
 (49, 36),
 (50, 200),
 (51, 154),
 (56, 86),
 (57, 338),
 (58, 330),
 (62, 14),
 (67, 17),
 (68, 1),
 (71, 1144),
 (73, 203),
 (74, 1),
 (76, 1),
 (81, 68),
 (83, 48),
 (84, 32),
 (86, 11),
 (87, 50),
 (90, 8),
 (93, 3),
 (94, 49),
 (95, 758),
 (96, 3),
 (99, 5),
 (100, 1),
 (102, 63),
 (103, 130),
 (104, 7),
 (110, 349),
 (111, 1),
 (113, 111),
 (114, 12),
 (117, 711),
 (118, 216),
 (119, 1),
 (120, 2),
 (121, 3),
 (122, 2),
 (126, 25),
 (128, 119),
 (130, 2),
 (131, 3),
 (132, 10),
 (133, 45),
 (134, 150),
 (135, 5),
 (137, 12),
 (138, 25),
 (140, 9),
 (141, 8),
 (144, 172),
 (147, 1),
 (149, 22),
 (150, 70),
 (151, 2),
 (153, 171),
 (154, 293),
 (155, 6),
 (156, 31),
 (158, 1),
 (163, 7),
 (164, 2),
 (166,

In [53]:
# bag of words example in Economics, Econometrics and Finance
bow_doc_11 = bow_corpus[11]

for i in range(len(bow_doc_2)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_11[i][0], 
                                                     dictionary[bow_doc_11[i][0]], 
                                                     bow_doc_11[i][1]))

Word 8 ("abdomin") appears 4 time.
Word 10 ("aberr") appears 6 time.
Word 11 ("abil") appears 184 time.
Word 13 ("abiot") appears 2 time.
Word 14 ("abl") appears 172 time.
Word 15 ("ablat") appears 12 time.
Word 17 ("abnorm") appears 20 time.
Word 21 ("abras") appears 11 time.
Word 23 ("abrupt") appears 10 time.
Word 25 ("absenc") appears 42 time.
Word 26 ("absent") appears 6 time.
Word 28 ("absolut") appears 35 time.
Word 29 ("absorb") appears 105 time.
Word 30 ("absorpt") appears 154 time.
Word 32 ("abstract") appears 9 time.
Word 34 ("abund") appears 17 time.
Word 35 ("abus") appears 1 time.
Word 38 ("acad") appears 1 time.
Word 39 ("academ") appears 7 time.
Word 41 ("acceler") appears 86 time.
Word 42 ("accept") appears 52 time.
Word 43 ("acceptor") appears 13 time.
Word 44 ("access") appears 214 time.
Word 45 ("accid") appears 4 time.
Word 47 ("accommod") appears 20 time.
Word 48 ("accompani") appears 16 time.
Word 49 ("accomplish") appears 36 time.
Word 50 ("accord") appears 200 

Word 4493 ("imper") appears 3 time.
Word 4494 ("imperfect") appears 26 time.
Word 4495 ("imperm") appears 2 time.
Word 4496 ("imping") appears 8 time.
Word 4497 ("implant") appears 188 time.
Word 4498 ("implement") appears 638 time.
Word 4499 ("impli") appears 29 time.
Word 4500 ("implic") appears 27 time.
Word 4501 ("implicit") appears 15 time.
Word 4502 ("import") appears 450 time.
Word 4503 ("impos") appears 35 time.
Word 4504 ("imposit") appears 1 time.
Word 4505 ("imposs") appears 9 time.
Word 4506 ("imprecis") appears 4 time.
Word 4507 ("impress") appears 9 time.
Word 4508 ("imprint") appears 24 time.
Word 4509 ("improv") appears 1144 time.
Word 4510 ("impuls") appears 54 time.
Word 4511 ("imput") appears 2 time.
Word 4515 ("inabl") appears 3 time.
Word 4516 ("inaccess") appears 1 time.
Word 4517 ("inaccur") appears 9 time.
Word 4518 ("inact") appears 7 time.
Word 4519 ("inactiv") appears 10 time.
Word 4520 ("inadequ") appears 5 time.
Word 4521 ("inappropri") appears 2 time.
Word

TF-IDF method

In [54]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [55]:
corpus_tfidf = tfidf[bow_corpus]

In [56]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.010808341452570631),
 (1, 0.010808341452570631),
 (2, 0.010808341452570631),
 (3, 0.004864395986776133),
 (4, 0.006898295968474878),
 (5, 0.004254454731539502),
 (6, 0.002176509869467302),
 (7, 0.00428042058482585),
 (8, 0.004933245238394435),
 (9, 0.0019550227420478757),
 (10, 0.009884949871609882),
 (11, 0.004488794144535665),
 (12, 0.003581913988133879),
 (13, 0.014107301985064784),
 (15, 0.009759060083786093),
 (16, 0.010745741964401636),
 (17, 0.008312726005884107),
 (18, 0.01733476625705189),
 (19, 0.006160763248328936),
 (20, 0.0027346128569436406),
 (21, 0.0035193144999648876),
 (22, 0.0085608411696517),
 (23, 0.002737068356376615),
 (24, 0.004864395986776133),
 (25, 0.007700428225993526),
 (26, 0.0056573512768761875),
 (27, 0.0054041707262853155),
 (28, 0.001194894035067961),
 (29, 0.0026110852047120867),
 (30, 0.0032537470749265654),
 (31, 0.0017596572499824438),
 (32, 0.0026110852047120867),
 (33, 0.008508909463079004),
 (34, 0.012191998141963355),
 (35, 0.00364942447

 (2303, 0.0015607815586752445),
 (2304, 0.002853613723217233),
 (2305, 0.009172753251877228),
 (2306, 0.0008134367687316413),
 (2307, 0.0006099412552366308),
 (2308, 0.005841704171443365),
 (2309, 0.0018247122375844099),
 (2310, 0.0010167959609145517),
 (2311, 0.0019550227420478757),
 (2312, 0.0008053067473020627),
 (2313, 0.002176509869467302),
 (2314, 0.0016106134946041253),
 (2315, 0.0235527236833383),
 (2316, 0.0031047387367936887),
 (2317, 0.004433453869804856),
 (2318, 0.0019550227420478757),
 (2319, 0.020539278410473944),
 (2320, 0.010444340818848347),
 (2322, 0.004710544736667661),
 (2323, 0.001409498639541267),
 (2324, 0.009314216210381065),
 (2325, 0.01958858882520572),
 (2326, 0.004959543838422133),
 (2327, 0.0046052907071062665),
 (2328, 0.010938451427774562),
 (2329, 0.01101957832340453),
 (2330, 0.0017596572499824438),
 (2331, 0.0016914296776511198),
 (2332, 0.0027346128569436406),
 (2333, 0.0008053067473020627),
 (2334, 0.0054041707262853155),
 (2335, 0.00563799455816506

 (4333, 0.002176509869467302),
 (4334, 0.001409498639541267),
 (4335, 0.0020535877494429785),
 (4336, 0.010808341452570631),
 (4337, 0.0057485799737290654),
 (4338, 0.010347443952712317),
 (4339, 0.003080381624164468),
 (4340, 0.002176509869467302),
 (4341, 0.004864395986776133),
 (4342, 0.0016106134946041253),
 (4343, 0.004254454731539502),
 (4344, 0.0031047387367936887),
 (4345, 0.043233365810282524),
 (4346, 0.0054041707262853155),
 (4347, 0.0008703617349040289),
 (4348, 0.004561780593961024),
 (4349, 0.008798286249912218),
 (4350, 0.008508909463079004),
 (4351, 0.008203838570830922),
 (4352, 0.0014268068616086166),
 (4353, 0.0054041707262853155),
 (4354, 0.010808341452570631),
 (4355, 0.003581913988133879),
 (4356, 0.009314216210381065),
 (4357, 0.002176509869467302),
 (4358, 0.004754690586593482),
 (4359, 0.0031047387367936887),
 (4360, 0.016212512178855946),
 (4361, 0.0031047387367936887),
 (4362, 0.003581913988133879),
 (4363, 0.0014268068616086166),
 (4364, 0.005404170726285315

 (6359, 0.007163827976267758),
 (6360, 0.0054041707262853155),
 (6361, 0.0035193144999648876),
 (6362, 0.0027346128569436406),
 (6363, 0.0054041707262853155),
 (6364, 0.0014268068616086166),
 (6365, 0.004254454731539502),
 (6366, 0.004254454731539502),
 (6367, 0.0015848968621978274),
 (6368, 0.01945758394710453),
 (6369, 0.004353019738934604),
 (6370, 0.0010405210391168298),
 (6371, 0.0035193144999648876),
 (6372, 0.0054041707262853155),
 (6373, 0.0027346128569436406),
 (6374, 0.004254454731539502),
 (6375, 0.0054041707262853155),
 (6376, 0.007163827976267758),
 (6377, 0.0054041707262853155),
 (6378, 0.0027346128569436406),
 (6379, 0.0031047387367936887),
 (6380, 0.002853613723217233),
 (6381, 0.0054041707262853155),
 (6382, 0.0027346128569436406),
 (6383, 0.003581913988133879),
 (6384, 0.0027346128569436406),
 (6385, 0.0054041707262853155),
 (6386, 0.0027346128569436406),
 (6387, 0.007296593980164199),
 (6388, 0.0054041707262853155),
 (6389, 0.0031882781818428),
 (6390, 0.001426806861

 (8589, 0.003581913988133879),
 (8590, 0.008053067473020627),
 (8591, 0.0019550227420478757),
 (8592, 0.0031047387367936887),
 (8594, 0.0008703617349040289),
 (8595, 0.0038967844444745335),
 (8596, 0.0026436694983778346),
 (8597, 0.0008053067473020627),
 (8598, 0.003581913988133879),
 (8599, 0.0019550227420478757),
 (8600, 0.018215396528550525),
 (8601, 0.0014268068616086166),
 (8602, 0.002737068356376615),
 (8603, 0.0007047493197706335),
 (8604, 0.0038474459959267596),
 (8605, 0.0027346128569436406),
 (8606, 0.003910045484095751),
 (8607, 0.0014268068616086166),
 (8608, 0.0006099412552366308),
 (8609, 0.0014268068616086166),
 (8610, 0.0019550227420478757),
 (8611, 0.029781183120776513),
 (8612, 0.002853613723217233),
 (8613, 0.0027346128569436406),
 (8614, 0.029186375920656794),
 (8615, 0.0008053067473020627),
 (8616, 0.009987648031260317),
 (8617, 0.012418954947174755),
 (8618, 0.0002770908668628035),
 (8619, 0.012763364194618505),
 (8620, 0.0015848968621978274),
 (8621, 0.0017596572

Running LDA using Bag of Words

In [66]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=100, id2word=dictionary, passes=2, workers=2)

In [60]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"studi" + 0.006*"cell" + 0.006*"result" + 0.005*"activ" + 0.005*"effect" + 0.005*"high" + 0.004*"patient" + 0.004*"group" + 0.004*"function" + 0.004*"develop"
Topic: 1 
Words: 0.010*"cell" + 0.007*"studi" + 0.006*"gene" + 0.006*"result" + 0.006*"effect" + 0.005*"protein" + 0.005*"express" + 0.005*"increas" + 0.005*"activ" + 0.005*"patient"
Topic: 2 
Words: 0.007*"studi" + 0.006*"activ" + 0.006*"cell" + 0.005*"effect" + 0.005*"differ" + 0.005*"result" + 0.005*"increas" + 0.005*"patient" + 0.005*"model" + 0.004*"high"
Topic: 3 
Words: 0.012*"studi" + 0.008*"patient" + 0.007*"associ" + 0.006*"effect" + 0.005*"group" + 0.005*"differ" + 0.005*"signific" + 0.004*"increas" + 0.004*"level" + 0.004*"includ"
Topic: 4 
Words: 0.010*"studi" + 0.009*"cell" + 0.009*"patient" + 0.006*"activ" + 0.006*"signific" + 0.005*"effect" + 0.005*"express" + 0.004*"increas" + 0.004*"high" + 0.004*"differ"
Topic: 5 
Words: 0.009*"cell" + 0.008*"studi" + 0.008*"effect" + 0.007*"activ" + 0.00

Running LDA using TF-IDF

In [61]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary, passes=2, workers=4)

In [62]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.000*"cow" + 0.000*"milk" + 0.000*"dietari" + 0.000*"calv" + 0.000*"phenol" + 0.000*"speci" + 0.000*"antioxid" + 0.000*"cell" + 0.000*"gene" + 0.000*"crop"
Topic: 1 Word: 0.000*"graphen" + 0.000*"nanoparticl" + 0.000*"polym" + 0.000*"copolym" + 0.000*"poli" + 0.000*"aerogel" + 0.000*"coat" + 0.000*"nanocomposit" + 0.000*"fabric" + 0.000*"nanosheet"
Topic: 2 Word: 0.000*"periodont" + 0.000*"implant" + 0.000*"teeth" + 0.000*"dental" + 0.000*"gingiv" + 0.000*"dentur" + 0.000*"abut" + 0.000*"cbct" + 0.000*"dentin" + 0.000*"ohrqol"
Topic: 3 Word: 0.000*"patient" + 0.000*"cell" + 0.000*"tumor" + 0.000*"fnip" + 0.000*"protein" + 0.000*"mice" + 0.000*"phosphoryl" + 0.000*"receptor" + 0.000*"mutant" + 0.000*"women"
Topic: 4 Word: 0.000*"hyperspectr" + 0.000*"polarimetr" + 0.000*"radar" + 0.000*"modi" + 0.000*"insar" + 0.000*"isar" + 0.000*"polsar" + 0.000*"apertur" + 0.000*"ionospher" + 0.000*"brdf"
Topic: 5 Word: 0.000*"readmiss" + 0.000*"unplan" + 0.000*"copd" + 0.000*"chiropr

Classify sample document using LDA Bag of Words model

In [63]:
processed_docs[11]

['consid',
 'problem',
 'estim',
 'local',
 'sensor',
 'paramet',
 'local',
 'paramet',
 'sensor',
 'observ',
 'relat',
 'linear',
 'stochast',
 'model',
 'studi',
 'gaussian',
 'product',
 'algorithm',
 'wireless',
 'network',
 'gspawn',
 'procedur',
 'compar',
 'popular',
 'diffus',
 'strategi',
 'perform',
 'network',
 'paramet',
 'estim',
 'communic',
 'cost',
 'sensor',
 'increas',
 'increas',
 'network',
 'densiti',
 'gspawn',
 'allow',
 'sensor',
 'broadcast',
 'messag',
 'size',
 'depend',
 'network',
 'size',
 'densiti',
 'make',
 'suitabl',
 'applic',
 'wireless',
 'sensor',
 'network',
 'gspawn',
 'converg',
 'mean',
 'mean',
 'squar',
 'stabil',
 'technic',
 'suffici',
 'condit',
 'applic',
 'gspawn',
 'network',
 'local',
 'problem',
 'line',
 'sight',
 'environ',
 'numer',
 'result',
 'suggest',
 'gspawn',
 'converg',
 'faster',
 'general',
 'diffus',
 'method',
 'lower',
 'communic',
 'cost',
 'sensor',
 'compar',
 'root',
 'mean',
 'squar',
 'error',
 'express',
 'clone

In [67]:
for index, score in sorted(lda_model[bow_corpus[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9999974370002747	 
Topic: 0.009*"propos" + 0.007*"base" + 0.007*"result" + 0.007*"method" + 0.006*"model" + 0.006*"high" + 0.005*"perform" + 0.005*"power" + 0.005*"effect" + 0.005*"control"


In [65]:
for index, score in sorted(lda_model_tfidf[bow_corpus[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9907759428024292	 
Topic: 0.000*"cell" + 0.000*"graphen" + 0.000*"antenna" + 0.000*"nanoparticl" + 0.000*"mimo" + 0.000*"protein" + 0.000*"polym" + 0.000*"nanowir" + 0.000*"fabric" + 0.000*"wireless"
