In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import math

**Create dataframe that matches paper abstract with subjects **

In [2]:
paper_id = []
abstract = []

f = open('aminer_2015.txt','r',encoding = 'utf8')
f.readline()
for i, line in enumerate(f):
        if (i+2) % 250000 == 0:
            print('file '+file+': ',round((i+2)/1000000*100,1),"%")
        json_line = json.loads(line)
        if 'year' in json_line and 'keywords' in json_line and \
        'abstract' in json_line and 'lang' in json_line and \
        'references' in json_line and 'issn' in json_line:
            
            if json_line['lang'] == 'en' :
                
                ## store paper info, later use to get the subject of the paper
                paper_id.append(json_line['id'])
                abstract.append(json_line['abstract'])

                
f.close()

In [3]:
df= pd.DataFrame()
df['id'] = paper_id
df['abstract'] = abstract
#df.set_index('id')
df.describe()

Unnamed: 0,id,abstract
count,95421,95421
unique,95421,95413
top,55323e1245cec66b6f9e1750,The 2015 Tandem American Society for Blood and...
freq,1,2


In [9]:
df.head()

Unnamed: 0,id,abstract
0,53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...
1,53e99796b7602d9701f613ec,A graphical model is a probability distributio...
2,53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...
3,53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...
4,53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...


In [10]:
df.iloc[1,1]

'A graphical model is a probability distribution associated with a graph with the following properties: (1) the nodes in the graph represent the variables in the model, (2) separation in the graph implies conditional independence of the variables given the separating set, and (3) the probability distribution can be factored according to the graph. Graphical models are useful for researchers because they support efficient computation in models with many variables and provide a visualization of a complex model.'

In [33]:
subject = pd.read_csv('paper_subject_match.csv',index_col = 'id')

In [34]:
subject.head()

Unnamed: 0_level_0,paper_subject
id,Unnamed: 1_level_1
53e99784b7602d9701f3e13e,13.0
53e99784b7602d9701f3e4f2,13.0
53e9978db7602d9701f4f415,13.0
53e99792b7602d9701f56a86,27.0
53e99792b7602d9701f5b087,


In [35]:
tm = pd.merge(df, subject, on = ['id'])

In [36]:
tm.head()

Unnamed: 0,id,abstract,paper_subject
0,53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...,
1,53e99796b7602d9701f613ec,A graphical model is a probability distributio...,
2,53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...,10.0
3,53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...,
4,53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...,29.0


In [37]:
tm.isnull().sum()

id                  0
abstract            0
paper_subject    6180
dtype: int64

In [38]:
len(tm)

95421

In [39]:
tm = tm.dropna()

In [40]:
tm = tm.drop(columns = ['id'])

In [41]:
tm['paper_subject'] = tm['paper_subject'].apply(np.int64)

**Concatenate all abstracts having the same subject(group by subject)**

In [42]:
tm_all = tm.groupby('paper_subject').agg({'abstract':'-'.join})

In [43]:
tm_all.reset_index(level=0, inplace=True)

In [21]:
tm_all.head()

Unnamed: 0,paper_subject,abstract
0,10,An inter-governmental body is encouraging the ...
1,11,The total variation (TV) regularization method...
2,12,Technologies for automated detection of neonat...
3,13,Skeletal stem cells (SSCs) reside in the postn...
4,14,Despite the extensive application of Monte Car...


**Processing the abstract:**
- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation
- Words that have fewer than 3 characters are removed
- All stopwords are removed
- lemmatized — words in third person to first person, verbs in past and future tenses to present
- Stemmed — words are reduced to their root form

In [5]:
#import sys
#!{sys.executable} -m pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2015)

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/yihuan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v')) #lemmatize as verb, default is noun

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

An example of processing the words in the abstract:

In [13]:
stemmer = SnowballStemmer('english') #Create a new instance of a language specific subclass
doc_sample = df.iloc[1,1]
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words[:100])
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample[:1000]))


['A', 'graphical', 'model', 'is', 'a', 'probability', 'distribution', 'associated', 'with', 'a', 'graph', 'with', 'the', 'following', 'properties:', '(1)', 'the', 'nodes', 'in', 'the', 'graph', 'represent', 'the', 'variables', 'in', 'the', 'model,', '(2)', 'separation', 'in', 'the', 'graph', 'implies', 'conditional', 'independence', 'of', 'the', 'variables', 'given', 'the', 'separating', 'set,', 'and', '(3)', 'the', 'probability', 'distribution', 'can', 'be', 'factored', 'according', 'to', 'the', 'graph.', 'Graphical', 'models', 'are', 'useful', 'for', 'researchers', 'because', 'they', 'support', 'efficient', 'computation', 'in', 'models', 'with', 'many', 'variables', 'and', 'provide', 'a', 'visualization', 'of', 'a', 'complex', 'model.']


 tokenized and lemmatized document: 
['graphic', 'model', 'probabl', 'distribut', 'associ', 'graph', 'follow', 'properti', 'nod', 'graph', 'repres', 'variabl', 'model', 'separ', 'graph', 'impli', 'condit', 'independ', 'variabl', 'give', 'separ', 'pr

The real processing of all the abstract, **takes time!**

In [14]:
processed_docs = df['abstract'].map(preprocess)

In [15]:
processed_docs[1] #words processed in Economics, Econometrics and Finance

['graphic',
 'model',
 'probabl',
 'distribut',
 'associ',
 'graph',
 'follow',
 'properti',
 'nod',
 'graph',
 'repres',
 'variabl',
 'model',
 'separ',
 'graph',
 'impli',
 'condit',
 'independ',
 'variabl',
 'give',
 'separ',
 'probabl',
 'distribut',
 'factor',
 'accord',
 'graph',
 'graphic',
 'model',
 'use',
 'research',
 'support',
 'effici',
 'comput',
 'model',
 'variabl',
 'provid',
 'visual',
 'complex',
 'model']

In [16]:
dictionary = gensim.corpora.Dictionary(processed_docs) 
#Dictionary encapsulates the mapping between normalized words and their integer ids.

In [17]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break

0 academ
1 accept
2 acknowledg
3 activ
4 actuat
5 adapt
6 add
7 addit
8 address
9 administr
10 affair
11 ahm
12 aim
13 allow
14 analyt
15 anim
16 app
17 applic
18 approach
19 area
20 artifact
21 aspect
22 assist
23 attende
24 attract
25 author
26 autonom
27 autonomi
28 base
29 basic
30 benefit


In [None]:
# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# How should we limit?

In [18]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# for each subject, create a dictionary reporting words and how many times those words appear

In [52]:
bow_corpus[1] #（id in dict, how many times of appearance

[(8, 4),
 (10, 6),
 (11, 184),
 (13, 2),
 (14, 172),
 (15, 12),
 (17, 20),
 (21, 11),
 (23, 10),
 (25, 42),
 (26, 6),
 (28, 35),
 (29, 105),
 (30, 154),
 (32, 9),
 (34, 17),
 (35, 1),
 (38, 1),
 (39, 7),
 (41, 86),
 (42, 52),
 (43, 13),
 (44, 214),
 (45, 4),
 (47, 20),
 (48, 16),
 (49, 36),
 (50, 200),
 (51, 154),
 (56, 86),
 (57, 338),
 (58, 330),
 (62, 14),
 (67, 17),
 (68, 1),
 (71, 1144),
 (73, 203),
 (74, 1),
 (76, 1),
 (81, 68),
 (83, 48),
 (84, 32),
 (86, 11),
 (87, 50),
 (90, 8),
 (93, 3),
 (94, 49),
 (95, 758),
 (96, 3),
 (99, 5),
 (100, 1),
 (102, 63),
 (103, 130),
 (104, 7),
 (110, 349),
 (111, 1),
 (113, 111),
 (114, 12),
 (117, 711),
 (118, 216),
 (119, 1),
 (120, 2),
 (121, 3),
 (122, 2),
 (126, 25),
 (128, 119),
 (130, 2),
 (131, 3),
 (132, 10),
 (133, 45),
 (134, 150),
 (135, 5),
 (137, 12),
 (138, 25),
 (140, 9),
 (141, 8),
 (144, 172),
 (147, 1),
 (149, 22),
 (150, 70),
 (151, 2),
 (153, 171),
 (154, 293),
 (155, 6),
 (156, 31),
 (158, 1),
 (163, 7),
 (164, 2),
 (166,

In [21]:
# bag of words example in Economics, Econometrics and Finance
bow_doc_1 = bow_corpus[1]

for i in range(len(bow_doc_1)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1[i][0], 
                                                     dictionary[bow_doc_1[i][0]], 
                                                     bow_doc_1[i][1]))

Word 110 ("follow") appears 1 time.
Word 117 ("give") appears 1 time.
Word 144 ("independ") appears 1 time.
Word 186 ("model") appears 5 time.
Word 246 ("provid") appears 1 time.
Word 258 ("repres") appears 1 time.
Word 260 ("research") appears 1 time.
Word 294 ("support") appears 1 time.
Word 328 ("accord") appears 1 time.
Word 329 ("associ") appears 1 time.
Word 330 ("complex") appears 1 time.
Word 331 ("comput") appears 1 time.
Word 332 ("condit") appears 1 time.
Word 333 ("distribut") appears 2 time.
Word 334 ("effici") appears 1 time.
Word 335 ("factor") appears 1 time.
Word 336 ("graph") appears 4 time.
Word 337 ("graphic") appears 2 time.
Word 338 ("impli") appears 1 time.
Word 339 ("nod") appears 1 time.
Word 340 ("probabl") appears 2 time.
Word 341 ("properti") appears 1 time.
Word 342 ("separ") appears 2 time.
Word 343 ("use") appears 1 time.
Word 344 ("variabl") appears 3 time.
Word 345 ("visual") appears 1 time.


TF-IDF method

In [22]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [23]:
corpus_tfidf = tfidf[bow_corpus]

In [24]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.01942468004371076),
 (1, 0.030999255857770868),
 (2, 0.023715142168980616),
 (3, 0.018587680457326106),
 (4, 0.024656993243389585),
 (5, 0.013213920397690388),
 (6, 0.01607170054937349),
 (7, 0.007648930221198545),
 (8, 0.03877588177016929),
 (9, 0.027893912046377388),
 (10, 0.02780741317110187),
 (11, 0.03764608407701659),
 (12, 0.010942724786037675),
 (13, 0.011460566056561436),
 (14, 0.016217642766714214),
 (15, 0.012490831160679092),
 (16, 0.03161380942669552),
 (17, 0.010110687279137776),
 (18, 0.008822194473518764),
 (19, 0.02061137222506658),
 (20, 0.025226381234989194),
 (21, 0.01474708397212484),
 (22, 0.015323179479720289),
 (23, 0.030544464637761747),
 (24, 0.01733562602269375),
 (25, 0.031385203140603335),
 (26, 0.021404664741581255),
 (27, 0.025114657728709827),
 (28, 0.005829199520834839),
 (29, 0.03341849838574794),
 (30, 0.025499719218309114),
 (31, 0.06126143547348521),
 (32, 0.045985922522554824),
 (33, 0.011264120594370545),
 (34, 0.0423839260339628),
 (35, 0.

Running LDA using Bag of Words

In [25]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [26]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.052*"patient" + 0.010*"studi" + 0.010*"group" + 0.009*"year" + 0.009*"signific" + 0.008*"case" + 0.008*"treatment" + 0.008*"clinic" + 0.007*"associ" + 0.007*"follow"
Topic: 1 
Words: 0.015*"method" + 0.015*"imag" + 0.011*"measur" + 0.010*"model" + 0.010*"time" + 0.008*"result" + 0.007*"base" + 0.007*"data" + 0.007*"perform" + 0.007*"propos"
Topic: 2 
Words: 0.014*"studi" + 0.014*"health" + 0.010*"risk" + 0.009*"care" + 0.008*"associ" + 0.008*"patient" + 0.006*"data" + 0.006*"year" + 0.006*"women" + 0.006*"report"
Topic: 3 
Words: 0.017*"speci" + 0.009*"differ" + 0.008*"sampl" + 0.008*"studi" + 0.008*"model" + 0.008*"popul" + 0.008*"data" + 0.006*"plant" + 0.006*"method" + 0.006*"genet"
Topic: 4 
Words: 0.015*"activ" + 0.014*"level" + 0.013*"increas" + 0.012*"induc" + 0.011*"effect" + 0.009*"express" + 0.008*"signific" + 0.008*"studi" + 0.008*"mice" + 0.006*"associ"
Topic: 5 
Words: 0.038*"cell" + 0.017*"gene" + 0.016*"express" + 0.010*"protein" + 0.009*"cancer" + 0.0

Running LDA using TF-IDF

In [27]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [28]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.003*"compound" + 0.003*"acid" + 0.003*"protein" + 0.003*"cell" + 0.003*"activ" + 0.003*"structur" + 0.003*"surfac" + 0.002*"temperatur" + 0.002*"plant" + 0.002*"product"
Topic: 1 Word: 0.005*"patient" + 0.004*"cell" + 0.003*"group" + 0.003*"associ" + 0.003*"cancer" + 0.003*"express" + 0.002*"diseas" + 0.002*"risk" + 0.002*"treatment" + 0.002*"gene"
Topic: 2 Word: 0.003*"dialysi" + 0.003*"hydrogel" + 0.003*"rcts" + 0.003*"hypox" + 0.003*"sirt" + 0.002*"phage" + 0.002*"claudin" + 0.002*"pufa" + 0.002*"periton" + 0.002*"gondii"
Topic: 3 Word: 0.005*"pylori" + 0.004*"leptin" + 0.003*"endometri" + 0.003*"screw" + 0.003*"toxin" + 0.002*"tdcs" + 0.002*"aborigin" + 0.002*"beverag" + 0.002*"literaci" + 0.002*"glioblastoma"
Topic: 4 Word: 0.006*"resect" + 0.006*"tumour" + 0.005*"colon" + 0.005*"pancreat" + 0.005*"surgic" + 0.004*"nerv" + 0.004*"epilepsi" + 0.004*"spinal" + 0.004*"cord" + 0.004*"obes"
Topic: 5 Word: 0.006*"obes" + 0.006*"nurs" + 0.005*"intak" + 0.004*"dietari" + 

Classify sample document using LDA Bag of Words model

In [29]:
processed_docs[1]

['graphic',
 'model',
 'probabl',
 'distribut',
 'associ',
 'graph',
 'follow',
 'properti',
 'nod',
 'graph',
 'repres',
 'variabl',
 'model',
 'separ',
 'graph',
 'impli',
 'condit',
 'independ',
 'variabl',
 'give',
 'separ',
 'probabl',
 'distribut',
 'factor',
 'accord',
 'graph',
 'graphic',
 'model',
 'use',
 'research',
 'support',
 'effici',
 'comput',
 'model',
 'variabl',
 'provid',
 'visual',
 'complex',
 'model']

In [30]:
for index, score in sorted(lda_model[bow_corpus[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.6990560293197632	 
Topic: 0.015*"method" + 0.015*"imag" + 0.011*"measur" + 0.010*"model" + 0.010*"time" + 0.008*"result" + 0.007*"base" + 0.007*"data" + 0.007*"perform" + 0.007*"propos"

Score: 0.28093457221984863	 
Topic: 0.017*"speci" + 0.009*"differ" + 0.008*"sampl" + 0.008*"studi" + 0.008*"model" + 0.008*"popul" + 0.008*"data" + 0.006*"plant" + 0.006*"method" + 0.006*"genet"


In [31]:
for index, score in sorted(lda_model_tfidf[bow_corpus[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9774827361106873	 
Topic: 0.005*"patient" + 0.004*"cell" + 0.003*"group" + 0.003*"associ" + 0.003*"cancer" + 0.003*"express" + 0.002*"diseas" + 0.002*"risk" + 0.002*"treatment" + 0.002*"gene"


Try to estimate for subjects

In [44]:
processed_subject = tm_all['abstract'].map(preprocess)

In [45]:
processed_subject[11] #words processed in Economics, Econometrics and Finance

['consid',
 'problem',
 'estim',
 'local',
 'sensor',
 'paramet',
 'local',
 'paramet',
 'sensor',
 'observ',
 'relat',
 'linear',
 'stochast',
 'model',
 'studi',
 'gaussian',
 'product',
 'algorithm',
 'wireless',
 'network',
 'gspawn',
 'procedur',
 'compar',
 'popular',
 'diffus',
 'strategi',
 'perform',
 'network',
 'paramet',
 'estim',
 'communic',
 'cost',
 'sensor',
 'increas',
 'increas',
 'network',
 'densiti',
 'gspawn',
 'allow',
 'sensor',
 'broadcast',
 'messag',
 'size',
 'depend',
 'network',
 'size',
 'densiti',
 'make',
 'suitabl',
 'applic',
 'wireless',
 'sensor',
 'network',
 'gspawn',
 'converg',
 'mean',
 'mean',
 'squar',
 'stabil',
 'technic',
 'suffici',
 'condit',
 'applic',
 'gspawn',
 'network',
 'local',
 'problem',
 'line',
 'sight',
 'environ',
 'numer',
 'result',
 'suggest',
 'gspawn',
 'converg',
 'faster',
 'general',
 'diffus',
 'method',
 'lower',
 'communic',
 'cost',
 'sensor',
 'compar',
 'root',
 'mean',
 'squar',
 'error',
 'express',
 'clone

In [46]:
dictionary_sub = gensim.corpora.Dictionary(processed_subject) 
bow_corpus_sub = [dictionary_sub.doc2bow(doc) for doc in processed_subject]


In [47]:
for index, score in sorted(lda_model[bow_corpus_sub[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.34674182534217834	 
Topic: 0.008*"structur" + 0.007*"high" + 0.007*"surfac" + 0.006*"base" + 0.006*"result" + 0.005*"method" + 0.005*"applic" + 0.005*"optic" + 0.005*"properti" + 0.004*"materi"

Score: 0.13652276992797852	 
Topic: 0.015*"method" + 0.015*"imag" + 0.011*"measur" + 0.010*"model" + 0.010*"time" + 0.008*"result" + 0.007*"base" + 0.007*"data" + 0.007*"perform" + 0.007*"propos"

Score: 0.10468249022960663	 
Topic: 0.010*"children" + 0.010*"studi" + 0.009*"food" + 0.008*"differ" + 0.006*"relat" + 0.006*"obes" + 0.006*"behavior" + 0.006*"effect" + 0.006*"task" + 0.006*"particip"

Score: 0.09535442292690277	 
Topic: 0.017*"speci" + 0.009*"differ" + 0.008*"sampl" + 0.008*"studi" + 0.008*"model" + 0.008*"popul" + 0.008*"data" + 0.006*"plant" + 0.006*"method" + 0.006*"genet"

Score: 0.09149647504091263	 
Topic: 0.014*"studi" + 0.014*"health" + 0.010*"risk" + 0.009*"care" + 0.008*"associ" + 0.008*"patient" + 0.006*"data" + 0.006*"year" + 0.006*"women" + 0.006*"report"

Sco

In [48]:
for index, score in sorted(lda_model_tfidf[bow_corpus[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5669774413108826	 
Topic: 0.003*"compound" + 0.003*"acid" + 0.003*"protein" + 0.003*"cell" + 0.003*"activ" + 0.003*"structur" + 0.003*"surfac" + 0.002*"temperatur" + 0.002*"plant" + 0.002*"product"

Score: 0.4101311266422272	 
Topic: 0.005*"patient" + 0.004*"cell" + 0.003*"group" + 0.003*"associ" + 0.003*"cancer" + 0.003*"express" + 0.002*"diseas" + 0.002*"risk" + 0.002*"treatment" + 0.002*"gene"
