In [3]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import math

**Create dataframe that matches paper abstract with subjects **

In [4]:
paper_id = []
abstract = []

f = open('aminer_2015.txt','r',encoding = 'utf8')
f.readline()
for i, line in enumerate(f):
        if (i+2) % 250000 == 0:
            print('file '+file+': ',round((i+2)/1000000*100,1),"%")
        json_line = json.loads(line)
        if 'year' in json_line and 'keywords' in json_line and \
        'abstract' in json_line and 'lang' in json_line and \
        'references' in json_line and 'issn' in json_line:
            
            if json_line['lang'] == 'en' :
                
                ## store paper info, later use to get the subject of the paper
                paper_id.append(json_line['id'])
                abstract.append(json_line['abstract'])

                
f.close()

In [5]:
df= pd.DataFrame()
df['id'] = paper_id
df['abstract'] = abstract
#df.set_index('id')
df.describe()

Unnamed: 0,id,abstract
count,95421,95421
unique,95421,95413
top,55a6bcc065ce054aad738dc4,This article is one of ten reviews selected fr...
freq,1,2


In [6]:
df.head()

Unnamed: 0,id,abstract
0,53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...
1,53e99796b7602d9701f613ec,A graphical model is a probability distributio...
2,53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...
3,53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...
4,53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...


In [7]:
df.iloc[1,1]

'A graphical model is a probability distribution associated with a graph with the following properties: (1) the nodes in the graph represent the variables in the model, (2) separation in the graph implies conditional independence of the variables given the separating set, and (3) the probability distribution can be factored according to the graph. Graphical models are useful for researchers because they support efficient computation in models with many variables and provide a visualization of a complex model.'

In [8]:
subject = pd.read_csv('paper_subject_match.csv',index_col = 'id')

In [9]:
subject.head()

Unnamed: 0_level_0,paper_subject
id,Unnamed: 1_level_1
53e99784b7602d9701f3e13e,13.0
53e99784b7602d9701f3e4f2,13.0
53e9978db7602d9701f4f415,13.0
53e99792b7602d9701f56a86,27.0
53e99792b7602d9701f5b087,


In [10]:
tm = pd.merge(df, subject, on = ['id'])

In [11]:
tm.head()

Unnamed: 0,id,abstract,paper_subject
0,53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...,
1,53e99796b7602d9701f613ec,A graphical model is a probability distributio...,
2,53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...,10.0
3,53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...,
4,53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...,29.0


In [12]:
tm.isnull().sum()

id                  0
abstract            0
paper_subject    6180
dtype: int64

In [13]:
len(tm)

95421

In [14]:
tm = tm.dropna()

In [15]:
tm = tm.drop(columns = ['id'])

In [16]:
tm['paper_subject'] = tm['paper_subject'].apply(np.int64)

**Concatenate all abstracts having the same subject(group by subject)**

In [17]:
tm_all = tm.groupby('paper_subject').agg({'abstract':'-'.join})

In [18]:
tm_all.reset_index(level=0, inplace=True)

In [19]:
tm_all.head()

Unnamed: 0,paper_subject,abstract
0,10,An inter-governmental body is encouraging the ...
1,11,The total variation (TV) regularization method...
2,12,Technologies for automated detection of neonat...
3,13,Skeletal stem cells (SSCs) reside in the postn...
4,14,Despite the extensive application of Monte Car...


**Processing the abstract:**
- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation
- Words that have fewer than 3 characters are removed
- All stopwords are removed
- lemmatized — words in third person to first person, verbs in past and future tenses to present
- Stemmed — words are reduced to their root form

In [20]:
#import sys
#!{sys.executable} -m pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2015)

In [21]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/yihuan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v')) #lemmatize as verb, default is noun

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

An example of processing the words in the abstract:

In [23]:
stemmer = SnowballStemmer('english') #Create a new instance of a language specific subclass
doc_sample = df.iloc[1,1]
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words[:100])
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample[:1000]))


['A', 'graphical', 'model', 'is', 'a', 'probability', 'distribution', 'associated', 'with', 'a', 'graph', 'with', 'the', 'following', 'properties:', '(1)', 'the', 'nodes', 'in', 'the', 'graph', 'represent', 'the', 'variables', 'in', 'the', 'model,', '(2)', 'separation', 'in', 'the', 'graph', 'implies', 'conditional', 'independence', 'of', 'the', 'variables', 'given', 'the', 'separating', 'set,', 'and', '(3)', 'the', 'probability', 'distribution', 'can', 'be', 'factored', 'according', 'to', 'the', 'graph.', 'Graphical', 'models', 'are', 'useful', 'for', 'researchers', 'because', 'they', 'support', 'efficient', 'computation', 'in', 'models', 'with', 'many', 'variables', 'and', 'provide', 'a', 'visualization', 'of', 'a', 'complex', 'model.']


 tokenized and lemmatized document: 
['graphic', 'model', 'probabl', 'distribut', 'associ', 'graph', 'follow', 'properti', 'nod', 'graph', 'repres', 'variabl', 'model', 'separ', 'graph', 'impli', 'condit', 'independ', 'variabl', 'give', 'separ', 'pr

The real processing of all the abstract, **takes time!**

In [24]:
processed_docs = df['abstract'].map(preprocess)

In [25]:
processed_docs[1] #words processed in Economics, Econometrics and Finance

['graphic',
 'model',
 'probabl',
 'distribut',
 'associ',
 'graph',
 'follow',
 'properti',
 'nod',
 'graph',
 'repres',
 'variabl',
 'model',
 'separ',
 'graph',
 'impli',
 'condit',
 'independ',
 'variabl',
 'give',
 'separ',
 'probabl',
 'distribut',
 'factor',
 'accord',
 'graph',
 'graphic',
 'model',
 'use',
 'research',
 'support',
 'effici',
 'comput',
 'model',
 'variabl',
 'provid',
 'visual',
 'complex',
 'model']

In [26]:
dictionary = gensim.corpora.Dictionary(processed_docs) 
#Dictionary encapsulates the mapping between normalized words and their integer ids.

In [27]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break

0 academ
1 accept
2 acknowledg
3 activ
4 actuat
5 adapt
6 add
7 addit
8 address
9 administr
10 affair
11 ahm
12 aim
13 allow
14 analyt
15 anim
16 app
17 applic
18 approach
19 area
20 artifact
21 aspect
22 assist
23 attende
24 attract
25 author
26 autonom
27 autonomi
28 base
29 basic
30 benefit


In [28]:
dictionary.filter_extremes(no_below=15, no_above=0.5)
# How should we limit? I kept all instead of keep_n=100000

In [29]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# for each subject, create a dictionary reporting words and how many times those words appear

In [30]:
bow_corpus[1] #（id in dict, how many times of appearance

[(105, 1),
 (111, 1),
 (137, 1),
 (174, 5),
 (229, 1),
 (241, 1),
 (243, 1),
 (275, 1),
 (306, 1),
 (307, 1),
 (308, 1),
 (309, 1),
 (310, 1),
 (311, 2),
 (312, 1),
 (313, 1),
 (314, 4),
 (315, 2),
 (316, 1),
 (317, 1),
 (318, 2),
 (319, 1),
 (320, 2),
 (321, 1),
 (322, 3),
 (323, 1)]

In [31]:
# bag of words example in Economics, Econometrics and Finance
bow_doc_1 = bow_corpus[1]

for i in range(len(bow_doc_1)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1[i][0], 
                                                     dictionary[bow_doc_1[i][0]], 
                                                     bow_doc_1[i][1]))

Word 105 ("follow") appears 1 time.
Word 111 ("give") appears 1 time.
Word 137 ("independ") appears 1 time.
Word 174 ("model") appears 5 time.
Word 229 ("provid") appears 1 time.
Word 241 ("repres") appears 1 time.
Word 243 ("research") appears 1 time.
Word 275 ("support") appears 1 time.
Word 306 ("accord") appears 1 time.
Word 307 ("associ") appears 1 time.
Word 308 ("complex") appears 1 time.
Word 309 ("comput") appears 1 time.
Word 310 ("condit") appears 1 time.
Word 311 ("distribut") appears 2 time.
Word 312 ("effici") appears 1 time.
Word 313 ("factor") appears 1 time.
Word 314 ("graph") appears 4 time.
Word 315 ("graphic") appears 2 time.
Word 316 ("impli") appears 1 time.
Word 317 ("nod") appears 1 time.
Word 318 ("probabl") appears 2 time.
Word 319 ("properti") appears 1 time.
Word 320 ("separ") appears 2 time.
Word 321 ("use") appears 1 time.
Word 322 ("variabl") appears 3 time.
Word 323 ("visual") appears 1 time.


TF-IDF method

In [32]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [33]:
corpus_tfidf = tfidf[bow_corpus]

In [34]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.031218741596172533),
 (1, 0.04982103983796237),
 (2, 0.03811423887672271),
 (3, 0.029873541894316645),
 (4, 0.03962795262891248),
 (5, 0.02123700186770666),
 (6, 0.02582993724132953),
 (7, 0.012293122745157729),
 (8, 0.06231939112626948),
 (9, 0.04483022785047973),
 (10, 0.04469120954853038),
 (11, 0.01758680692215315),
 (12, 0.018419065305608443),
 (13, 0.026064491033766783),
 (14, 0.02007487533638507),
 (15, 0.05080873121934377),
 (16, 0.016249582120108512),
 (17, 0.014178756559191681),
 (18, 0.03312595636010068),
 (19, 0.040543055299211914),
 (20, 0.023701054678215133),
 (21, 0.02462693745959774),
 (22, 0.049090113534634675),
 (23, 0.027861278949898927),
 (24, 0.050441322306727814),
 (25, 0.034400911418692115),
 (26, 0.04036349675488041),
 (27, 0.009368508163015862),
 (28, 0.05370917118269671),
 (29, 0.0409823556043125),
 (30, 0.09845747366513535),
 (31, 0.018103344268861492),
 (32, 0.06811812764524715),
 (33, 0.037227544001717496),
 (34, 0.04483133973949594),
 (35, 0.0962089

Running LDA using Bag of Words

In [35]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [36]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.011*"increas" + 0.009*"concentr" + 0.009*"effect" + 0.008*"plant" + 0.008*"level" + 0.008*"differ" + 0.007*"product" + 0.007*"signific" + 0.007*"diet" + 0.007*"speci"
Topic: 1 
Words: 0.040*"cell" + 0.019*"express" + 0.014*"activ" + 0.011*"induc" + 0.010*"effect" + 0.008*"level" + 0.008*"increas" + 0.008*"cancer" + 0.008*"regul" + 0.008*"protein"
Topic: 2 
Words: 0.024*"protein" + 0.014*"activ" + 0.011*"bind" + 0.009*"structur" + 0.008*"acid" + 0.007*"compound" + 0.007*"interact" + 0.007*"complex" + 0.006*"membran" + 0.006*"function"
Topic: 3 
Words: 0.015*"health" + 0.008*"particip" + 0.008*"children" + 0.007*"care" + 0.007*"research" + 0.006*"effect" + 0.006*"intervent" + 0.006*"relat" + 0.005*"provid" + 0.005*"data"
Topic: 4 
Words: 0.015*"model" + 0.012*"method" + 0.011*"imag" + 0.009*"base" + 0.009*"data" + 0.007*"propos" + 0.007*"result" + 0.007*"perform" + 0.006*"approach" + 0.006*"measur"
Topic: 5 
Words: 0.008*"surfac" + 0.008*"high" + 0.007*"properti" + 0.0

Running LDA using TF-IDF

In [37]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [38]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"speci" + 0.006*"gene" + 0.004*"genom" + 0.004*"sequenc" + 0.004*"genet" + 0.003*"mutat" + 0.003*"cell" + 0.003*"plant" + 0.002*"protein" + 0.002*"popul"
Topic: 1 Word: 0.004*"cognit" + 0.004*"task" + 0.004*"research" + 0.003*"particip" + 0.003*"memori" + 0.003*"learn" + 0.003*"train" + 0.003*"placebo" + 0.003*"social" + 0.003*"health"
Topic: 2 Word: 0.004*"surfac" + 0.004*"structur" + 0.004*"electron" + 0.003*"materi" + 0.003*"film" + 0.003*"method" + 0.003*"energi" + 0.003*"layer" + 0.003*"suicid" + 0.003*"graphen"
Topic: 3 Word: 0.014*"asthma" + 0.009*"alcohol" + 0.009*"macrophag" + 0.008*"platelet" + 0.008*"fibrosi" + 0.007*"arthriti" + 0.007*"drink" + 0.007*"cytokin" + 0.007*"inflamm" + 0.007*"sepsi"
Topic: 4 Word: 0.006*"optic" + 0.006*"propos" + 0.005*"power" + 0.004*"mode" + 0.004*"imag" + 0.004*"laser" + 0.004*"method" + 0.004*"fiber" + 0.004*"frequenc" + 0.004*"sensor"
Topic: 5 Word: 0.006*"acid" + 0.006*"activ" + 0.006*"compound" + 0.005*"protein" + 0.00

Classify sample document using LDA Bag of Words model

In [39]:
processed_docs[1]

['graphic',
 'model',
 'probabl',
 'distribut',
 'associ',
 'graph',
 'follow',
 'properti',
 'nod',
 'graph',
 'repres',
 'variabl',
 'model',
 'separ',
 'graph',
 'impli',
 'condit',
 'independ',
 'variabl',
 'give',
 'separ',
 'probabl',
 'distribut',
 'factor',
 'accord',
 'graph',
 'graphic',
 'model',
 'use',
 'research',
 'support',
 'effici',
 'comput',
 'model',
 'variabl',
 'provid',
 'visual',
 'complex',
 'model']

In [40]:
for index, score in sorted(lda_model[bow_corpus[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.977489709854126	 
Topic: 0.015*"model" + 0.012*"method" + 0.011*"imag" + 0.009*"base" + 0.009*"data" + 0.007*"propos" + 0.007*"result" + 0.007*"perform" + 0.006*"approach" + 0.006*"measur"


In [41]:
for index, score in sorted(lda_model_tfidf[bow_corpus[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.4956255853176117	 
Topic: 0.004*"cognit" + 0.004*"task" + 0.004*"research" + 0.003*"particip" + 0.003*"memori" + 0.003*"learn" + 0.003*"train" + 0.003*"placebo" + 0.003*"social" + 0.003*"health"

Score: 0.4843655526638031	 
Topic: 0.004*"surfac" + 0.004*"structur" + 0.004*"electron" + 0.003*"materi" + 0.003*"film" + 0.003*"method" + 0.003*"energi" + 0.003*"layer" + 0.003*"suicid" + 0.003*"graphen"


Try to estimate for subjects

In [42]:
processed_subject = tm_all['abstract'].map(preprocess)

In [43]:
processed_subject[11] #words processed in Economics, Econometrics and Finance

['consid',
 'problem',
 'estim',
 'local',
 'sensor',
 'paramet',
 'local',
 'paramet',
 'sensor',
 'observ',
 'relat',
 'linear',
 'stochast',
 'model',
 'studi',
 'gaussian',
 'product',
 'algorithm',
 'wireless',
 'network',
 'gspawn',
 'procedur',
 'compar',
 'popular',
 'diffus',
 'strategi',
 'perform',
 'network',
 'paramet',
 'estim',
 'communic',
 'cost',
 'sensor',
 'increas',
 'increas',
 'network',
 'densiti',
 'gspawn',
 'allow',
 'sensor',
 'broadcast',
 'messag',
 'size',
 'depend',
 'network',
 'size',
 'densiti',
 'make',
 'suitabl',
 'applic',
 'wireless',
 'sensor',
 'network',
 'gspawn',
 'converg',
 'mean',
 'mean',
 'squar',
 'stabil',
 'technic',
 'suffici',
 'condit',
 'applic',
 'gspawn',
 'network',
 'local',
 'problem',
 'line',
 'sight',
 'environ',
 'numer',
 'result',
 'suggest',
 'gspawn',
 'converg',
 'faster',
 'general',
 'diffus',
 'method',
 'lower',
 'communic',
 'cost',
 'sensor',
 'compar',
 'root',
 'mean',
 'squar',
 'error',
 'express',
 'clone

In [54]:
dictionary_sub = gensim.corpora.Dictionary(processed_subject) 
dictionary_sub.filter_extremes(no_below=15, no_above=0.5)
# limit? 
bow_corpus_sub = [dictionary_sub.doc2bow(doc) for doc in processed_subject]


In [55]:
for index, score in sorted(lda_model_tfidf[bow_corpus[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5951128602027893	 
Topic: 0.004*"surfac" + 0.004*"structur" + 0.004*"electron" + 0.003*"materi" + 0.003*"film" + 0.003*"method" + 0.003*"energi" + 0.003*"layer" + 0.003*"suicid" + 0.003*"graphen"

Score: 0.38198068737983704	 
Topic: 0.004*"cognit" + 0.004*"task" + 0.004*"research" + 0.003*"particip" + 0.003*"memori" + 0.003*"learn" + 0.003*"train" + 0.003*"placebo" + 0.003*"social" + 0.003*"health"


In [56]:
for index, score in sorted(lda_model[bow_corpus_sub[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.10000000149011612	 
Topic: 0.011*"increas" + 0.009*"concentr" + 0.009*"effect" + 0.008*"plant" + 0.008*"level" + 0.008*"differ" + 0.007*"product" + 0.007*"signific" + 0.007*"diet" + 0.007*"speci"

Score: 0.10000000149011612	 
Topic: 0.040*"cell" + 0.019*"express" + 0.014*"activ" + 0.011*"induc" + 0.010*"effect" + 0.008*"level" + 0.008*"increas" + 0.008*"cancer" + 0.008*"regul" + 0.008*"protein"

Score: 0.10000000149011612	 
Topic: 0.024*"protein" + 0.014*"activ" + 0.011*"bind" + 0.009*"structur" + 0.008*"acid" + 0.007*"compound" + 0.007*"interact" + 0.007*"complex" + 0.006*"membran" + 0.006*"function"

Score: 0.10000000149011612	 
Topic: 0.015*"health" + 0.008*"particip" + 0.008*"children" + 0.007*"care" + 0.007*"research" + 0.006*"effect" + 0.006*"intervent" + 0.006*"relat" + 0.005*"provid" + 0.005*"data"

Score: 0.10000000149011612	 
Topic: 0.015*"model" + 0.012*"method" + 0.011*"imag" + 0.009*"base" + 0.009*"data" + 0.007*"propos" + 0.007*"result" + 0.007*"perform" + 0.006