In [3]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import math

**Create dataframe that matches paper abstract with subjects **

In [4]:
paper_id = []
abstract = []

f = open('aminer_2015.txt','r',encoding = 'utf8')
f.readline()
for i, line in enumerate(f):
        if (i+2) % 250000 == 0:
            print('file '+file+': ',round((i+2)/1000000*100,1),"%")
        json_line = json.loads(line)
        if 'year' in json_line and 'keywords' in json_line and \
        'abstract' in json_line and 'lang' in json_line and \
        'references' in json_line and 'issn' in json_line:
            
            if json_line['lang'] == 'en' :
                
                ## store paper info, later use to get the subject of the paper
                paper_id.append(json_line['id'])
                abstract.append(json_line['abstract'])

                
f.close()

In [5]:
df= pd.DataFrame()
df['id'] = paper_id
df['abstract'] = abstract
df.set_index('id')

Unnamed: 0_level_0,abstract
id,Unnamed: 1_level_1
53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...
53e99796b7602d9701f613ec,A graphical model is a probability distributio...
53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...
53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...
53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...
53e997a2b7602d9701f75a62,Contents.
53e997a2b7602d9701f75a78,cover
53e997a6b7602d9701f7c67e,The above comments on our previous paper are t...
53e997b2b7602d9701f91202,Machine learning's focus on ill-defined proble...
53e997b5b7602d9701f969d2,\n Die numerische Mathematik (kurz: Numerik) i...


In [6]:
df.head()

Unnamed: 0,id,abstract
0,53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...
1,53e99796b7602d9701f613ec,A graphical model is a probability distributio...
2,53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...
3,53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...
4,53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...


In [7]:
subject = pd.read_csv('paper_subject_match.csv',index_col = 'id')

In [8]:
subject.head()

Unnamed: 0_level_0,paper_subject
id,Unnamed: 1_level_1
53e99784b7602d9701f3e13e,13.0
53e99784b7602d9701f3e4f2,13.0
53e9978db7602d9701f4f415,13.0
53e99792b7602d9701f56a86,27.0
53e99792b7602d9701f5b087,


In [9]:
tm = pd.merge(df, subject, on = ['id'])

In [10]:
tm.head()

Unnamed: 0,id,abstract,paper_subject
0,53e99796b7602d9701f5d979,The pHealth 2015 Conference is the 12th in a s...,
1,53e99796b7602d9701f613ec,A graphical model is a probability distributio...,
2,53e9979bb7602d9701f668c8,An inter-governmental body is encouraging the ...,10.0
3,53e997a2b7602d9701f74d6a,The most important results from the EU-sponsor...,
4,53e997a2b7602d9701f75614,The Nursing and Midwifery Council is seeking t...,29.0


In [11]:
tm.isnull().sum()

id                  0
abstract            0
paper_subject    6180
dtype: int64

In [12]:
len(tm)

95421

In [13]:
tm = tm.dropna()

In [14]:
tm = tm.drop(columns = ['id'])

In [15]:
tm['paper_subject'] = tm['paper_subject'].apply(np.int64)

**Concatenate all abstracts having the same subject(group by subject)**

In [16]:
tm_all = tm.groupby('paper_subject').agg({'abstract':'-'.join})

In [17]:
tm_all.reset_index(level=0, inplace=True)

In [18]:
tm_all.head()

Unnamed: 0,paper_subject,abstract
0,10,An inter-governmental body is encouraging the ...
1,11,The total variation (TV) regularization method...
2,12,Technologies for automated detection of neonat...
3,13,Skeletal stem cells (SSCs) reside in the postn...
4,14,Despite the extensive application of Monte Car...


**Processing the abstract:**
- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation
- Words that have fewer than 3 characters are removed
- All stopwords are removed
- lemmatized — words in third person to first person, verbs in past and future tenses to present
- Stemmed — words are reduced to their root form

In [19]:
#import sys
#!{sys.executable} -m pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2015)

In [20]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/yihuan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v')) #lemmatize as verb, default is noun

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

An example of processing the words in the abstract:

In [22]:
stemmer = SnowballStemmer('english') #Create a new instance of a language specific subclass
doc_sample = tm_all[tm_all['paper_subject'] == 10].values[0][1]
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words[:10])
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample[:100]))


['An', 'inter-governmental', 'body', 'is', 'encouraging', 'the', 'replacement', 'of', 'currency', 'with']


 tokenized and lemmatized document: 
['inter', 'government', 'bodi', 'encourag', 'replac', 'currenc', 'object', 'discoura']


The real processing of all the abstract, **takes time!**

In [23]:
processed_docs = tm_all['abstract'].map(preprocess)

In [24]:
processed_docs[11] #words processed in Economics, Econometrics and Finance

['consid',
 'problem',
 'estim',
 'local',
 'sensor',
 'paramet',
 'local',
 'paramet',
 'sensor',
 'observ',
 'relat',
 'linear',
 'stochast',
 'model',
 'studi',
 'gaussian',
 'product',
 'algorithm',
 'wireless',
 'network',
 'gspawn',
 'procedur',
 'compar',
 'popular',
 'diffus',
 'strategi',
 'perform',
 'network',
 'paramet',
 'estim',
 'communic',
 'cost',
 'sensor',
 'increas',
 'increas',
 'network',
 'densiti',
 'gspawn',
 'allow',
 'sensor',
 'broadcast',
 'messag',
 'size',
 'depend',
 'network',
 'size',
 'densiti',
 'make',
 'suitabl',
 'applic',
 'wireless',
 'sensor',
 'network',
 'gspawn',
 'converg',
 'mean',
 'mean',
 'squar',
 'stabil',
 'technic',
 'suffici',
 'condit',
 'applic',
 'gspawn',
 'network',
 'local',
 'problem',
 'line',
 'sight',
 'environ',
 'numer',
 'result',
 'suggest',
 'gspawn',
 'converg',
 'faster',
 'general',
 'diffus',
 'method',
 'lower',
 'communic',
 'cost',
 'sensor',
 'compar',
 'root',
 'mean',
 'squar',
 'error',
 'express',
 'clone

In [41]:
dictionary = gensim.corpora.Dictionary(processed_docs) 
#Dictionary encapsulates the mapping between normalized words and their integer ids.

In [42]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break

0 aaet
1 aakr
2 aapoaii
3 ababa
4 abandon
5 abas
6 abcc
7 abdomen
8 abdomin
9 abduct
10 aberr
11 abil
12 abiogen
13 abiot
14 abl
15 ablat
16 abmd
17 abnorm
18 abolish
19 abort
20 abound
21 abras
22 abrog
23 abrupt
24 abscis
25 absenc
26 absent
27 absentia
28 absolut
29 absorb
30 absorpt


In [59]:
dictionary.filter_extremes(no_above=0.5)
# How should we limit? no_below = 15 returns empty list

In [46]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# for each subject, create a dictionary reporting words and how many times those words appear

In [47]:
bow_corpus[11] #（id in dict, how many times of appearance

[(5, 2),
 (7, 11),
 (11, 1),
 (21, 1),
 (22, 1),
 (26, 11),
 (29, 3),
 (30, 3),
 (32, 5),
 (37, 1),
 (40, 1),
 (41, 2),
 (42, 2),
 (46, 2),
 (47, 3),
 (49, 1),
 (55, 4),
 (57, 4),
 (64, 4),
 (66, 1),
 (68, 6),
 (71, 1),
 (74, 8),
 (75, 1),
 (77, 2),
 (82, 4),
 (86, 29),
 (89, 5),
 (91, 2),
 (92, 2),
 (94, 2),
 (104, 1),
 (107, 3),
 (109, 2),
 (113, 6),
 (118, 17),
 (120, 4),
 (126, 1),
 (133, 2),
 (136, 1),
 (138, 3),
 (142, 1),
 (143, 3),
 (147, 5),
 (157, 4),
 (162, 1),
 (171, 7),
 (173, 7),
 (174, 2),
 (175, 8),
 (178, 18),
 (181, 5),
 (185, 1),
 (188, 1),
 (189, 1),
 (190, 1),
 (195, 1),
 (197, 2),
 (200, 1),
 (202, 2),
 (204, 4),
 (205, 17),
 (206, 1),
 (210, 1),
 (212, 1),
 (213, 1),
 (214, 1),
 (222, 6),
 (231, 5),
 (234, 1),
 (246, 2),
 (250, 3),
 (252, 14),
 (255, 1),
 (256, 1),
 (267, 1),
 (268, 4),
 (269, 2),
 (274, 4),
 (281, 2),
 (283, 2),
 (285, 7),
 (286, 4),
 (290, 3),
 (293, 10),
 (294, 2),
 (295, 1),
 (301, 2),
 (313, 1),
 (318, 1),
 (322, 1),
 (323, 118),
 (326, 9),


In [48]:
# bag of words example in Economics, Econometrics and Finance
bow_doc_11 = bow_corpus[11]

for i in range(len(bow_doc_11)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_11[i][0], 
                                                     dictionary[bow_doc_11[i][0]], 
                                                     bow_doc_11[i][1]))

Word 5 ("abiot") appears 2 time.
Word 7 ("abras") appears 11 time.
Word 11 ("acad") appears 1 time.
Word 21 ("acidif") appears 1 time.
Word 22 ("acidifi") appears 1 time.
Word 26 ("acryl") appears 11 time.
Word 29 ("actinomycet") appears 3 time.
Word 30 ("activat") appears 3 time.
Word 32 ("actomyosin") appears 5 time.
Word 37 ("adaptor") appears 1 time.
Word 40 ("adductor") appears 1 time.
Word 41 ("adenin") appears 2 time.
Word 42 ("adenosin") appears 2 time.
Word 46 ("adiabat") appears 2 time.
Word 47 ("adipocyt") appears 3 time.
Word 49 ("adrenerg") appears 1 time.
Word 55 ("aerodynam") appears 4 time.
Word 57 ("aesthet") appears 4 time.
Word 64 ("agaros") appears 4 time.
Word 66 ("agglutinin") appears 1 time.
Word 68 ("agil") appears 6 time.
Word 71 ("agreeabl") appears 1 time.
Word 74 ("airborn") appears 8 time.
Word 75 ("airflow") appears 1 time.
Word 77 ("akin") appears 2 time.
Word 82 ("alba") appears 4 time.
Word 86 ("algin") appears 29 time.
Word 89 ("alkyl") appears 5 time.

Word 7859 ("radii") appears 15 time.
Word 7862 ("radiofrequ") appears 1 time.
Word 7863 ("radiographi") appears 2 time.
Word 7867 ("radiometr") appears 1 time.
Word 7871 ("radix") appears 1 time.
Word 7873 ("rage") appears 4 time.
Word 7874 ("railway") appears 33 time.
Word 7875 ("rain") appears 12 time.
Word 7877 ("rainfal") appears 4 time.
Word 7879 ("ramp") appears 12 time.
Word 7890 ("raspberri") appears 8 time.
Word 7891 ("ratiometr") appears 2 time.
Word 7894 ("rayleigh") appears 17 time.
Word 7895 ("rcts") appears 3 time.
Word 7896 ("readabl") appears 1 time.
Word 7902 ("reassign") appears 2 time.
Word 7907 ("recalibr") appears 2 time.
Word 7908 ("recaptur") appears 10 time.
Word 7909 ("reced") appears 12 time.
Word 7911 ("recenc") appears 3 time.
Word 7912 ("recharg") appears 13 time.
Word 7913 ("recip") appears 1 time.
Word 7914 ("recircul") appears 1 time.
Word 7923 ("redeploy") appears 2 time.
Word 7924 ("redispers") appears 2 time.
Word 7926 ("reed") appears 6 time.
Word 79

TF-IDF method

In [49]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [50]:
corpus_tfidf = tfidf[bow_corpus]

In [51]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.00975798307718265),
 (1, 0.013837988417219092),
 (2, 0.004366080913502812),
 (3, 0.008586527853304297),
 (4, 0.003921777520635955),
 (5, 0.028299261492936236),
 (6, 0.0054856360488010165),
 (7, 0.007059748307353692),
 (8, 0.017173055706608594),
 (9, 0.00975798307718265),
 (10, 0.003529874153676846),
 (11, 0.0054856360488010165),
 (12, 0.0025726601357214758),
 (13, 0.014119496614707384),
 (14, 0.005724351902202865),
 (15, 0.007059748307353692),
 (16, 0.003529874153676846),
 (17, 0.021942544195204066),
 (18, 0.017173055706608594),
 (19, 0.0054856360488010165),
 (20, 0.003179304645931167),
 (21, 0.02289740760881146),
 (22, 0.005724351902202865),
 (23, 0.0054856360488010165),
 (24, 0.003179304645931167),
 (25, 0.03839945234160712),
 (26, 0.0025726601357214758),
 (27, 0.01746432365401125),
 (28, 0.007059748307353692),
 (29, 0.003921777520635955),
 (30, 0.004878991538591325),
 (31, 0.010971272097602033),
 (32, 0.01960888760317978),
 (33, 0.003529874153676846),
 (34, 0.0043660809135028

 (2464, 0.0388286156904453),
 (2465, 0.06034199653681118),
 (2466, 0.004878991538591325),
 (2467, 0.011531657014349243),
 (2468, 0.011765332561907866),
 (2469, 0.02225513252151817),
 (2470, 0.0054856360488010165),
 (2471, 0.004366080913502812),
 (2472, 0.004366080913502812),
 (2473, 0.003529874153676846),
 (2474, 0.012717218583724668),
 (2475, 0.0028621759511014326),
 (2476, 0.04941823815147584),
 (2477, 0.010290640542885903),
 (2478, 0.008732161827005624),
 (2479, 0.003179304645931167),
 (2480, 0.007717980407164428),
 (2481, 0.028299261492936236),
 (2482, 0.021830404567514065),
 (2483, 0.003529874153676846),
 (2484, 0.014636974615773974),
 (2485, 0.0054856360488010165),
 (2486, 0.014636974615773974),
 (2487, 0.008732161827005624),
 (2488, 0.004878991538591325),
 (2489, 0.014636974615773974),
 (2490, 0.00975798307718265),
 (2491, 0.014636974615773974),
 (2492, 0.004878991538591325),
 (2493, 0.007059748307353692),
 (2494, 0.003921777520635955),
 (2495, 0.0054856360488010165),
 (2496, 0.

Running LDA using Bag of Words

In [52]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=100, id2word=dictionary, passes=2, workers=2)

In [53]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.002*"caregiv" + 0.002*"thyroid" + 0.002*"atrial" + 0.002*"transfus" + 0.002*"aneurysm" + 0.002*"youth" + 0.001*"pylori" + 0.001*"allergen" + 0.001*"oocyt" + 0.001*"remiss"
Topic: 1 
Words: 0.010*"therapist" + 0.007*"concuss" + 0.006*"ptsd" + 0.005*"youth" + 0.005*"caregiv" + 0.005*"snack" + 0.005*"psychosi" + 0.005*"music" + 0.005*"amygdala" + 0.005*"esteem"
Topic: 2 
Words: 0.004*"periodont" + 0.002*"caregiv" + 0.002*"youth" + 0.002*"cow" + 0.001*"violenc" + 0.001*"radiograph" + 0.001*"atrial" + 0.001*"aneurysm" + 0.001*"synapt" + 0.001*"thyroid"
Topic: 3 
Words: 0.004*"tick" + 0.002*"thyroid" + 0.002*"larva" + 0.002*"autophagi" + 0.002*"cow" + 0.002*"atrial" + 0.002*"caregiv" + 0.002*"malaria" + 0.002*"synapt" + 0.001*"youth"
Topic: 4 
Words: 0.013*"fault" + 0.006*"turbin" + 0.005*"lignocellulos" + 0.004*"ieee" + 0.004*"rotor" + 0.003*"torqu" + 0.003*"capacitor" + 0.003*"xylos" + 0.003*"microalga" + 0.003*"substat"
Topic: 5 
Words: 0.003*"periodont" + 0.003*"caregi

Running LDA using TF-IDF

In [54]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary, passes=2, workers=4)

In [55]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.001*"hyperspectr" + 0.001*"radar" + 0.001*"modi" + 0.000*"backscatt" + 0.000*"multispectr" + 0.000*"lidar" + 0.000*"unmix" + 0.000*"albedo" + 0.000*"interferometr" + 0.000*"radiomet"
Topic: 1 Word: 0.001*"sprint" + 0.001*"squat" + 0.001*"soccer" + 0.000*"flexion" + 0.000*"kick" + 0.000*"trainer" + 0.000*"elit" + 0.000*"runner" + 0.000*"ankl" + 0.000*"footbal"
Topic: 2 Word: 0.000*"waveguid" + 0.000*"liposom" + 0.000*"genotox" + 0.000*"glucuronid" + 0.000*"morphin" + 0.000*"agnp" + 0.000*"hepatotox" + 0.000*"pharmacist" + 0.000*"excipi" + 0.000*"anhydras"
Topic: 3 Word: 0.001*"nanowir" + 0.001*"graphen" + 0.001*"dielectr" + 0.001*"fault" + 0.001*"slot" + 0.001*"wireless" + 0.001*"inductor" + 0.000*"mimo" + 0.000*"capacitor" + 0.000*"cmos"
Topic: 4 Word: 0.001*"crash" + 0.001*"youth" + 0.001*"violenc" + 0.000*"lesbian" + 0.000*"stigma" + 0.000*"helmet" + 0.000*"empathi" + 0.000*"gambl" + 0.000*"quit" + 0.000*"migrant"
Topic: 5 Word: 0.000*"interneuron" + 0.000*"synapt" +

Classify sample document using LDA Bag of Words model

In [56]:
processed_docs[11]

['consid',
 'problem',
 'estim',
 'local',
 'sensor',
 'paramet',
 'local',
 'paramet',
 'sensor',
 'observ',
 'relat',
 'linear',
 'stochast',
 'model',
 'studi',
 'gaussian',
 'product',
 'algorithm',
 'wireless',
 'network',
 'gspawn',
 'procedur',
 'compar',
 'popular',
 'diffus',
 'strategi',
 'perform',
 'network',
 'paramet',
 'estim',
 'communic',
 'cost',
 'sensor',
 'increas',
 'increas',
 'network',
 'densiti',
 'gspawn',
 'allow',
 'sensor',
 'broadcast',
 'messag',
 'size',
 'depend',
 'network',
 'size',
 'densiti',
 'make',
 'suitabl',
 'applic',
 'wireless',
 'sensor',
 'network',
 'gspawn',
 'converg',
 'mean',
 'mean',
 'squar',
 'stabil',
 'technic',
 'suffici',
 'condit',
 'applic',
 'gspawn',
 'network',
 'local',
 'problem',
 'line',
 'sight',
 'environ',
 'numer',
 'result',
 'suggest',
 'gspawn',
 'converg',
 'faster',
 'general',
 'diffus',
 'method',
 'lower',
 'communic',
 'cost',
 'sensor',
 'compar',
 'root',
 'mean',
 'squar',
 'error',
 'express',
 'clone

In [57]:
for index, score in sorted(lda_model[bow_corpus[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9993623495101929	 
Topic: 0.017*"graphen" + 0.011*"wireless" + 0.010*"dielectr" + 0.010*"fault" + 0.008*"waveguid" + 0.007*"capacitor" + 0.007*"transistor" + 0.006*"slot" + 0.006*"nanowir" + 0.006*"torqu"


In [58]:
for index, score in sorted(lda_model_tfidf[bow_corpus[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.508795440196991	 
Topic: 0.001*"nanowir" + 0.001*"graphen" + 0.001*"dielectr" + 0.001*"fault" + 0.001*"slot" + 0.001*"wireless" + 0.001*"inductor" + 0.000*"mimo" + 0.000*"capacitor" + 0.000*"cmos"

Score: 0.24771787226200104	 
Topic: 0.001*"graphen" + 0.001*"nanocomposit" + 0.001*"copolym" + 0.001*"lignin" + 0.001*"nanosheet" + 0.001*"synapt" + 0.001*"cnts" + 0.001*"nanorod" + 0.001*"zeolit" + 0.001*"nanowir"

Score: 0.11542007327079773	 
Topic: 0.001*"mimo" + 0.001*"asymptot" + 0.001*"outag" + 0.001*"beamform" + 0.001*"synapt" + 0.001*"ofdm" + 0.001*"wireless" + 0.001*"secreci" + 0.001*"uplink" + 0.000*"downlink"

Score: 0.054043304175138474	 
Topic: 0.001*"tick" + 0.001*"fault" + 0.001*"turbin" + 0.001*"ieee" + 0.001*"lignocellulos" + 0.000*"rotor" + 0.000*"xylos" + 0.000*"phasor" + 0.000*"substat" + 0.000*"bioga"

Score: 0.03317471593618393	 
Topic: 0.001*"waveguid" + 0.001*"grate" + 0.000*"interferomet" + 0.000*"coupler" + 0.000*"ofdm" + 0.000*"terahertz" + 0.000*"mem" + 