In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import math

# Create dataframe that matches paper abstract with subjects

In [3]:
# Collect the id and abstract of every English-language record that carries
# all of the fields listed in REQUIRED_FIELDS.
paper_id = []
abstract = []

AMINER_PATH = 'aminer_2014.txt'
REQUIRED_FIELDS = ('year', 'keywords', 'abstract', 'lang', 'references', 'issn')

# `with` closes the handle even when a line fails to parse as JSON.
with open(AMINER_PATH, 'r', encoding='utf8') as f:
    f.readline()  # the first line of the file is skipped, not parsed
    for i, line in enumerate(f):
        # Progress report every 250k lines. The percentage is computed against a
        # hard-coded total of 1,000,000 lines.
        # (The original printed an undefined name `file` here -> NameError.)
        if (i+2) % 250000 == 0:
            print('file '+AMINER_PATH+': ',round((i+2)/1000000*100,1),"%")
        json_line = json.loads(line)
        has_all_fields = all(field in json_line for field in REQUIRED_FIELDS)
        if has_all_fields and json_line['lang'] == 'en':
            ## store paper info, later use to get the subject of the paper
            paper_id.append(json_line['id'])
            abstract.append(json_line['abstract'])

In [4]:
# Pair each collected paper id with its abstract, one row per paper.
df = pd.DataFrame({'id': paper_id, 'abstract': abstract})
# Preview only: the re-indexed frame is displayed but not assigned, so `df`
# keeps `id` as an ordinary column.
df.set_index('id')

Unnamed: 0_level_0,abstract
id,Unnamed: 1_level_1
53e997a2b7602d9701f74cf7,The nursing care of a patient following subara...
53e997a6b7602d9701f7c67f,The authors wish to thank G. W. Beakley and F....
53e997aab7602d9701f827a4,\n Almost all problems known to theoretical ec...
53e997aeb7602d9701f8af9c,Pain management in emergency departments (EDs)...
53e997b5b7602d9701f97a9d,Provides an abstract for each of the two keyno...
53e997bab7602d9701fa1ddc,Howard drifted back into consciousness. For a ...
53e997bab7602d9701fa3207,"Last week, Nature painted a pessimistic pictur..."
53e997c6b7602d9701fb6228,In the first article in the series on risk man...
53e997c6b7602d9701fb7afb,This introduction to the special section on Re...
53e997c6b7602d9701fb8e1b,The notion of a “negative-result” measurement ...


In [5]:
df.head()

Unnamed: 0,id,abstract
0,53e997a2b7602d9701f74cf7,The nursing care of a patient following subara...
1,53e997a6b7602d9701f7c67f,The authors wish to thank G. W. Beakley and F....
2,53e997aab7602d9701f827a4,\n Almost all problems known to theoretical ec...
3,53e997aeb7602d9701f8af9c,Pain management in emergency departments (EDs)...
4,53e997b5b7602d9701f97a9d,Provides an abstract for each of the two keyno...


In [6]:
# Load the paper -> subfield lookup table, using the `id` column as the index.
subject = pd.read_csv('paper_subject_match_subfield.csv',index_col = 'id')

In [7]:
subject.head()

Unnamed: 0_level_0,subfield,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1
53e99784b7602d9701f3e13e,1306,2011
53e99784b7602d9701f3e4f2,1307,2002
53e9978db7602d9701f4f415,1300,2005
53e99792b7602d9701f56a86,2738,2004
53e99792b7602d9701f5af0e,1502,1993


In [8]:
# Attach subfield/year to each abstract by paper id. `id` is a column in `df`
# and the index of `subject`. No `how=` is given, so pandas' default join type
# applies (inner: papers without a match in `subject` are dropped).
tm = pd.merge(df, subject, on = ['id'])

In [9]:
tm.isnull().sum()

id          0
abstract    0
subfield    0
year        0
dtype: int64

In [10]:
len(tm)

234253

In [11]:
# Drop any row with a missing value. The null counts displayed above are all
# zero, so this is a safeguard rather than an actual filter for this data.
tm = tm.dropna()

In [12]:
# Keep only the text (`abstract`) and its label (`subfield`).
tm = tm.drop(columns = ['id', 'year'])

In [13]:
tm.head()

Unnamed: 0,abstract,subfield
0,The nursing care of a patient following subara...,2732
1,The authors wish to thank G. W. Beakley and F....,2736
2,\n Almost all problems known to theoretical ec...,3304
3,Pain management in emergency departments (EDs)...,2907
4,Provides an abstract for each of the two keyno...,2730


In [14]:
# Store subfield codes as 64-bit integers. `astype` converts the whole column
# in one vectorised step instead of calling np.int64 once per row via `apply`.
tm['subfield'] = tm['subfield'].astype(np.int64)

In [15]:
len(tm.subfield.unique())

288

In [16]:
tm.head()

Unnamed: 0,abstract,subfield
0,The nursing care of a patient following subara...,2732
1,The authors wish to thank G. W. Beakley and F....,2736
2,\n Almost all problems known to theoretical ec...,3304
3,Pain management in emergency departments (EDs)...,2907
4,Provides an abstract for each of the two keyno...,2730


In [17]:
# Collapse the frame into {subfield code: [abstract, abstract, ...]}.
# NOTE: this rebinds `df` from a DataFrame to a plain dict; the cells below
# rely on that (they call df.keys() and index df[<subfield code>]).
df = tm.groupby('subfield')['abstract'].apply(list).to_dict()

# Build Topic models for each subject

**Processing the abstract:**
- Tokenization: split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Short words are removed: only words longer than 3 characters are kept.
- All stopwords are removed.
- Lemmatization: words in third person are changed to first person, and verbs in past and future tenses are changed to present.
- Stemming: words are reduced to their root form.

In [22]:
#import sys
#!{sys.executable} -m pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2015)

In [23]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Winnie/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [24]:
def lemmatize_stemming(text):
    """Return the stem of the verb-lemma of a single token.

    The token is lemmatized with WordNet as a verb (pos='v'; the lemmatizer's
    default would treat it as a noun) and the lemma is then stemmed with the
    module-level `stemmer`.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn raw text into a list of normalised tokens.

    The text is tokenised with gensim's `simple_preprocess`. Tokens that are
    gensim stopwords, or that are 3 characters or shorter, are discarded. Each
    remaining token is passed through `lemmatize_stemming`. Token order is
    preserved.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [25]:
# English Snowball stemmer. lemmatize_stemming() reads this module-level name at
# call time, so this cell must have run before preprocess() is called.
stemmer = SnowballStemmer('english')

In [28]:
# Preprocess every abstract, keeping the grouping by subfield:
# processed_docs[subfield code] -> list of token lists, one per abstract.
processed_docs = {}
for subfield_code, abstracts in df.items():
    # A subfield with no abstracts gets no entry at all.
    if abstracts:
        processed_docs[subfield_code] = [preprocess(text) for text in abstracts]

In [26]:
## example using 1000
# (Re)build only the entry for subfield 1000. The dictionary is deliberately NOT
# reset here: when the notebook is run top to bottom, resetting it would discard
# every other subfield already preprocessed, and the later loop over df.keys()
# would fail with KeyError on processed_docs[i].
try:
    processed_docs
except NameError:
    # Allows this example cell to be run on its own, before the full pass.
    processed_docs = {}
processed_docs[1000] = [preprocess(text) for text in df[1000]]

In [30]:
processed_docs[1000][1]

['articl',
 'seri',
 'risk',
 'manag',
 'briefli',
 'discuss',
 'frame',
 'relay',
 'network',
 'versus',
 'leas',
 'line',
 'network',
 'manag',
 'life',
 'cycl',
 'risk',
 'manag',
 'program',
 'coordin',
 'denial',
 'hyphen',
 'hyphen',
 'servic',
 'type',
 'threat',
 'attack',
 'network',
 'copyright',
 'john',
 'wiley',
 'son']

In [31]:
#function for Running LDA using TF-IDF
#i is subject index, text_str is sentence/abstract
from gensim import corpora, models
def LDA_TF_sub(i, text_str="", num_topics=10, passes=2, workers=4):
    """Train an LDA topic model on the TF-IDF-weighted abstracts of one subject.

    Parameters
    ----------
    i :
        Key into the module-level `processed_docs` dict (a subject/subfield code).
    text_str : str, optional
        If non-empty, this text is preprocessed and scored against the trained
        model, and each matching topic is printed with its score (highest
        first). An empty string or None skips this step.
    num_topics, passes, workers : int, optional
        Forwarded to `gensim.models.LdaMulticore`. The defaults (10, 2, 4) are
        the values that were previously hard-coded.

    Returns
    -------
    The trained `LdaMulticore` model.

    Raises
    ------
    KeyError
        If `i` is not a key of `processed_docs`.
    """
    docs = processed_docs[i]
    dictionary_i = gensim.corpora.Dictionary(docs)
    bow_corpus_i = [dictionary_i.doc2bow(doc) for doc in docs]
    tfidf_i = models.TfidfModel(bow_corpus_i)
    corpus_tfidf_i = tfidf_i[bow_corpus_i]
    lda_model_tfidf = gensim.models.LdaMulticore(
        corpus_tfidf_i,
        num_topics=num_topics,
        id2word=dictionary_i,
        passes=passes,
        workers=workers,
    )
    if text_str:
        bow_vector = dictionary_i.doc2bow(preprocess(text_str))
        # NOTE(review): the query is scored from raw bag-of-words counts although
        # the model was trained on TF-IDF weights -- confirm this is intended.
        for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
            print("\nScore: {}\n Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
    return lda_model_tfidf

In [71]:
# For every subfield, train its topic model and collect the distinct words that
# appear in any topic description. `final` ends up with one word list per
# subfield, in df.keys() order.
final = []
for subfield_code in df.keys():
    topic_words = []
    # print_topics(-1) yields (topic id, description) pairs; the words are the
    # double-quoted substrings of each description.
    for _topic_id, description in LDA_TF_sub(subfield_code).print_topics(-1):
        topic_words.extend(re.findall('"([^"]*)"', description))
    # De-duplicate while keeping first-seen order. The original used
    # list(set(...)), whose order changes from run to run, and reused the outer
    # loop variable `i` in the inner loops.
    final.append(list(dict.fromkeys(topic_words)))

  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


In [78]:
# One row per subfield; each cell of `words` holds that subfield's word list.
# Built from a column dict rather than np.array(final): the lists generally
# differ in length, and NumPy refuses (or warns about) implicit ragged arrays.
# If they all happened to share a length, np.array would produce a 2-D array
# that does not fit a single 'words' column.
df_out = pd.DataFrame({'words': final})

In [81]:
# Label each row with its subfield code. `final` was appended to while iterating
# df.keys(), so row order and key order line up position by position.
df_out['sub'] = df.keys()

In [82]:
# Save the per-subfield word lists; index=False leaves the row index out of the file.
df_out.to_csv('words.csv', index=False)

In [98]:
# Show the topic-word list stored for subfield 1400.
words_for_1400 = df_out.loc[df_out['sub'] == 1400, 'words']
for word_list in words_for_1400:
    print(word_list)

['guarante', 'china', 'decis', 'grade', 'univers', 'inequ', 'applic', 'time', 'creat', 'urban', 'compar', 'literatur', 'gender', 'admiss', 'attain', 'texa', 'educ', 'effect', 'student', 'area', 'level']
