In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import math

# Create a dataframe that matches paper abstracts with subjects

In [2]:
# Collect the id and abstract of every English paper whose record carries all
# of the required metadata fields.
paper_id = []
abstract = []

source_path = 'aminer_2014.txt'
required_fields = ('year', 'keywords', 'abstract', 'lang', 'references', 'issn')

# `with` closes the file even when a line fails to parse.
with open(source_path, 'r', encoding = 'utf8') as f:
    f.readline()  # discard the first line (presumably a header -- TODO confirm)
    for i, line in enumerate(f):
        if (i+2) % 250000 == 0:
            # Progress is reported as a share of 1,000,000 lines. The original
            # message referenced an undefined name `file` and raised NameError.
            print('file '+source_path+': ',round((i+2)/1000000*100,1),"%")
        json_line = json.loads(line)
        if all(field in json_line for field in required_fields) \
                and json_line['lang'] == 'en':
            ## store paper info, later use to get the subject of the paper
            paper_id.append(json_line['id'])
            abstract.append(json_line['abstract'])

In [3]:
# Assemble the collected ids and abstracts into a two-column frame.
df = pd.DataFrame({'id': paper_id, 'abstract': abstract})
# Display only: the result is not assigned, so `df` keeps 'id' as a column.
df.set_index('id')

Unnamed: 0_level_0,abstract
id,Unnamed: 1_level_1
53e997a2b7602d9701f74cf7,The nursing care of a patient following subara...
53e997a6b7602d9701f7c67f,The authors wish to thank G. W. Beakley and F....
53e997aab7602d9701f827a4,\n Almost all problems known to theoretical ec...
53e997aeb7602d9701f8af9c,Pain management in emergency departments (EDs)...
53e997b5b7602d9701f97a9d,Provides an abstract for each of the two keyno...
53e997bab7602d9701fa1ddc,Howard drifted back into consciousness. For a ...
53e997bab7602d9701fa3207,"Last week, Nature painted a pessimistic pictur..."
53e997c6b7602d9701fb6228,In the first article in the series on risk man...
53e997c6b7602d9701fb7afb,This introduction to the special section on Re...
53e997c6b7602d9701fb8e1b,The notion of a “negative-result” measurement ...


In [4]:
# Preview the first rows; 'id' is still a regular column because the
# set_index() result in the previous cell was displayed, not assigned.
df.head()

Unnamed: 0,id,abstract
0,53e997a2b7602d9701f74cf7,The nursing care of a patient following subara...
1,53e997a6b7602d9701f7c67f,The authors wish to thank G. W. Beakley and F....
2,53e997aab7602d9701f827a4,\n Almost all problems known to theoretical ec...
3,53e997aeb7602d9701f8af9c,Pain management in emergency departments (EDs)...
4,53e997b5b7602d9701f97a9d,Provides an abstract for each of the two keyno...


In [5]:
# Load the paper-to-subject mapping, using its 'id' column as the index.
subject = pd.read_csv('paper_subject_match.csv',index_col = 'id')

In [6]:
# Preview the first rows of the subject mapping.
subject.head()

Unnamed: 0_level_0,paper_subject
id,Unnamed: 1_level_1
53e99784b7602d9701f3e13e,13.0
53e99784b7602d9701f3e4f2,13.0
53e9978db7602d9701f4f415,13.0
53e99792b7602d9701f56a86,27.0
53e99792b7602d9701f5b087,


In [7]:
# Join abstracts to subject labels on the paper id ('id' is a column of `df`
# and the index of `subject`).
tm = df.merge(subject, on = ['id'])

In [8]:
# Count missing values per column of the merged frame.
tm.isnull().sum()

id                   0
abstract             0
paper_subject    12413
dtype: int64

In [9]:
# Number of rows in the merged frame.
len(tm)

234253

In [10]:
# Drop every row that contains a missing value (per the count above, only
# 'paper_subject' has any).
tm = tm.dropna()

In [11]:
# Remove the 'id' column from the merged frame.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because 'id' has already been removed.
tm = tm.drop(columns = ['id'], errors = 'ignore')

In [12]:
# Subject codes were read as floats (e.g. 13.0); store them as 64-bit integers.
tm['paper_subject'] = tm['paper_subject'].astype(np.int64)

In [13]:
# Number of distinct subject codes.
tm['paper_subject'].nunique(dropna=False)

27

In [14]:
# Preview the cleaned frame: one abstract and its integer subject code per row.
tm.head()

Unnamed: 0,abstract,paper_subject
2,\n Almost all problems known to theoretical ec...,33
3,Pain management in emergency departments (EDs)...,29
4,Provides an abstract for each of the two keyno...,27
5,Howard drifted back into consciousness. For a ...,31
6,"Last week, Nature painted a pessimistic pictur...",10


# Build Topic models for each subject

In [15]:
# One frame per subject code 10..36 (27 codes), so bysub[k] holds subject 10 + k.
bysub = [tm[tm['paper_subject'] == code] for code in range(10, 37)]

**Processing the abstract:**
- Tokenization: split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Words with 3 or fewer characters are removed (the code keeps only tokens with `len(token) > 3`).
- All stopwords are removed.
- Lemmatization: words in third person are changed to first person, and verbs in past and future tenses are changed to present.
- Stemming: words are reduced to their root form.

In [16]:
#import sys
#!{sys.executable} -m pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2015)

In [17]:
import nltk
# Download the 'wordnet' package; WordNetLemmatizer is used by lemmatize_stemming().
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/yihuan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the resulting lemma.

    Reads the module-level ``stemmer`` object, which this notebook creates in
    a later cell (``SnowballStemmer('english')``); that cell must run before
    the first call.
    """
    # pos='v' lemmatizes as a verb; the lemmatizer's default is noun.
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return its filtered, lemmatized-and-stemmed tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    when it is not a gensim stopword and is longer than three characters; each
    kept token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) > 3
    ]

In [19]:
# Example of a processed document.
# `stemmer` is the module-level object that lemmatize_stemming() reads.
stemmer = SnowballStemmer('english')
first_row = bysub[1].values[0]
doc_sample = first_row[0]  # column 0 of `tm` is the abstract text
words = doc_sample.split(' ')
print(words[:50])
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample[:300]))


['One', 'of', 'the', 'most', 'important', 'challenges', 'in', 'network', 'science', 'is', 'to', 'quantify', 'the', 'information', 'encoded', 'in', 'complex', 'network', 'structures.', 'Disentangling', 'randomness', 'from', 'organizational', 'principles', 'is', 'even', 'more', 'demanding', 'when', 'networks', 'have', 'a', 'multiplex', 'nature.', 'Multiplex', 'networks', 'are', 'multilayer', 'systems', 'of', '[Formula:', 'see', 'text]', 'nodes', 'that', 'can', 'be', 'linked', 'in', 'multiple']


 tokenized and lemmatized document: 
['import', 'challeng', 'network', 'scienc', 'quantifi', 'inform', 'encod', 'complex', 'network', 'structur', 'disentangl', 'random', 'organiz', 'principl', 'demand', 'network', 'multiplex', 'natur', 'multiplex', 'network', 'multilay', 'system', 'formula']


In [None]:
# Preprocess every abstract, keeping one Series of token lists per subject
# (same order as `bysub`).
processed_docs = [frame['abstract'].map(preprocess) for frame in bysub]

In [None]:
# Processed tokens of the first abstract in Arts and Humanities (index 2).
processed_docs[2].values[0]

In [None]:
#function for Running LDA using TF-IDF
#i is subject index, text_str is sentence/abstract
from gensim import corpora, models
def LDA_TF_sub(i,text_str = ""):
    """Train a 10-topic LDA model on the TF-IDF corpus of subject ``i``.

    Parameters
    ----------
    i : int
        Position of the subject in ``processed_docs``.
    text_str : str, optional
        Text to score against the freshly trained model. When non-empty, every
        topic the model returns for the text is printed, highest score first,
        together with its five top words.

    Returns
    -------
    The trained ``gensim.models.LdaMulticore`` model.
    """
    vocab = gensim.corpora.Dictionary(processed_docs[i])
    bow_corpus = [vocab.doc2bow(tokens) for tokens in processed_docs[i]]
    weighted_corpus = models.TfidfModel(bow_corpus)[bow_corpus]
    lda = gensim.models.LdaMulticore(weighted_corpus, num_topics=10, id2word=vocab, passes=2, workers=4)
    if len(text_str) > 0:
        query_bow = vocab.doc2bow(preprocess(text_str))
        ranked = sorted(lda[query_bow], key=lambda pair: -pair[1])
        for topic_id, topic_score in ranked:
            print("\nScore: {}\n Topic: {}".format(topic_score, lda.print_topic(topic_id, 5)))
    return lda


In [None]:
# Sample model for Arts and Humanities: train it and print every topic.
sample_model = LDA_TF_sub(2)
for topic_id, topic_words in sample_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_words))
        
    

In [None]:
# Score one sample Arts and Humanities abstract against its own subject model.
# The abstract is read straight from the 'abstract' column. Converting the
# whole row with np.array2string also put the brackets, the subject code and
# the element's repr (quotes, escaped characters) into the text being scored.
sub2abs = bysub[2]['abstract'].iloc[595]
LDA_TF_sub(2,text_str = sub2abs)

In [None]:
# Show row 595 of the Arts and Humanities frame (abstract and subject code)
# rendered as a single string.
np.array2string(bysub[2].values[595])

Sample 200 papers in Arts and Humanities and feed them into each subject's model.

In [None]:
# One gensim Dictionary per subject, in the same order as `processed_docs`.
dictionary = [gensim.corpora.Dictionary(docs) for docs in processed_docs]


In [None]:
# Inspect the dictionary built for the first subject.
dictionary[0]

In [None]:
from gensim import corpora, models
def LDA_TF_sub_models(i,text_str = ""):
    """Train and return a 10-topic LDA model for subject ``i``.

    Side effect: ``dictionary[i]`` is rebuilt from ``processed_docs[i]`` and
    stored back into the module-level ``dictionary`` list. ``text_str`` is
    accepted but not used.
    """
    vocab = gensim.corpora.Dictionary(processed_docs[i])
    dictionary[i] = vocab
    bow_corpus = [vocab.doc2bow(tokens) for tokens in processed_docs[i]]
    weighted_corpus = models.TfidfModel(bow_corpus)[bow_corpus]
    return gensim.models.LdaMulticore(weighted_corpus, num_topics=10, id2word=vocab, passes=2, workers=4)


In [None]:
# Train the model for subject index 2 on its own; the cell displays the
# returned model object.
LDA_TF_sub_models(2)

In [None]:
# Train one LDA model per subject (27 subjects); allmodels[k] pairs with dictionary[k].
allmodels = [LDA_TF_sub_models(i) for i in range(27)]

In [None]:
# Print every topic of the stored model at index 2.
for topic_id, topic_words in allmodels[2].print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_words))

In [None]:
def feed_models(i, text_str):
    """Return the top ``(topic_id, score)`` pair of model ``i`` for ``text_str``.

    The text is preprocessed, converted to a bag-of-words with
    ``dictionary[i]`` and passed to ``allmodels[i]``. Among equal scores the
    pair the model returned first wins.
    """
    vocab = dictionary[i]
    bow = vocab.doc2bow(preprocess(text_str))
    ranked = sorted(allmodels[i][bow], key=lambda pair: -pair[1])
    return ranked[0]

In [None]:
feed_models(2,sub2abs)[1] # top topic score of the sample art article under the art model (index 2)

In [None]:
feed_models(8,sub2abs)[1] # top topic score of the same art article under the computer science model (index 8)

In [None]:
sample = bysub[2]['abstract'].sample(n=200, random_state=1) # 200 abstracts drawn from Arts and Humanities with a fixed seed


In [None]:
# For each of the 27 subject models, count how many sampled abstracts have a
# highest topic score above 0.6.
result = {}
for x in range(27):
    top_scores = (feed_models(x, text_str=val)[1] for val in sample)
    result[x] = sum(1 for top in top_scores if top > 0.6)
  

In [None]:
# Tabulate the per-subject counts for plotting.
# NOTE: this rebinds `df`, replacing the id/abstract frame built at the top of
# the notebook.
df = pd.DataFrame(list(result.items()), columns = ["subject", "high score count"])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of the high-score count per subject model.
# Uses an explicit figure/axes pair and fixes the "sampes" typo in the title.
fig, ax = plt.subplots(figsize=(15,6))
ax.set_title("count for 200 samples from Humanity/Art topics")
ax = sns.barplot(x="subject", y="high score count", data=df, ax=ax)

In [None]:
# Draw n samples of 200 Arts and Humanities abstracts each.
# Every draw gets its own seed so the samples differ yet stay reproducible,
# and each one is stored in `samples`. The original loop left the list empty
# and redrew the identical random_state=1 sample on every pass.
samples = []
n = 100
for i in range(n):
    samples.append(bysub[2]['abstract'].sample(n=200, random_state=i))

In [187]:
#Don't run, only used if you want highest score returned for every string
from gensim import corpora, models
def LDA_TF_sub_easy(i,text_str = ""):
    """Train a 10-topic LDA model for subject ``i`` and score ``text_str`` with it.

    Parameters
    ----------
    i : int
        Position of the subject in ``dictionary`` / ``processed_docs``.
    text_str : str, optional
        Text to score.

    Returns
    -------
    The highest topic score the model assigns to ``text_str``, or ``None``
    when ``text_str`` is empty or the model returns no topics for it.
    """
    if len(text_str) == 0:
        # Nothing to score. The original trained a model and then failed with
        # UnboundLocalError because `score` was never assigned.
        return None
    bow_corpus_i = [dictionary[i].doc2bow(doc) for doc in processed_docs[i]]
    tfidf_i = models.TfidfModel(bow_corpus_i)
    corpus_tfidf_i = tfidf_i[bow_corpus_i]
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf_i, num_topics=10, id2word=dictionary[i], passes=2, workers=4)
    bow_vector = dictionary[i].doc2bow(preprocess(text_str))
    # max() returns the first pair with the highest score, as sorted(...)[0] did.
    best = max(lda_model_tfidf[bow_vector], key=lambda tup: tup[1], default=None)
    return None if best is None else best[1]

In [None]:
#Don't run. Only used if you want to store the topic scores and words
from gensim import corpora, models
def LDA_TF_sub_store(i,text_str = ""):
    """Train a 10-topic LDA model for subject ``i`` and collect topic scores for ``text_str``.

    Parameters
    ----------
    i : int
        Position of the subject in ``dictionary`` / ``processed_docs``.
    text_str : str, optional
        Text to score.

    Returns
    -------
    list
        ``[topic_score, topic_perc]``: the scores the model returns for the
        text, highest first, and for each of them the ``(word, weight)`` pairs
        of the matching topic. Both lists are empty when ``text_str`` is empty.
    """
    topic_score = []
    topic_perc = []
    if len(text_str) == 0:
        # Nothing to score, so skip the model training; the result is the
        # same pair of empty lists as before.
        return [topic_score, topic_perc]
    bow_corpus_i = [dictionary[i].doc2bow(doc) for doc in processed_docs[i]]
    tfidf_i = models.TfidfModel(bow_corpus_i)
    corpus_tfidf_i = tfidf_i[bow_corpus_i]
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf_i, num_topics=10, id2word=dictionary[i], passes=2, workers=4)
    bow_vector = dictionary[i].doc2bow(preprocess(text_str))
    for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
        topic_score.append(score)
        # Look up the words of the topic being scored. The original indexed
        # show_topics() with the subject index `i`, which picks an unrelated
        # topic and raises IndexError once i >= 10.
        topic_perc.append(lda_model_tfidf.show_topic(index))
    return [topic_score, topic_perc]