In [2]:
import os
import pandas as pd 
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.models import LdaModel,Phrases,phrases
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense


Data retrival from current directory

In [3]:
DataPath = 'nipstxt/'
print(os.listdir(DataPath))

['nips12', 'nips02', 'nips04', 'nips00', 'MATLAB_NOTES', 'idx', 'README_yann', 'nips05', 'nips08', 'nips06', 'nips07', 'RAW_DATA_NOTES', 'nips03', 'nips09', 'orig', 'nips01', 'nips11', 'nips10']


Load and view dataset

In [4]:
folders = ["nips{0:02}".format(i) for i in range(0,13)]
#Read all texts into a list
research_papers = []
for folder in folders:
    f_names  = os.listdir(DataPath+folder) # file names
    for f_name in f_names:
        with open(DataPath+folder+'/'+f_name, encoding='utf-8',errors='ignore',mode='r+') as f:
            data = f.read()
            research_papers.append(data)
#Total number of research papers in the corpora
len(research_papers)

1740

Preprocessing the data

In [6]:
%%time
stop_words = stopwords.words('english')
word_tk = RegexpTokenizer(r'\w+')
word_nl = WordNetLemmatizer()

def norm_corpus(papers):
    # Storing normalized papers
    normalize_papers = []
    for paper in papers:
        # Lowercasing text
        paper = paper.lower()
        # Tokenizing the text
        paper_tokens = [token.strip() for token in word_tk.tokenize(paper)]
        # lemmatization of text
        paper_tokens = [word_nl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        # length of each token is greater than 1
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        # Removing the stopwords
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None,paper_tokens))
        if paper_tokens:
            normalize_papers.append(paper_tokens)
    return normalize_papers

normalize_papers = norm_corpus(research_papers)
print(len(normalize_papers))

1740
CPU times: user 30.2 s, sys: 172 ms, total: 30.4 s
Wall time: 30.6 s


In [7]:
normalize_papers[1][0:40]

['presynaptic',
 'neural',
 'information',
 'processing',
 'carley',
 'department',
 'electrical',
 'computer',
 'engineering',
 'carnegie',
 'mellon',
 'university',
 'pittsburgh',
 'pa',
 'abstract',
 'potential',
 'presynaptic',
 'information',
 'processing',
 'within',
 'arbor',
 'single',
 'axon',
 'discussed',
 'paper',
 'current',
 'knowledge',
 'activity',
 'dependence',
 'firing',
 'threshold',
 'condition',
 'required',
 'conduction',
 'failure',
 'similarity',
 'node',
 'along',
 'single',
 'axon']

Topic models with Gensim

Text representation with feature engineering

In [8]:
bigram = Phrases(normalize_papers,min_count=25,threshold=25,delimiter=b'_',)
bigram_model = phrases.Phraser(bigram)

#sample bigram features
bigram_model[normalize_papers[0][0:40]]

['scaling',
 'property',
 'coarse_coded',
 'symbol',
 'memory',
 'ronald',
 'rosenfeld',
 'david_touretzky',
 'computer_science',
 'department',
 'carnegie_mellon',
 'university_pittsburgh',
 'pennsylvania',
 'abstract',
 'coarse_coded',
 'symbol',
 'memory',
 'appeared',
 'several',
 'neural_network',
 'symbol',
 'processing',
 'model',
 'order',
 'determine',
 'model',
 'would',
 'scale',
 'one',
 'must',
 'first',
 'understanding',
 'mathematics']

Let's obtain a unique term or phrase to number mappings

In [9]:
normalize_corpus_bigrams = [bigram_model[doc] for doc in normalize_papers]

# Dictionary representation of the documents
dictionary = Dictionary(normalize_corpus_bigrams)
print('word to number mappings',list(dictionary.items())[0:10])
print('Size of vocabulary',len(dictionary))

word to number mappings [(0, '8a'), (1, 'abandon'), (2, 'able'), (3, 'abo'), (4, 'abstract'), (5, 'accommodate'), (6, 'accuracy'), (7, 'achieved'), (8, 'acknowledgment_thank'), (9, 'across')]
Size of vocabulary 78112


In [10]:
# Filter out words that occur less than 20 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=20,no_above=0.6)
print('Size of vocabulary', len(dictionary)) 

Size of vocabulary 7512


In [11]:
# Transforming corpus into bags of words vectors
bow_corpus = [dictionary.doc2bow(text) for text in normalize_corpus_bigrams]
print(bow_corpus[0][0:20])


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 6), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1)]


In [12]:
# Viewing the actual terms and their counts
print([(dictionary[idx],freq) for idx, freq in bow_corpus[0][0:20]])

# Total number of papers in the corpus
print('Total number of papers',len(bow_corpus))

[('able', 1), ('accommodate', 1), ('accuracy', 1), ('achieved', 1), ('acknowledgment_thank', 1), ('across', 1), ('active', 6), ('activity', 1), ('actual', 1), ('adjusted', 1), ('adjusting', 2), ('agrees', 1), ('algebraic', 1), ('allow', 1), ('along', 1), ('alphabet', 2), ('alternative', 1), ('alternatively', 1), ('although', 2), ('american_institute', 1)]
Total number of papers 1740


Latent Dirichelt Allocation (LDA)

It's a generative probabilistic model in which each document is assumed to have a combination of topics similar to probabilistic Latent Semantic Index.

In [15]:
%%time
total_topics = 5
lda_bow = LdaModel(bow_corpus,id2word=dictionary,num_topics=total_topics,passes=10,chunksize=1740,iterations=400)  

CPU times: user 2min 34s, sys: 66.8 ms, total: 2min 34s
Wall time: 2min 35s


View the major topics or themes in corpus

In [16]:
for topic_id, topic in lda_bow.print_topics(num_topics=8, num_words=15):
    print('Topic #'+str(topic_id+1)+':')
    print(topic)
    print()

Topic #1:
0.013*"image" + 0.007*"feature" + 0.006*"recognition" + 0.006*"object" + 0.005*"task" + 0.004*"signal" + 0.004*"word" + 0.004*"visual" + 0.004*"human" + 0.004*"trained" + 0.004*"unit" + 0.004*"speech" + 0.004*"classification" + 0.003*"representation" + 0.003*"position"

Topic #2:
0.016*"neuron" + 0.012*"cell" + 0.007*"response" + 0.006*"stimulus" + 0.006*"activity" + 0.006*"pattern" + 0.004*"unit" + 0.004*"dynamic" + 0.004*"control" + 0.004*"signal" + 0.004*"spike" + 0.004*"synaptic" + 0.004*"layer" + 0.003*"visual" + 0.003*"connection"

Topic #3:
0.015*"unit" + 0.007*"pattern" + 0.007*"vector" + 0.006*"state" + 0.006*"net" + 0.005*"layer" + 0.005*"hidden_unit" + 0.005*"rule" + 0.005*"node" + 0.004*"sequence" + 0.004*"architecture" + 0.004*"memory" + 0.004*"activation" + 0.004*"signal" + 0.003*"representation"

Topic #4:
0.012*"state" + 0.005*"action" + 0.004*"class" + 0.004*"probability" + 0.004*"let" + 0.004*"optimal" + 0.004*"step" + 0.004*"linear" + 0.004*"vector" + 0.003

Overall mean coherence score of the model is used measure quality of topic models.

In [27]:
topics_coherences = lda_bow.top_topics(bow_corpus,topn=15)
avg_coherence_score = np.mean([item[1] for item in topics_coherences])
print('Average coherence score:',avg_coherence_score)

Average coherence score: -0.9276235007333614


Perplexity

In [31]:
perplexity = lda_bow.log_perplexity(bow_corpus)
print('Model perplexity :',perplexity)

Model perplexity : -7.868744647919709
