In [83]:
import os
import pandas as pd 
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.models import LsiModel,Phrases,phrases
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense


Data retrival from current directory

In [2]:
DataPath = 'nipstxt/'
print(os.listdir(DataPath))

['nips12', 'nips02', 'nips04', 'nips00', 'MATLAB_NOTES', 'idx', 'README_yann', 'nips05', 'nips08', 'nips06', 'nips07', 'RAW_DATA_NOTES', 'nips03', 'nips09', 'orig', 'nips01', 'nips11', 'nips10']


Load and view dataset

In [40]:
folders = ["nips{0:02}".format(i) for i in range(0,13)]
#Read all texts into a list
research_papers = []
for folder in folders:
    f_names  = os.listdir(DataPath+folder) # file names
    for f_name in f_names:
        with open(DataPath+folder+'/'+f_name, encoding='utf-8',errors='ignore',mode='r+') as f:
            data = f.read()
            research_papers.append(data)
#Total number of research papers in the corpora
len(research_papers)

1740

Preprocessing the data

In [45]:
%%time
stop_words = stopwords.words('english')
word_tk = RegexpTokenizer(r'\w+')
word_nl = WordNetLemmatizer()

def norm_corpus(papers):
    # Storing normalized papers
    normalize_papers = []
    for paper in papers:
        # Lowercasing text
        paper = paper.lower()
        # Tokenizing the text
        paper_tokens = [token.strip() for token in word_tk.tokenize(paper)]
        # lemmatization of text
        paper_tokens = [word_nl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        # length of each token is greater than 1
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        # Removing the stopwords
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None,paper_tokens))
        if paper_tokens:
            normalize_papers.append(paper_tokens)
    return normalize_papers

normalize_papers = norm_corpus(research_papers)
print(len(normalize_papers))

1740
CPU times: user 31.4 s, sys: 185 ms, total: 31.6 s
Wall time: 31.6 s


In [46]:
normalize_papers[1][0:40]

['presynaptic',
 'neural',
 'information',
 'processing',
 'carley',
 'department',
 'electrical',
 'computer',
 'engineering',
 'carnegie',
 'mellon',
 'university',
 'pittsburgh',
 'pa',
 'abstract',
 'potential',
 'presynaptic',
 'information',
 'processing',
 'within',
 'arbor',
 'single',
 'axon',
 'discussed',
 'paper',
 'current',
 'knowledge',
 'activity',
 'dependence',
 'firing',
 'threshold',
 'condition',
 'required',
 'conduction',
 'failure',
 'similarity',
 'node',
 'along',
 'single',
 'axon',
 'reviewed',
 'electronic',
 'circuit',
 'model',
 'site',
 'low',
 'conduction',
 'safety',
 'axon',
 'presented']

Topic models with Gensim

Text representation with feature engineering

In [63]:
bigram = Phrases(normalize_papers,min_count=25,threshold=25,delimiter=b'_',)
bigram_model = phrases.Phraser(bigram)

#sample bigram features
bigram_model[normalize_papers[0][0:40]]

['scaling',
 'property',
 'coarse_coded',
 'symbol',
 'memory',
 'ronald',
 'rosenfeld',
 'david_touretzky',
 'computer_science',
 'department',
 'carnegie_mellon',
 'university_pittsburgh',
 'pennsylvania',
 'abstract',
 'coarse_coded',
 'symbol',
 'memory',
 'appeared',
 'several',
 'neural_network',
 'symbol',
 'processing',
 'model',
 'order',
 'determine',
 'model',
 'would',
 'scale',
 'one',
 'must',
 'first',
 'understanding',
 'mathematics']

Let's obtain a unique term or phrase to number mappings

In [59]:
normalize_corpus_bigrams = [bigram_model[doc] for doc in normalize_papers]

# Dictionary representation of the documents
dictionary = Dictionary(normalize_corpus_bigrams)
print('word to number mappings',list(dictionary.items())[0:10])
print('Size of vocabulary',len(dictionary))

word to number mappings [(0, '8a'), (1, 'abandon'), (2, 'able'), (3, 'abo'), (4, 'abstract'), (5, 'accommodate'), (6, 'accuracy'), (7, 'achieved'), (8, 'acknowledgment_thank'), (9, 'across')]
Size of vocabulary 78112


In [60]:
# Filter out words that occur less than 20 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=20,no_above=0.6)
print('Size of vocabulary', len(dictionary)) 

Size of vocabulary 7512


In [61]:
# Transforming corpus into bags of words vectors
bow_corpus = [dictionary.doc2bow(text) for text in normalize_corpus_bigrams]
print(bow_corpus[0][0:20])


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 6), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1)]


In [64]:
# Viewing the actual terms and their counts
print([(dictionary[idx],freq) for idx, freq in bow_corpus[0][0:20]])

# Total number of papers in the corpus
print('Total number of papers',len(bow_corpus))

[('able', 1), ('accommodate', 1), ('accuracy', 1), ('achieved', 1), ('acknowledgment_thank', 1), ('across', 1), ('active', 6), ('activity', 1), ('actual', 1), ('adjusted', 1), ('adjusting', 2), ('agrees', 1), ('algebraic', 1), ('allow', 1), ('along', 1), ('alphabet', 2), ('alternative', 1), ('alternatively', 1), ('although', 2), ('american_institute', 1)]
Total number of papers 1740


Latent Semantic Index (LSI)

It's a statistical technique to correlate semantically terms to form topics. It's uses Singular Value Decomposition(SVD) technique.

In [66]:
%%time
total_topics = 8
lsi_bow = LsiModel(bow_corpus,id2word=dictionary,num_topics=total_topics,onepass=True,chunksize=1740,power_iters=1200)  

CPU times: user 22min 25s, sys: 5min 36s, total: 28min 1s
Wall time: 16min 48s


View the major topics or themes in corpus

In [68]:
for topic_id, topic in lsi_bow.print_topics(num_topics=8, num_words=15):
    print('Topic #'+str(topic_id+1)+':')
    print(topic)
    print()

Topic #1:
0.215*"unit" + 0.214*"state" + 0.180*"neuron" + 0.160*"pattern" + 0.151*"image" + 0.139*"vector" + 0.132*"feature" + 0.127*"cell" + 0.109*"layer" + 0.102*"probability" + 0.100*"task" + 0.100*"distribution" + 0.095*"class" + 0.090*"rate" + 0.089*"signal"

Topic #2:
-0.455*"neuron" + -0.399*"cell" + 0.340*"state" + -0.189*"response" + -0.170*"stimulus" + 0.117*"action" + -0.116*"activity" + -0.107*"pattern" + 0.096*"class" + 0.095*"vector" + -0.094*"visual" + -0.093*"spike" + -0.093*"synaptic" + -0.090*"circuit" + 0.089*"probability"

Topic #3:
0.539*"state" + -0.464*"image" + 0.287*"neuron" + -0.243*"feature" + 0.172*"action" + -0.168*"object" + 0.116*"control" + -0.112*"unit" + -0.098*"recognition" + -0.095*"classifier" + -0.094*"class" + 0.092*"policy" + 0.083*"cell" + -0.080*"classification" + 0.080*"dynamic"

Topic #4:
0.746*"unit" + -0.235*"image" + -0.187*"neuron" + 0.153*"pattern" + 0.126*"layer" + 0.125*"net" + 0.122*"hidden_unit" + 0.114*"activation" + -0.112*"distrib

We can observe that terms or words and assined weights have positive or negative sign. We are going to separate posive and negative weights corresponding terms. Higher the weight, more important the contribution. Each term indicates a sign of direction or orientation in the vector space for a particualr topic.

In [81]:
for n in range(total_topics):
    print('Topic #'+str(n+1)+':')
    print('-'*64)
    p_d = []
    n_d = []
    for term,weight in lsi_bow.show_topic(n, topn=15):
        if weight >= 0:
            p_d.append((term,round(weight,2)))
        else:
            n_d.append((term,round(weight,2)))
    print('Positive Direction:',p_d)
    print('-'*64)
    print('Negative Direction:',n_d)
    print()


Topic #1:
----------------------------------------------------------------
Positive Direction: [('unit', 0.22), ('state', 0.21), ('neuron', 0.18), ('pattern', 0.16), ('image', 0.15), ('vector', 0.14), ('feature', 0.13), ('cell', 0.13), ('layer', 0.11), ('probability', 0.1), ('task', 0.1), ('distribution', 0.1), ('class', 0.09), ('rate', 0.09), ('signal', 0.09)]
----------------------------------------------------------------
Negative Direction: []

Topic #2:
----------------------------------------------------------------
Positive Direction: [('state', 0.34), ('action', 0.12), ('class', 0.1), ('vector', 0.09), ('probability', 0.09)]
----------------------------------------------------------------
Negative Direction: [('neuron', -0.45), ('cell', -0.4), ('response', -0.19), ('stimulus', -0.17), ('activity', -0.12), ('pattern', -0.11), ('visual', -0.09), ('spike', -0.09), ('synaptic', -0.09), ('circuit', -0.09)]

Topic #3:
----------------------------------------------------------------
P

Let's try to get 3 major metrics like left singular vectors(U), singular values(S) and right singular vectors(VT)

In [85]:
term_topic = lsi_bow.projection.u
singular_values = lsi_bow.projection.s
document_topic = (corpus2dense(lsi_bow[bow_corpus],len(singular_values)).T/ singular_values).T 

term_topic.shape, singular_values.shape, document_topic.shape

((7512, 8), (8,), (8, 1740))

In [87]:
#We can transpose topic document matrix to form document-topic matrix
document_topics = pd.DataFrame(np.round(document_topic.T,3),columns=['T'+str(i) for i in range(1,total_topics+1)])
document_topics.head()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8
0,0.039,0.004,-0.007,0.077,0.028,-0.012,-0.005,0.059
1,0.024,-0.034,0.014,-0.007,0.009,0.009,0.009,-0.02
2,0.01,-0.003,0.009,-0.011,0.007,-0.017,-0.011,0.003
3,0.025,-0.047,-0.006,0.003,-0.02,0.044,0.007,0.008
4,0.015,-0.021,0.013,0.002,0.01,-0.014,-0.001,0.005


The important topics for a few research papers

In [89]:
document_numbers = [15,240,450,600]

for document_number in document_numbers:
    top_topics = list(document_topics.columns[np.argsort(-np.absolute(document_topics.iloc[document_number].values))[:5]])

    print('Document #'+str(document_number)+':')
    print('Dominant topics(top3',top_topics)
    print('Paper summary')
    print(research_papers[document_number][0:600])
    print()



Document #15:
Dominant topics(top3 ['T7', 'T1', 'T4', 'T2', 'T3']
Paper summary
474 
OPTIMIZATION WITH ARTIFICIAL NEURAL NETWORK SYSTEMS: 
A MAPPING PRINCIPLE 
AND 
A COMPARISON TO GRADIENT BASED METHODS * 
Harrison MonFook Leong 
Research Institute for Advanced Computer Science 
NASA Ames Research Center 230-5 
Moffett Field, CA, 94035 
ABSTRACT 
General formulae for mapping optimization problems into systems of ordinary differential 
equations associated with artificial neural networks are presented. A comparison is made to optim- 
ization using gradient-search methods. The performance measure is the settling time from an initial 
state to a target state. A simple analy

Document #240:
Dominant topics(top3 ['T2', 'T4', 'T6', 'T1', 'T8']
Paper summary
Note on Development of Modularity in Simple Cortical Models 133 
Note 
on Development of Modularity 
in Simple Cortical Models 
Alex Chernjavsky 1 
Neuroscience Graduate Program 
Section of Molecular Neurobiology 
Howard Hughes Medical I