In [26]:
import pandas as pd 
import numpy as np
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error
import logging
import gensim

In [27]:
data_df=pd.read_csv("nips-papers/papers.csv")

In [28]:
data_df.columns

Index(['id', 'year', 'title', 'event_type', 'pdf_name', 'abstract',
       'paper_text'],
      dtype='object')

In [29]:
data_df['paper_text'].head().apply(len)

0    21643
1    15505
2    20523
3    19441
4    20219
Name: paper_text, dtype: int64

# Removing unnecessary data

In [30]:
data_df[['title','pdf_name']].head(10)

Unnamed: 0,title,pdf_name
0,Self-Organization of Associative Database and ...,1-self-organization-of-associative-database-an...
1,A Mean Field Theory of Layer IV of Visual Cort...,10-a-mean-field-theory-of-layer-iv-of-visual-c...
2,Storing Covariance by the Associative Long-Ter...,100-storing-covariance-by-the-associative-long...
3,Bayesian Query Construction for Neural Network...,1000-bayesian-query-construction-for-neural-ne...
4,"Neural Network Ensembles, Cross Validation, an...",1001-neural-network-ensembles-cross-validation...
5,Using a neural net to instantiate a deformable...,1002-using-a-neural-net-to-instantiate-a-defor...
6,Plasticity-Mediated Competitive Learning,1003-plasticity-mediated-competitive-learning.pdf
7,ICEG Morphology Classification using an Analog...,1004-iceg-morphology-classification-using-an-a...
8,Real-Time Control of a Tokamak Plasma Using Ne...,1005-real-time-control-of-a-tokamak-plasma-usi...
9,Pulsestream Synapses with Non-Volatile Analogu...,1006-pulsestream-synapses-with-non-volatile-an...


Since `title` and `pdf_name` carry the same information, `pdf_name` is removed.

In [31]:
data_df['event_type'].unique()

array([nan, 'Oral', 'Spotlight', 'Poster'], dtype=object)

In [32]:
# pdf_name mirrors title and event_type is not needed for this analysis, so both columns go
data_df.drop(columns=['pdf_name','event_type'], inplace=True)

Since our main objective is to summarize the text and tag the key words, we do not require the event type.

In [33]:
data_df.head()

Unnamed: 0,id,year,title,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [34]:
# Row labels of the papers whose abstract is not the 'Abstract Missing' placeholder
paper_separate_abstract = data_df.loc[data_df['abstract'].ne('Abstract Missing')].index

In [35]:
paper_separate_abstract

Int64Index([ 941, 1067, 2384, 2385, 2388, 2389, 2390, 2393, 2394, 2396,
            ...
            6937, 6938, 6939, 6940, 6941, 6943, 6944, 6945, 6946, 6947],
           dtype='int64', length=3924)

In [36]:
len(data_df[(data_df['abstract']!='Abstract Missing')].index)

3924

In [37]:
print(data_df['abstract'].iloc[941])

Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 


In [38]:
# Keep only the papers that have a real abstract.
# `paper_separate_abstract` holds index *labels*, so select with the label-based
# indexer (.loc) rather than the positional one (.iloc); this also keeps the cell
# safe to re-run. .copy() makes the result an independent frame so the in-place
# edits in the following cells do not act on a slice of the original.
data_df = data_df.loc[paper_separate_abstract].copy()

In [39]:
# Only the abstracts are analysed from here on, so the full paper text is dropped
data_df.drop(columns=['paper_text'], inplace=True)

Since an abstract is present for 3924 papers, we keep only those papers and remove their full text.

In [40]:
data_df.reset_index(inplace = True)

In [41]:
data_df.drop('index',axis=1,inplace= True)

In [42]:
print(data_df['abstract'][0])

Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 


Computing the word count

In [43]:
data_df.head()

Unnamed: 0,id,year,title,abstract
0,1861,2000,Algorithms for Non-negative Matrix Factorization,Non-negative matrix factorization (NMF) has pr...
1,1975,2001,Characterizing Neural Gain Control using Spike...,Spike-triggered averaging techniques are effec...
2,3163,2007,Competition Adds Complexity,It is known that determinining whether a DEC-P...
3,3164,2007,Efficient Principled Learning of Thin Junction...,We present the first truly polynomial algorith...
4,3167,2007,Regularized Boost for Semi-Supervised Learning,Semi-supervised inductive learning concerns ho...


In [44]:
def text_processing(df,col):
    """Clean a text column: strip non-letters, lowercase, collapse separators.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is not modified.
    col : str
        Name of the column to clean. Every value must be a string.

    Returns
    -------
    pandas.Series
        The cleaned strings, aligned with ``df``'s index.
    """
    temp_df = df[col]
    # 1. Remove punctuation: every character that is not an ASCII letter becomes a space.
    #    The previous pattern '[[^a-zA-Z]]' parsed as "one of '[', '^' or a letter,
    #    followed by a literal ']'", so it left punctuation alone and deleted the
    #    letter in front of every ']'.
    temp_df = temp_df.apply(lambda x: re.sub('[^a-zA-Z]',' ',x))
    # 2. converting lower case
    temp_df = temp_df.apply(lambda x: x.lower())
    # 3. collapse each remaining run of digits / non-word characters (whitespace
    #    included) into a single space
    temp_df = temp_df.apply(lambda x: re.sub("(\\d|\\W)+"," ",x))
    return temp_df

In [45]:
data_df['abstract'] =text_processing(data_df,'abstract')

In [46]:
print(data_df['abstract'][0])

non negative matrix factorization nmf has previously been shown to be a useful decomposition for multivariate data two different multi plicative algorithms for nmf are analyzed they differ only slightly in the multiplicative factor used in the update rules one algorithm can be shown to minimize the conventional least squares error while the other minimizes the generalized kullback leibler divergence the monotonic convergence of both algorithms can be proven using an auxiliary func tion analogous to that used for proving convergence of the expectation maximization algorithm the algorithms can also be interpreted as diag onally rescaled gradient descent where the rescaling factor is optimally chosen to ensure convergence 


In [47]:
def tokenize_lemmatize(df,col):
    """Tokenize a text column, drop English stop words and lemmatize the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is not modified.
    col : str
        Name of the column whose values are strings.

    Returns
    -------
    tuple of pandas.Series
        ``(tokens, word_no_pre, word_no_post)``: the list of lemmatized tokens
        for each row, the token count before stop-word removal, and the token
        count after stop-word removal and lemmatization.
    """
    temp_df =df[col]
    #1. Word Tokenization:
    temp_df = temp_df.apply(word_tokenize)
    word_no_pre = temp_df.apply(len)
    # Build the stop-word collection once, as a set: the original evaluated
    # stopwords.words('english') for every single token and searched it as a list.
    stop_words = set(stopwords.words('english'))
    temp_df = temp_df.apply(lambda x : [i for i in x if i not in stop_words])
    #2. Word Lemmatization:
    lemmatize =WordNetLemmatizer()
    temp_df = temp_df.apply(lambda x: [lemmatize.lemmatize(i) for i in x])
    word_no_post =temp_df.apply(len)
    # Tokens are returned as lists (not re-joined into strings).
    return temp_df,word_no_pre,word_no_post

In [48]:
data_df['post_abstract'],data_df['word_count_pre'],data_df['word_count_post']=tokenize_lemmatize(data_df,'abstract')

In [49]:
dictionary = gensim.corpora.Dictionary(data_df['post_abstract'])

In [50]:
len(dictionary)

12641

In [51]:
# converting each tokenized abstract into a bag of words using the dictionary

In [58]:
# One bag-of-words vector per tokenized abstract, produced by the dictionary's doc2bow
word_map = list(map(dictionary.doc2bow, data_df['post_abstract']))

In [60]:
len(word_map[0])

57

In [61]:
word_map[0]

[(0, 5),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 3),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 2),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 2),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 2),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 2),
 (55, 1),
 (56, 1)]

In [63]:
# applying the LDA model in gensim using the bag of words, and LDA with a count vectorizer

In [68]:
# Train LDA on the bag-of-words list `word_map`. The original passed `corpus`,
# a name that no visible cell defines, so a fresh-kernel run raised NameError.
lda =gensim.models.ldamodel.LdaModel(corpus=word_map, num_topics=5,id2word = dictionary, passes=20,random_state=23,chunksize=128)

In [98]:
# NOTE(review): pyLDAvis is a third-party package and the recorded output of this
# cell is a ModuleNotFoundError, so it has to be installed in the kernel's
# environment before this import can succeed. Newer pyLDAvis releases appear to
# expose this module as `pyLDAvis.gensim_models` — TODO confirm against the
# installed version.
import pyLDAvis.gensim

ModuleNotFoundError: No module named 'pyLDAvis'