# Natural Language Processing Assignment
### Harshitha M.U. (J076)

## Research paper topic generation 

### Dataset

In [None]:
# Mount Google Drive into the Colab filesystem; the CSV loaded later in this
# notebook is read from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the papers CSV from the mounted Drive and, in the same expression,
# discard every row that has a missing value in any column.
# NOTE(review): only 'paper_text' appears to be used later in this notebook, so
# dropping on all columns may discard more papers than necessary -- confirm intent.
df = pd.read_csv('/content/drive/My Drive/Papers.csv').dropna()
df

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
4261,4857,2013,Scalable Influence Estimation in Continuous-Ti...,Oral,4857-scalable-influence-estimation-in-continuo...,If a piece of information is released from a m...,Scalable Influence Estimation in\nContinuous-T...
4262,4858,2013,Adaptive Anonymity via,Spotlight,4858-adaptive-anonymity-via-b-matching.pdf,The adaptive anonymity problem is formalized w...,Adaptive Anonymity via b-Matching\n\nKrzysztof...
4263,4859,2013,Exact and Stable Recovery of Pairwise Interact...,Spotlight,4859-exact-and-stable-recovery-of-pairwise-int...,Tensor completion from incomplete observations...,Exact and Stable Recovery of Pairwise Interact...
4265,4860,2013,Matrix factorization with binary components,Spotlight,4860-matrix-factorization-with-binary-componen...,Motivated by an application in computational b...,Matrix factorization with Binary Components\n\...
4266,4861,2013,On the Complexity and Approximation of Binary ...,Spotlight,4861-on-the-complexity-and-approximation-of-bi...,Lifted inference algorithms exploit symmetries...,On the Complexity and Approximation of\nBinary...
...,...,...,...,...,...,...,...
6943,7280,2017,"On Separability of Loss Functions, and Revisit...",Poster,7280-on-separability-of-loss-functions-and-rev...,We revisit the classical analysis of generativ...,"On Separability of Loss Functions, and Revisit..."
6944,7281,2017,Maxing and Ranking with Few Assumptions,Poster,7281-maxing-and-ranking-with-few-assumptions.pdf,PAC maximum ...,Maxing and Ranking with Few Assumptions\nMoein...
6945,7282,2017,On clustering network-valued data,Poster,7282-on-clustering-network-valued-data.pdf,"Community detection, which focuses on clusteri...",On clustering network-valued data\n\nSoumendu ...
6946,7283,2017,A General Framework for Robust Interactive Lea...,Poster,7283-a-general-framework-for-robust-interactiv...,We propose a general framework for interactive...,A General Framework for Robust Interactive\nLe...


In [None]:
# Keep only the raw paper text. The explicit .copy() makes `data` an independent
# frame, so adding a column below no longer assigns into a slice of `df` (which is
# what raised pandas' SettingWithCopyWarning).
data = df[['paper_text']].copy()
# Preserve the original row label as a regular column so a paper can be looked up
# by it later.
data['index'] = data.index
documents = data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Preview the first five rows of the working frame.
documents.head(5)

Unnamed: 0,paper_text,index
4261,Scalable Influence Estimation in\nContinuous-T...,4261
4262,Adaptive Anonymity via b-Matching\n\nKrzysztof...,4262
4263,Exact and Stable Recovery of Pairwise Interact...,4263
4265,Matrix factorization with Binary Components\n\...,4265
4266,On the Complexity and Approximation of\nBinary...,4266


### Lemmatizing & Stemming

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import numpy as np
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2018)

from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import hides which names come from nltk.stem.porter;
# consider importing only the names that are actually used.
from nltk.stem.porter import *
import nltk
# Download the WordNet package (the lemmatizer used below is WordNet-based).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Both helpers are created here, next to the function that uses them, so this cell
# works on a fresh kernel without relying on a later cell having defined `stemmer`.
# Building the lemmatizer once also avoids constructing a new WordNetLemmatizer
# for every single token.
_lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    Args:
        text: The token to normalize.

    Returns:
        The token lemmatized as a verb (``pos='v'``) with WordNet and then
        stemmed with the English Snowball stemmer.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return its normalized, filtered tokens.

    The text is split into tokens with gensim's ``simple_preprocess``. A token
    is kept only if it is longer than three characters and is not in gensim's
    stop-word list; every kept token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document text.

    Returns:
        A list of normalized tokens, in the order they appear in ``text``.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [None]:
# Fetch the text of the paper whose original row label is 4310, selecting the
# text column by name rather than by position.
doc_sample = documents.loc[documents['index'] == 4310, 'paper_text'].iloc[0]

stemmer = SnowballStemmer('english')

# Show the raw text split on single spaces, then the same document after the
# preprocessing pipeline, for a side-by-side comparison.
print('Original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n Tokenized and lemmatized document: ')
print(preprocess(doc_sample))

Original document: 
['Non-strongly-convex', 'smooth', 'stochastic\napproximation', 'with', 'convergence', 'rate', 'O(1/n)\n\nEric', 'Moulines\nLTCI\nTelecom', 'ParisTech,', 'Paris,', 'France\neric.moulines@enst.fr\n\nFrancis', 'Bach\nINRIA', '-', 'Sierra', 'Project-team\nEcole', 'Normale', 'Sup?erieure,', 'Paris,', 'France\nfrancis.bach@ens.fr\n\nAbstract\nWe', 'consider', 'the', 'stochastic', 'approximation', 'problem', 'where', 'a', 'convex', 'function', 'has\nto', 'be', 'minimized,', 'given', 'only', 'the', 'knowledge', 'of', 'unbiased', 'estimates', 'of', 'its', 'gradients\nat', 'certain', 'points,', 'a', 'framework', 'which', 'includes', 'machine', 'learning', 'methods', 'based\non', 'the', 'minimization', 'of', 'the', 'empirical', 'risk.', 'We', 'focus', 'on', 'problems', 'without', 'strong\nconvexity,', 'for', 'which', 'all', 'previously\nknown', 'algorithms', 'achieve', 'a', 'convergence', 'rate\n?\nfor', 'function', 'values', 'of', 'O(1/', 'n)', 'after', 'n', 'iterations.', 'W

### Data preprocessing

In [None]:
# Run the preprocessing pipeline over every paper's text; the result holds one
# list of normalized tokens per paper.
processed_docs = documents['paper_text'].map(preprocess)
processed_docs.head(10)

4261    [scalabl, influenc, estim, continu, time, diff...
4262    [adapt, anonym, match, krzysztof, choromanski,...
4263    [exact, stabl, recoveri, pairwis, interact, te...
4265    [matrix, factor, binari, compon, martin, slaws...
4266    [complex, approxim, binari, evid, lift, infer,...
4267    [unsupervis, spectral, learn, fsts, rapha, bai...
4268    [decompos, proxim, yaoliang, depart, comput, s...
4269    [uniform, camera, shake, remov, spatial, adapt...
4270    [provabl, subspac, cluster, meet, xiang, wang,...
4271    [matrix, complet, give, observ, troy, nanyang,...
Name: paper_text, dtype: object

### Bag of words on the dataset

In [None]:
# Build the gensim Dictionary (integer id <-> token mapping) from the
# preprocessed token lists.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
# Print the first 11 (id, token) pairs of the dictionary as a sanity check.
count = 0  # stays 0 when the dictionary is empty
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count == 11:
        break

0 absolut
1 abstract
2 accumul
3 accur
4 accuraci
5 achiev
6 acknowledg
7 actual
8 acycl
9 adapt
10 add


In [None]:
# Prune the vocabulary: per gensim's filter_extremes, tokens found in fewer than
# `no_below` documents or in more than the `no_above` fraction of documents are
# removed, and at most `keep_n` of the most frequent remaining tokens are kept.
# NOTE(review): keep_n is set to the number of documents, which is unrelated to
# vocabulary size -- confirm this cap is intentional.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=len(documents))

In [None]:
# Convert every token list into its bag-of-words vector using the dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [None]:
# Inspect one bag-of-words vector: each entry pairs a token id with its count,
# and the dictionary maps the id back to the token text.
bow_doc = bow_corpus[10]

for token_id, token_count in bow_doc:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

Word 2 ("accur") appears 1 time.
Word 6 ("address") appears 1 time.
Word 7 ("adopt") appears 9 time.
Word 15 ("appendix") appears 2 time.
Word 20 ("argument") appears 2 time.
Word 23 ("artifici") appears 3 time.
Word 33 ("binari") appears 3 time.
Word 36 ("cambridg") appears 1 time.
Word 37 ("captur") appears 7 time.
Word 44 ("chen") appears 1 time.
Word 50 ("collect") appears 1 time.
Word 58 ("context") appears 1 time.
Word 74 ("difficult") appears 2 time.
Word 86 ("easi") appears 1 time.
Word 87 ("easili") appears 1 time.
Word 94 ("entir") appears 2 time.
Word 104 ("exponenti") appears 2 time.
Word 115 ("flexibl") appears 1 time.
Word 117 ("formul") appears 7 time.
Word 122 ("furthermor") appears 1 time.
Word 123 ("gain") appears 2 time.
Word 135 ("greater") appears 1 time.
Word 144 ("heurist") appears 3 time.
Word 145 ("hide") appears 3 time.
Word 157 ("infinit") appears 1 time.
Word 165 ("intuit") appears 2 time.
Word 166 ("invari") appears 3 time.
Word 167 ("investig") appears 4 t

### TF-IDF

In [None]:
from gensim import corpora, models
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)

In [None]:
# Apply the fitted TF-IDF model to the whole bag-of-words corpus.
corpus_tfidf = tfidf[bow_corpus]

In [None]:
from pprint import pprint

# Pretty-print only the first TF-IDF-weighted document; the `break` stops the
# loop after one iteration.
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.010573822214664387),
 (1, 0.008537777306848201),
 (2, 0.011563731961775459),
 (3, 0.003391710143789426),
 (4, 0.010971074137179739),
 (5, 0.01073944337777098),
 (6, 0.0027352773815965668),
 (7, 0.014824890423594625),
 (8, 0.011554676922430665),
 (9, 0.01112530646217485),
 (10, 0.0069654144948065495),
 (11, 0.012789038602890905),
 (12, 0.011153326184540647),
 (13, 0.022866533390483883),
 (14, 0.005698014203755539),
 (15, 0.0329471533825011),
 (16, 0.003387900122181393),
 (17, 0.008593547357834882),
 (18, 0.009265009766316694),
 (19, 0.006446292573761128),
 (20, 0.00457331801389896),
 (21, 0.004382296408484927),
 (22, 0.031201641977399622),
 (23, 0.0057686207783572575),
 (24, 0.006775800244362786),
 (25, 0.01078588551655733),
 (26, 0.006428439594294781),
 (27, 0.01735316618623997),
 (28, 0.0038204710217507793),
 (29, 0.004037477762656758),
 (30, 0.0447999182149534),
 (31, 0.014872035482003486),
 (32, 0.010441987804218088),
 (33, 0.005729031571889921),
 (34, 0.016669281107526687),


### LDA using TF-IDF

In [None]:
# Train a 10-topic LDA model on the TF-IDF-weighted corpus.
# random_state pins the model's random initialisation (same value as the NumPy
# seed set earlier in this notebook), so re-running this cell does not silently
# produce topics that differ from the ones already displayed below.
# NOTE(review): with several workers the result may still vary slightly between
# runs -- confirm whether exact reproducibility is required.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=2018,
)

In [None]:
# List every topic of the trained model together with its top weighted terms
# (-1 asks for all topics).
all_topics = lda_model_tfidf.print_topics(-1)
for topic_id, topic_terms in all_topics:
    print('Topic: {} \nWord: {}'.format(topic_id, topic_terms))

Topic: 0 
Word: 0.003*"action" + 0.003*"submodular" + 0.003*"polici" + 0.003*"reward" + 0.003*"regret" + 0.003*"graph" + 0.002*"edg" + 0.002*"tree" + 0.002*"score" + 0.002*"bandit"
Topic: 1 
Word: 0.003*"privaci" + 0.003*"dropout" + 0.003*"layer" + 0.002*"label" + 0.002*"posterior" + 0.002*"variat" + 0.002*"deep" + 0.002*"privat" + 0.002*"classifi" + 0.002*"tree"
Topic: 2 
Word: 0.003*"kernel" + 0.003*"tree" + 0.003*"latent" + 0.003*"item" + 0.002*"rank" + 0.002*"user" + 0.002*"layer" + 0.002*"tensor" + 0.002*"neuron" + 0.002*"cluster"
Topic: 3 
Word: 0.003*"cluster" + 0.003*"convex" + 0.002*"topic" + 0.002*"kernel" + 0.002*"tensor" + 0.002*"norm" + 0.002*"proxim" + 0.002*"rank" + 0.002*"posterior" + 0.002*"subspac"
Topic: 4 
Word: 0.004*"layer" + 0.003*"cluster" + 0.003*"tensor" + 0.003*"neuron" + 0.003*"rank" + 0.002*"convolut" + 0.002*"worker" + 0.002*"lstm" + 0.002*"deep" + 0.002*"queri"
Topic: 5 
Word: 0.007*"submodular" + 0.003*"item" + 0.002*"rank" + 0.002*"graph" + 0.002*"svrg"

### Performance evaluation

In [None]:
# The LDA model was trained on TF-IDF vectors, so the sample document is weighted
# with the same TF-IDF model before inference; feeding it raw bag-of-words counts
# would query the model with features on a different scale than it was fit on.
sample_tfidf = tfidf[bow_corpus[10]]

# Show the document's topics from the highest score to the lowest, each with the
# topic's ten top terms.
for index, score in sorted(lda_model_tfidf[sample_tfidf], key=lambda tup: tup[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8359755873680115	 
Topic: 0.004*"layer" + 0.004*"cluster" + 0.003*"posterior" + 0.003*"graph" + 0.003*"deep" + 0.003*"neuron" + 0.002*"latent" + 0.002*"convolut" + 0.002*"label" + 0.002*"spike"

Score: 0.09352096915245056	 
Topic: 0.003*"cluster" + 0.003*"convex" + 0.002*"topic" + 0.002*"kernel" + 0.002*"tensor" + 0.002*"norm" + 0.002*"proxim" + 0.002*"rank" + 0.002*"posterior" + 0.002*"subspac"

Score: 0.05353422090411186	 
Topic: 0.004*"kernel" + 0.003*"rank" + 0.003*"lasso" + 0.003*"graph" + 0.002*"norm" + 0.002*"convex" + 0.002*"tensor" + 0.002*"subspac" + 0.002*"recoveri" + 0.002*"cluster"


### Data visualization

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Enable pyLDAvis output inside the notebook.
pyLDAvis.enable_notebook()
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

  from collections import Iterable


In [None]:
# enable_notebook() is also called in the preceding cell; this repeated call
# is presumably harmless.
pyLDAvis.enable_notebook()
# Prepare the pyLDAvis visualization from the trained model, a corpus and the dictionary.
# NOTE(review): the model was trained on corpus_tfidf but bow_corpus is passed
# here -- confirm that raw counts are the intended input for the visualization.
vis = gensimvis.prepare(lda_model_tfidf, bow_corpus, dictionary)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
