In [1]:
import pandas as pd
import os

# Load the papers table from the CSV file under the local data directory.
papers_csv_path = "data/NEURIPS Papers/papers.csv"
papers = pd.read_csv(papers_csv_path)

In [2]:
# Preview the first five rows of the freshly loaded table.
papers.head(n=5)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [3]:
# Drop the columns that are not used below, then keep a 100-row random sample.
# `random_state` pins the sample so a fresh "Restart & Run All" works on the
# same papers every time; `columns=` already selects the axis, so no `axis=`.
papers = (
    papers
    .drop(columns=['id', 'event_type', 'pdf_name'])
    .sample(n=100, random_state=42)
)

In [4]:
# Preview the first five rows of the reduced, sampled table.
papers.head(n=5)

Unnamed: 0,year,title,abstract,paper_text
3165,2009,"Slow, Decorrelated Features for Pretraining Co...",We introduce a new type of neural network acti...,"Slow, Decorrelated Features for\nPretraining C..."
5981,2016,A Non-convex One-Pass Framework for Generalize...,We develop an efficient alternating framework ...,A Non-convex One-Pass Framework for Generalize...
3601,2011,A More Powerful Two-Sample Test in High Dimens...,We consider the hypothesis testing problem of ...,A More Powerful Two-Sample Test in High\nDimen...
6132,2016,Reward Augmented Maximum Likelihood for Neural...,A key problem in structured output prediction ...,Reward Augmented Maximum Likelihood\nfor Neura...
6299,1993,Tonal Music as a Componential Code: Learning T...,Abstract Missing,Tonal Music as a Componential Code:\nLearning ...


In [5]:
import re

# Remove commas, periods, exclamation marks and question marks from each
# paper's text, then lowercase it.
# The pattern is a raw string: in a normal string literal `\.` is an invalid
# escape sequence and triggers a warning on recent Python versions.
# The vectorised `.str` accessors leave missing values as missing instead of
# raising inside a per-row lambda.
papers['paper_text_processed'] = (
    papers['paper_text']
    .str.replace(r'[,\.!?]', '', regex=True)
    .str.lower()
)
papers['paper_text_processed'].head()

3165    slow decorrelated features for\npretraining co...
5981    a non-convex one-pass framework for generalize...
3601    a more powerful two-sample test in high\ndimen...
6132    reward augmented maximum likelihood\nfor neura...
6299    tonal music as a componential code:\nlearning ...
Name: paper_text_processed, dtype: object

In [6]:
import gensim
from gensim.utils import simple_preprocess
import nltk
# Download the NLTK 'stopwords' corpus; the `stopwords` reader imported on the
# next line is used in a later cell to build the English stopword list.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shionguha/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Start from NLTK's English stopword list and append a few extra tokens that
# should also be filtered out.
extra_stop_words = ['from', 'subject', 're', 'edu', 'use']
stop_words = stopwords.words('english')
stop_words += extra_stop_words

In [9]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

In [13]:
def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents to filter. Each one is converted with ``str()`` and run
        through ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list
        One filtered token list per input document, in input order.
    """
    # Build the lookup set once per call: testing membership against the
    # module-level list is linear in its length and would run for every token.
    stopword_set = set(stop_words)
    # NOTE(review): when a document is already a token list, `str(doc)`
    # tokenizes the list's repr. Kept as-is to preserve existing output;
    # confirm whether callers should pass raw strings instead.
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

In [14]:
# Pull the cleaned texts out of the frame as a plain Python list, then
# materialize the tokenized form of every document.
data = papers['paper_text_processed'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

In [15]:
# Filter stopwords out of every document, then show the first 30 tokens of
# the first document as a sanity check.
data_words = remove_stopwords(data_words)
print(data_words[0][:30])

['slow', 'decorrelated', 'features', 'pretraining', 'complex', 'cell', 'like', 'networks', 'yoshua', 'bengio', 'university', 'montreal', 'yoshuabengio', 'umontrealca', 'james', 'bergstra', 'university', 'montreal', 'jamesbergstra', 'umontrealca', 'abstract', 'introduce', 'new', 'type', 'neural', 'network', 'activation', 'function', 'based', 'recent']


In [16]:
import gensim.corpora as corpora

# Build the token dictionary from the tokenized documents.
id2word = corpora.Dictionary(data_words)

# `texts` is another name for the same tokenized documents.
texts = data_words

# Convert each document with the dictionary's `doc2bow`.
corpus = list(map(id2word.doc2bow, texts))

# Show the first 30 entries of the first converted document.
print(corpus[0][:30])

[(0, 3), (1, 1), (2, 2), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 6), (9, 4), (10, 1), (11, 4), (12, 3), (13, 17), (14, 2), (15, 1), (16, 2), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 2), (24, 4), (25, 3), (26, 1), (27, 1), (28, 21), (29, 14)]


In [18]:
from pprint import pprint

num_topics = 5

# Train the topic model. `random_state` seeds the training so the printed
# topics can be reproduced on a re-run.
# NOTE(review): with the multicore trainer, results may still vary slightly
# between runs because of worker scheduling - confirm against the gensim docs.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
)

# Print the top weighted words of each topic, then apply the trained model to
# the corpus.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.006*"data" + 0.005*"model" + 0.004*"learning" + 0.004*"set" + '
  '0.004*"figure" + 0.004*"one" + 0.003*"log" + 0.003*"algorithm" + '
  '0.003*"two" + 0.003*"function"'),
 (1,
  '0.007*"model" + 0.006*"learning" + 0.005*"data" + 0.004*"one" + '
  '0.004*"algorithm" + 0.004*"function" + 0.003*"using" + 0.003*"figure" + '
  '0.003*"set" + 0.003*"training"'),
 (2,
  '0.006*"model" + 0.005*"data" + 0.005*"learning" + 0.005*"set" + '
  '0.005*"algorithm" + 0.005*"function" + 0.004*"log" + 0.004*"one" + '
  '0.004*"using" + 0.003*"number"'),
 (3,
  '0.006*"learning" + 0.005*"model" + 0.004*"one" + 0.004*"data" + 0.004*"set" '
  '+ 0.004*"algorithm" + 0.004*"using" + 0.003*"two" + 0.003*"also" + '
  '0.003*"training"'),
 (4,
  '0.006*"model" + 0.005*"learning" + 0.005*"data" + 0.004*"function" + '
  '0.004*"two" + 0.003*"algorithm" + 0.003*"set" + 0.003*"problem" + '
  '0.003*"information" + 0.003*"training"')]
