In [1]:
import pandas as pd
import os

# Load the papers table from the CSV file into a DataFrame.
# The path is relative, so the notebook must be run from the directory that
# contains the `data/` folder.
papers = pd.read_csv("data/NEURIPS Papers/papers.csv")

In [2]:
# Preview the first rows to see which columns the raw table provides.
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [3]:
# Drop the columns that are not used further down and continue with a
# 100-paper random subset of the table.
#   * `columns=` already names the axis, so the redundant `axis=1` is removed.
#   * `errors='ignore'` keeps the cell re-runnable: on a second execution the
#     columns are already gone and a plain drop would raise KeyError.
#   * `random_state` pins the sample so a fresh "Restart & Run All" analyses
#     the same papers every time.
papers = (
    papers
    .drop(columns=['id', 'event_type', 'pdf_name'], errors='ignore')
    .sample(n=100, random_state=42)
)

In [4]:
# Preview the reduced table: the remaining columns and the sampled row labels.
papers.head()

Unnamed: 0,year,title,abstract,paper_text
1707,2004,An Auditory Paradigm for Brain-Computer Interf...,Abstract Missing,An Auditory Paradigm for\nBrain?Computer Inter...
2799,2008,Differentiable Sparse Coding,Abstract Missing,Differentiable Sparse Coding\n\nDavid M. Bradl...
1622,2003,A Functional Architecture for Motion Pattern P...,Abstract Missing,A Functional Architecture for Motion\nPattern ...
5138,2015,Bidirectional Recurrent Neural Networks as Gen...,Bidirectional recurrent neural networks (RNN) ...,Bidirectional Recurrent Neural Networks as\nGe...
670,1998,An Integrated Vision Sensor for the Computatio...,Abstract Missing,An Integrated Vision Sensor for the\nComputati...


In [5]:
import re

# Normalise the raw paper text before tokenisation:
#   1. remove the punctuation characters , . ! ?
#   2. lower-case everything
#
# The pattern is a raw string without the backslash: "\." in an ordinary string
# literal is an invalid escape sequence (it triggers a warning on current
# Python versions), and "." needs no escaping inside a character class.
#
# The vectorised `.str` accessor replaces the two per-row lambdas and lets
# missing values pass through as NaN instead of raising a TypeError.
papers['paper_text_processed'] = (
    papers['paper_text']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)
papers['paper_text_processed'].head()

1707    an auditory paradigm for\nbraincomputer interf...
2799    differentiable sparse coding\n\ndavid m bradle...
1622    a functional architecture for motion\npattern ...
5138    bidirectional recurrent neural networks as\nge...
670     an integrated vision sensor for the\ncomputati...
Name: paper_text_processed, dtype: object

In [6]:
import gensim
from gensim.utils import simple_preprocess
import nltk
# Fetch the NLTK stop-word corpus; it must be available locally before
# `stopwords.words(...)` can be called.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shionguha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Stop-word list used when filtering tokens: NLTK's English list followed by a
# handful of extra tokens that should also be discarded.
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use',
]

In [8]:
def sent_to_words(sentences):
    """Lazily tokenise documents, yielding one token list per input item.

    Each item is coerced to ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``.

    Parameters
    ----------
    sentences : iterable
        The documents to tokenise.

    Yields
    ------
    list
        The tokens ``simple_preprocess`` returns for the current document.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

In [9]:
def remove_stopwords(texts):
    """Tokenise each document and drop every token listed in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents to filter. Each item may be a raw string or an already
        tokenised list/tuple of words.

    Returns
    -------
    list of list
        One filtered token list per input document, in input order.
    """
    # Snapshot the module-level list into a set once, so each membership test
    # is a hash lookup instead of a linear scan repeated for every token.
    blocked = set(stop_words)

    def _as_text(doc):
        # Re-join token lists with spaces so they are re-tokenised from their
        # words rather than from the list's repr (brackets, quotes, commas and
        # any escape sequences the repr may introduce).
        if isinstance(doc, (list, tuple)):
            return " ".join(str(token) for token in doc)
        return str(doc)

    return [
        [word for word in simple_preprocess(_as_text(doc)) if word not in blocked]
        for doc in texts
    ]

In [10]:
# Pull the cleaned text out of the DataFrame as a plain Python list, then
# tokenise every document and materialise the generator's results.
data = papers['paper_text_processed'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

In [11]:
# Filter stop words out of every tokenised document, then show the first
# 30 remaining tokens of the first document as a sanity check.
data_words = remove_stopwords(data_words)
print(data_words[0][:30])

['auditory', 'paradigm', 'braincomputer', 'interfaces', 'jeremy', 'hill', 'navin', 'lal', 'karin', 'bierig', 'niels', 'birbaumer', 'bernhard', 'sch', 'olkopf', 'max', 'planck', 'institute', 'biological', 'cybernetics', 'spemannstrae', 'ubingen', 'germany', 'jez', 'navin', 'bierig', 'bs', 'tuebingenmpgde', 'institute', 'medical']


In [12]:
import gensim.corpora as corpora

# Build the gensim dictionary from the filtered token lists.
id2word = corpora.Dictionary(data_words)

# `texts` is another name for the same token lists.
texts = data_words

# Convert each document to its bag-of-words form: (token id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# Show the first 30 (token id, count) pairs of the first document.
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 2), (5, 1), (6, 1), (7, 1), (8, 2), (9, 2), (10, 2), (11, 2), (12, 2), (13, 1), (14, 1), (15, 2), (16, 3), (17, 2), (18, 4), (19, 1), (20, 1), (21, 3), (22, 3), (23, 1), (24, 1), (25, 4), (26, 1), (27, 1), (28, 1), (29, 1)]


In [13]:
from pprint import pprint

# Number of topics the LDA model is asked to find.
num_topics = 5

# Fit the LDA model on the bag-of-words corpus.
# `random_state` seeds the model's random number generator so the topics do
# not change arbitrarily between runs.
# NOTE(review): multi-worker training may still introduce small run-to-run
# differences - confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
)

# Print each topic as a weighted list of its top words.
pprint(lda_model.print_topics())

# Apply the fitted model to the corpus to obtain per-document topic output.
doc_lda = lda_model[corpus]

[(0,
  '0.009*"learning" + 0.007*"model" + 0.006*"data" + 0.005*"algorithm" + '
  '0.005*"function" + 0.004*"using" + 0.004*"set" + 0.004*"models" + '
  '0.003*"training" + 0.003*"two"'),
 (1,
  '0.008*"learning" + 0.006*"model" + 0.005*"data" + 0.005*"log" + '
  '0.004*"algorithm" + 0.004*"function" + 0.004*"set" + 0.004*"one" + '
  '0.004*"using" + 0.004*"time"'),
 (2,
  '0.007*"data" + 0.006*"learning" + 0.005*"model" + 0.005*"algorithm" + '
  '0.004*"one" + 0.004*"function" + 0.003*"models" + 0.003*"matrix" + '
  '0.003*"using" + 0.003*"set"'),
 (3,
  '0.006*"learning" + 0.006*"data" + 0.005*"model" + 0.005*"function" + '
  '0.004*"using" + 0.004*"algorithm" + 0.004*"one" + 0.003*"set" + 0.003*"log" '
  '+ 0.003*"time"'),
 (4,
  '0.007*"model" + 0.005*"set" + 0.005*"learning" + 0.005*"using" + '
  '0.004*"data" + 0.004*"algorithm" + 0.004*"one" + 0.003*"distribution" + '
  '0.003*"time" + 0.003*"function"')]
