In [20]:
import pandas as pd
import os

papers = pd.read_csv("data/NEURIPS Papers/papers.csv")

In [21]:
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [22]:
papers = papers.drop(columns=['id','event_type','pdf_name'], axis=1).sample(100)

In [23]:
papers.head()

Unnamed: 0,year,title,abstract,paper_text
4721,2014,Global Belief Recursive Neural Networks,Recursive Neural Networks have recently obtain...,Global Belief Recursive Neural Networks\nRomai...
338,1996,Interpreting Images by Propagating Bayesian Be...,Abstract Missing,Interpreting images by propagating\nBayesian b...
6333,1993,A Connectionist Model of the Owl's Sound Local...,Abstract Missing,A Connectionist Model of the Owl's\nSound Loca...
5268,1991,Reverse TDNN: An Architecture For Trajectory G...,Abstract Missing,Reverse TDNN: An Architecture for Trajectory\n...
5058,1991,Analog LSI Implementation of an Auto-Adaptive ...,Abstract Missing,?\n\n?\n\n \n\t \n\n \n ...


In [24]:
import re

papers['paper_text_processed'] = \
papers['paper_text'].map(lambda x: re.sub('[,\.!?]','',x))
papers['paper_text_processed'] = \
papers['paper_text_processed'].map(lambda x: x.lower())
papers['paper_text_processed'].head()

4721    global belief recursive neural networks\nromai...
338     interpreting images by propagating\nbayesian b...
6333    a connectionist model of the owl's\nsound loca...
5268    reverse tdnn: an architecture for trajectory\n...
5058    \n\n\n\n 
\n\t \n\n

 \n  ...
Name: paper_text_processed, dtype: object

In [25]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shionguha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [27]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [28]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc))
            if word not in stop_words] for doc in texts]

In [29]:
data = papers.paper_text_processed.values.tolist()
data_words = list(sent_to_words(data))

In [30]:
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['global', 'belief', 'recursive', 'neural', 'networks', 'romain', 'paulus', 'richard', 'socher', 'metamind', 'palo', 'alto', 'ca', 'romainrichard', 'metamindio', 'christopher', 'manning', 'stanford', 'university', 'serra', 'mall', 'stanford', 'ca', 'manning', 'stanfordedu', 'abstract', 'recursive', 'neural', 'networks', 'recently']


In [31]:
import gensim.corpora as corpora

id2word = corpora.Dictionary(data_words)

texts = data_words

corpus = [id2word.doc2bow(text) for text in texts]

print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 2), (6, 4), (7, 1), (8, 2), (9, 1), (10, 2), (11, 1), (12, 3), (13, 1), (14, 1), (15, 1), (16, 3), (17, 7), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 2), (25, 1), (26, 3), (27, 1), (28, 1), (29, 1)]


In [32]:
from pprint import pprint

num_topics = 5

lda_model = gensim.models.LdaMulticore(corpus=corpus,id2word=id2word,num_topics=num_topics)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.006*"model" + 0.005*"time" + 0.005*"learning" + 0.005*"algorithm" + '
  '0.004*"data" + 0.004*"one" + 0.004*"set" + 0.003*"using" + 0.003*"function" '
  '+ 0.003*"problem"'),
 (1,
  '0.006*"algorithm" + 0.006*"learning" + 0.004*"using" + 0.004*"network" + '
  '0.004*"time" + 0.004*"problem" + 0.004*"data" + 0.004*"set" + 0.004*"model" '
  '+ 0.004*"function"'),
 (2,
  '0.006*"data" + 0.005*"learning" + 0.005*"algorithm" + 0.005*"using" + '
  '0.005*"model" + 0.004*"set" + 0.004*"problem" + 0.004*"log" + 0.003*"time" '
  '+ 0.003*"distribution"'),
 (3,
  '0.006*"model" + 0.005*"data" + 0.005*"algorithm" + 0.005*"learning" + '
  '0.004*"set" + 0.004*"function" + 0.003*"using" + 0.003*"one" + 0.003*"time" '
  '+ 0.003*"problem"'),
 (4,
  '0.007*"model" + 0.005*"algorithm" + 0.005*"set" + 0.005*"one" + '
  '0.005*"learning" + 0.004*"data" + 0.004*"function" + 0.004*"using" + '
  '0.004*"problem" + 0.004*"network"')]


In [33]:
#BTM example
import numpy as np
import bitermplus as btm
import tmplot as tmp
import pickle as pkl
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [34]:
df = pd.read_csv('/Users/shionguha/Documents/GitHub/inf2209-humancentredtopicmodels-fa21/data/SearchSnippets.txt.gz')

In [36]:
texts = df['texts'].str.strip().tolist()

KeyError: 'texts'

In [None]:
#preprocessing
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
tf = np.array(X.sum(axis=0)).ravel()

In [None]:
docs_vec = btm.get_vectorized_docs(texts,vocabulary)
docs_lens = list(map(len,docs_vec))

biterms = btm.get_biterms(docs_vec)