In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nips-papers/paper_authors.csv
/kaggle/input/nips-papers/papers.csv
/kaggle/input/nips-papers/authors.csv
/kaggle/input/nips-papers/database.sqlite


In [2]:
papers = pd.read_csv('/kaggle/input/nips-papers/papers.csv')
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [3]:
# Remove the columns
papers = papers.drop(columns=['id', 'title', 'abstract', 
                              'event_type', 'pdf_name', 'year'], axis=1)# sample only 100 papers
papers = papers.sample(100)# Print out the first rows of papers
papers.head()

Unnamed: 0,paper_text
6699,Synchronization and Grammatical Inference\nin ...
2113,Searching for Character Models\n\nJaety Edward...
4661,Learning Generative Models with the\nUp-Propag...
63,Adaptive Retina with Center-Surround\nReceptiv...
2002,A Bayesian Spatial Scan Statistic\n\nDaniel B....


## Data cleaning

### Remove punctuation/lower casing

In [4]:
# Load the regular expression library
import re# Remove punctuation
papers['paper_text_processed'] = papers['paper_text'].map(lambda x: re.sub('[,\.!?]', '', x))# Convert the titles to lowercase
papers['paper_text_processed'] = papers['paper_text_processed'].map(lambda x: x.lower())# Print out the first rows of papers
papers['paper_text_processed'].head()

6699    synchronization and grammatical inference\nin ...
2113    searching for character models\n\njaety edward...
4661    learning generative models with the\nup-propag...
63      adaptive retina with center-surround\nreceptiv...
2002    a bayesian spatial scan statistic\n\ndaniel b ...
Name: paper_text_processed, dtype: object

### Tokenize words and further clean-up text

In [6]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
data = papers.paper_text_processed.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0][:30])

['synchronization', 'and', 'grammatical', 'inference', 'in', 'an', 'oscillating', 'elman', 'net', 'bill', 'baird', 'dept', 'mathematics', 'cberkeley', 'berkeley', 'ca', 'baird', 'mathberkeleyedu', 'todd', 'troyer', 'dept', 'mathematics', 'cberkeley', 'berkeley', 'ca', 'frank', 'eeckman', 'lawrence', 'livermore', 'national']


## Phrase Modeling: Bigram and Trigram Models

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)# Faster way to get a sentence clubbed as a trigram/bigram

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

### Remove Stopwords, Make Bigrams and Lemmatize

In [10]:
# NLTK Stop words
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Define functions for stopwords, bigrams, trigrams and lemmatization

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
import spacy# Remove Stop Words

data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['synchronization', 'oscillate', 'elman', 'net', 'bill', 'abstract', 'design', 'architecture', 'span', 'gap', 'biophysic', 'cognitive_science', 'address', 'explore', 'issue', 'discrete', 'symbol', 'processing', 'system', 'arise', 'continuum', 'complex', 'dynamic', 'oscillation', 'synchronization', 'employ', 'operation', 'affect', 'learn', 'show', 'discrete', 'time', 'recurrent', 'elman', 'network', 'architecture', 'construct', 'recurrently', 'connect', 'oscillatory', 'module', 'describe', 'continuous', 'nonlinear', 'ordinary', 'module', 'learn', 'connection', 'weight', 'cause', 'system', 'evolve', 'clock', 'machine', 'cycle', 'sequence', 'transition', 'attractor', 'module', 'much', 'digital', 'computer', 'evolve', 'transition', 'binary', 'attractor', 'architecture', 'thus', 'employ', 'principle', 'computing', 'attractor', 'use', 'macroscopic', 'system', 'reliable', 'presence', 'noise', 'specifically', 'construct', 'system', 'function', 'recognize', 'generate', 'infinite', 'set', 'symb

## Data Transformation: Corpus and Dictionary

In [12]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

## Base Model

In [13]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

### View the topics in LDA model

In [14]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.010*"use" + 0.008*"loss" + 0.008*"show" + 0.008*"price" + 0.008*"figure" '
  '+ 0.007*"function" + 0.007*"output" + 0.007*"input" + 0.006*"result" + '
  '0.006*"circuit"'),
 (1,
  '0.013*"function" + 0.009*"use" + 0.009*"cost" + 0.007*"set" + 0.006*"time" '
  '+ 0.006*"show" + 0.006*"solution" + 0.006*"problem" + 0.005*"method" + '
  '0.005*"algorithm"'),
 (2,
  '0.009*"learn" + 0.009*"set" + 0.009*"model" + 0.009*"policy" + '
  '0.009*"image" + 0.008*"worker" + 0.008*"packet" + 0.008*"datum" + '
  '0.007*"problem" + 0.007*"use"'),
 (3,
  '0.019*"network" + 0.010*"function" + 0.010*"use" + 0.009*"learn" + '
  '0.008*"task" + 0.008*"neural" + 0.008*"training" + 0.007*"cell" + '
  '0.007*"policy" + 0.007*"set"'),
 (4,
  '0.021*"model" + 0.010*"use" + 0.008*"state" + 0.008*"memory" + '
  '0.007*"distribution" + 0.006*"datum" + 0.006*"show" + 0.006*"give" + '
  '0.005*"sample" + 0.005*"learn"'),
 (5,
  '0.013*"problem" + 0.009*"model" + 0.006*"algorithm" + 0.006*"parameter" + '
 

### Compute Model Perplexity and Coherence Score

In [15]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, 
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.27168941102212896


## Hyperparameter Tuning

In [16]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

Let’s call the function, and iterate it over the range of topics, alpha, and beta parameter values.

In [19]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [25]:
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, k=k, a=a, b=b)

                    # Save the model results
#                     model_results['Validation_Set'].append(corpus_title[i])
#                     model_results['Topics'].append(k)
#                     model_results['Alpha'].append(a)
#                     model_results['Beta'].append(b)
#                     model_results['Coherence'].append(cv)
                    
#                     pbar.update(1)
                    print(f'topic is {k}, alpha is {a}, beta is {b}, cv is {cv}')
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()


  0%|          | 0/540 [00:00<?, ?it/s][A

ValueError: Stop argument for islice() must be None or an integer: 0 <= x <= sys.maxsize.

## Final Model

In [26]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.01,
                                           eta=0.9)

  0%|          | 0/540 [11:28<?, ?it/s]
  0%|          | 0/540 [11:29<?, ?it/s]
  0%|          | 0/540 [11:29<?, ?it/s]


## Visualize Topics

In [27]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

LDAvis_prepared

  0%|          | 0/540 [12:15<?, ?it/s]


ModuleNotFoundError: No module named 'pyLDAvis.gensim'