# load packages

In [1]:
import re, string, unicodedata
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess, lemmatize
from gensim.models import CoherenceModel
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.models import LsiModel

nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
#from bs4 import BeautifulSoup
#from nltk import word_tokenize, sent_tokenize

# Plotting tools 
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
# %matplotlib inline

# Enable logging for gensim - optional
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
#logging.root.setLevel(level=logging.INFO)
#import warnings
#warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kornelius\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kornelius\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import data and prepare data

In [2]:
# Load dataset
df = pd.read_csv('C:/Users/Kornelius/Desktop/Data 2/nips-papers/papers.csv', header = 0, sep = ';', error_bad_lines=False)
# Drop the columns not needed
df = df.drop(columns=['id', 'event_type', 'pdf_name'], axis=1)

In [3]:
# Drop a row by condition
df = df[df.abstract != 'Abstract Missing']

In [4]:
data = df.abstract.values.tolist()

# Pre-processing (Baseline)

In [5]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
ps = PorterStemmer()
def clean(data):
    stop_free = " ".join([i for i in data.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(ps.stem(word) for word in punc_free.split())
    return normalized

data = [clean(data).split() for data in data]   

In [6]:
# remove characters and numbers
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [7]:
data = data_words
# Create Dictionary
id2word = corpora.Dictionary(data)
print((id2word))

Dictionary(12827 unique tokens: ['activ', 'also', 'analysi', 'appli', 'assum']...)


In [8]:
# Create Corpus
texts = data
# Term Document Frequency and creating corpus
corpus = [id2word.doc2bow(text) for text in texts]

# LSI

In [9]:
# Build the LSI Model
lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=1, decay=0.5)

lsi_model5 = LsiModel(corpus=corpus, id2word=id2word, num_topics=5, decay=0.5)

lsi_model10 = LsiModel(corpus=corpus, id2word=id2word, num_topics=10, decay=0.5)

lsi_model20 = LsiModel(corpus=corpus, id2word=id2word, num_topics=20, decay=0.5)

lsi_model50 = LsiModel(corpus=corpus, id2word=id2word, num_topics=50, decay=0.5)

lsi_model100 = LsiModel(corpus=corpus, id2word=id2word, num_topics=100, decay=0.5)



In [10]:
# View Topics
pprint(lsi_model.print_topics(-1))

# Print the Keyword in the 5 topics
pprint(lsi_model5.print_topics(-1))
doc_lsi = lsi_model5[corpus]

# Print the Keyword in the 5 topics
pprint(lsi_model10.print_topics(-1))
doc_lsi = lsi_model10[corpus]

# Print the Keyword in the 20 topics
pprint(lsi_model20.print_topics(-1))
doc_lsi = lsi_model20[corpus]

# Print the Keyword in the 20 topics
pprint(lsi_model50.print_topics(-1))
doc_lsi = lsi_model50[corpus]

# Print the Keyword in the 40 topics
pprint(lsi_model100.print_topics(-1))
doc_lsi = lsi_model100[corpus]

[(0,
  '0.352*"model" + 0.290*"learn" + 0.267*"algorithm" + 0.209*"use" + '
  '0.200*"method" + 0.192*"problem" + 0.190*"data" + 0.149*"propos" + '
  '0.148*"show" + 0.138*"gener"')]
[(0,
  '0.352*"model" + 0.290*"learn" + 0.267*"algorithm" + 0.209*"use" + '
  '0.200*"method" + 0.192*"problem" + 0.190*"data" + 0.149*"propos" + '
  '0.148*"show" + 0.138*"gener"'),
 (1,
  '0.731*"model" + -0.407*"algorithm" + -0.230*"problem" + -0.164*"optim" + '
  '-0.130*"function" + -0.099*"bound" + 0.099*"network" + 0.094*"imag" + '
  '-0.088*"method" + 0.081*"neural"'),
 (2,
  '0.692*"learn" + -0.294*"algorithm" + -0.294*"model" + 0.216*"network" + '
  '-0.172*"estim" + 0.150*"task" + 0.141*"train" + 0.134*"imag" + '
  '0.110*"featur" + 0.107*"deep"'),
 (3,
  '0.506*"algorithm" + -0.490*"method" + 0.390*"learn" + 0.272*"model" + '
  '-0.232*"estim" + -0.137*"use" + -0.135*"network" + -0.109*"data" + '
  '-0.104*"imag" + -0.097*"propos"'),
 (4,
  '-0.649*"network" + 0.347*"data" + 0.273*"learn" + -0.

# Evaluation

In [12]:
from gensim.models.coherencemodel import CoherenceModel

# Compute Perplexity not working?
#print('\nPerplexity: ', lsi_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.




coherence_model_lsa = CoherenceModel(model=lsi_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('\nCoherence Score: ', coherence_lsa)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model5, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score5: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model10, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score10: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model20, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score20: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model50, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score50: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model100, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score100: ', coherence_lsi)

coherence_model_lsa = CoherenceModel(model=lsi_model, texts=texts, dictionary=id2word, coherence='u_mass')
coherence_lsa = coherence_model_lsa.get_coherence()
print('\nCoherence Score: ', coherence_lsa)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model5, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score5: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model10, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score10: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model20, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score20: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model50, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score50: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model100, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score100: ', coherence_lsi)


Coherence Score:  0.25545931125308097

Coherence Score5:  0.39581850693804266

Coherence Score10:  0.33438864521992995

Coherence Score20:  0.3129252455390608

Coherence Score50:  0.26852417815632545

Coherence Score100:  0.23019417570332634

Coherence Score:  -1.1490767037588188

Coherence Score5:  -2.1067155489734253

Coherence Score10:  -1.9292452901755552

Coherence Score20:  -1.8362967538194226

Coherence Score50:  -2.010876425762292

Coherence Score100:  -2.2579269381318907
