# load packages

In [1]:
import re, string, unicodedata
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess, lemmatize
from gensim.models import CoherenceModel
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.models import LsiModel

nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
#from bs4 import BeautifulSoup
#from nltk import word_tokenize, sent_tokenize

# Plotting tools 
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
# %matplotlib inline

# Enable logging for gensim - optional
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
#logging.root.setLevel(level=logging.INFO)
#import warnings
#warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kornelius\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kornelius\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import data and prepare data

In [2]:
# Load dataset
df = pd.read_csv('C:/Users/Kornelius/Desktop/Data 2/nips-papers/papers.csv', header = 0, sep = ';', error_bad_lines=False)
# Drop the columns not needed
df = df.drop(columns=['id', 'event_type', 'pdf_name'], axis=1)

In [3]:
# Drop a row by condition
df = df[df.abstract != 'Abstract Missing']

In [4]:
data = df.abstract.values.tolist()

# Pre-processing (Baseline)

In [5]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(data):
    stop_free = " ".join([i for i in data.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

data = [clean(data).split() for data in data] 

In [6]:
# remove characters and numbers
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [7]:
data = data_words
# Create Dictionary
id2word = corpora.Dictionary(data)
print((id2word))

Dictionary(16129 unique tokens: ['activity', 'also', 'analysis', 'applied', 'assume']...)


In [8]:
# less than 10 documents,
id2word = corpora.Dictionary(data)
id2word.filter_extremes(no_below = 4)

In [9]:
# Create Corpus
texts = data
# Term Document Frequency and creating corpus
corpus = [id2word.doc2bow(text) for text in texts]

# LSI

In [10]:
# Build the LSI Model
lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=1, decay=0.5)

lsi_model5 = LsiModel(corpus=corpus, id2word=id2word, num_topics=5, decay=0.5)

lsi_model10 = LsiModel(corpus=corpus, id2word=id2word, num_topics=10, decay=0.5)

lsi_model20 = LsiModel(corpus=corpus, id2word=id2word, num_topics=20, decay=0.5)

lsi_model50 = LsiModel(corpus=corpus, id2word=id2word, num_topics=50, decay=0.5)

lsi_model100 = LsiModel(corpus=corpus, id2word=id2word, num_topics=100, decay=0.5)



In [11]:
# View Topics
pprint(lsi_model.print_topics(-1))

# Print the Keyword in the 5 topics
pprint(lsi_model5.print_topics(-1))
doc_lsi = lsi_model5[corpus]

# Print the Keyword in the 5 topics
pprint(lsi_model10.print_topics(-1))
doc_lsi = lsi_model10[corpus]

# Print the Keyword in the 20 topics
pprint(lsi_model20.print_topics(-1))
doc_lsi = lsi_model20[corpus]

# Print the Keyword in the 20 topics
pprint(lsi_model50.print_topics(-1))
doc_lsi = lsi_model50[corpus]

# Print the Keyword in the 40 topics
pprint(lsi_model100.print_topics(-1))
doc_lsi = lsi_model100[corpus]

[(0,
  '0.380*"model" + 0.312*"algorithm" + 0.260*"learning" + 0.235*"method" + '
  '0.227*"problem" + 0.222*"data" + 0.160*"show" + 0.136*"approach" + '
  '0.131*"function" + 0.123*"network"')]
[(0,
  '0.380*"model" + 0.312*"algorithm" + 0.260*"learning" + 0.235*"method" + '
  '0.227*"problem" + 0.222*"data" + 0.160*"show" + 0.136*"approach" + '
  '0.131*"function" + 0.123*"network"'),
 (1,
  '-0.728*"model" + 0.434*"algorithm" + 0.240*"problem" + 0.134*"function" + '
  '0.114*"learning" + -0.108*"network" + -0.103*"image" + 0.090*"bound" + '
  '-0.086*"neural" + 0.079*"method"'),
 (2,
  '0.519*"algorithm" + 0.391*"model" + -0.334*"learning" + -0.327*"network" + '
  '-0.296*"method" + -0.165*"image" + -0.155*"task" + -0.143*"feature" + '
  '-0.125*"deep" + -0.123*"neural"'),
 (3,
  '-0.574*"method" + 0.440*"learning" + 0.360*"network" + 0.292*"algorithm" + '
  '-0.254*"data" + -0.122*"matrix" + -0.104*"problem" + 0.100*"neural" + '
  '0.098*"deep" + 0.087*"task"'),
 (4,
  '0.588*"netw

# Evaluation

In [12]:
from gensim.models.coherencemodel import CoherenceModel

# Compute Perplexity not working?
#print('\nPerplexity: ', lsi_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.



coherence_model_lsa = CoherenceModel(model=lsi_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('\nCoherence Score: ', coherence_lsa)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model5, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score5: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model10, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score10: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model20, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score20: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model50, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score50: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model100, texts=data, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score100: ', coherence_lsi)


coherence_model_lsa = CoherenceModel(model=lsi_model, texts=texts, dictionary=id2word, coherence='u_mass')
coherence_lsa = coherence_model_lsa.get_coherence()
print('\nCoherence Score: ', coherence_lsa)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model5, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score5: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model10, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score10: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model20, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score20: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model50, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score50: ', coherence_lsi)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model100, texts=data, dictionary=id2word, coherence='u_mass')
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score100: ', coherence_lsi)


Coherence Score:  0.2627923251812317

Coherence Score5:  0.4052500695756426

Coherence Score10:  0.3227329434451162

Coherence Score20:  0.32978395001874505

Coherence Score50:  0.26423153175216396

Coherence Score100:  0.23638106012127644

Coherence Score:  -1.3206663098063114

Coherence Score5:  -1.943964901627469

Coherence Score10:  -2.1270621799675835

Coherence Score20:  -2.1483660187754046

Coherence Score50:  -2.260180135326383

Coherence Score100:  -2.4729963021171226
