## Install packages

In [1]:
#pip install --upgrade gensim
#pip install spacy==2.1.0
#pip install neuralcoref
#pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz

## Import packages

In [16]:
import spacy
from spacy.lang.en.examples import sentences 
import neuralcoref
import en_core_web_sm
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import words, stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer, porter
from nltk.tokenize import word_tokenize

import gensim
from gensim import corpora, models, similarities, downloader
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel

import re

import numpy as np
#To fix random output
np.random.seed(2018)

#Compute execution time
from datetime import datetime

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jason13nn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jason13nn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Preprocessing

In [3]:
# Corpus: 20-Newsgroups dataset 
# Import Dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
df.head()

#stopwords
stop_words = stopwords.words('english')

# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

# Creating Bigram
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Remove Stopwords, Make Bigrams and Lemmatize
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

nlp = spacy.load("en_core_web_sm")

In [4]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Do lemmatization keeping only noun
data_lemmatized_noun = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])

### LDA Model with noun only vs. only noun, adj, vb, adv

In [18]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
id2word_noun = corpora.Dictionary(data_lemmatized_noun)

# Create Corpus
texts = data_lemmatized
texts_noun = data_lemmatized_noun

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
corpus_noun = [id2word_noun.doc2bow(text) for text in texts_noun]

### LDA with all words

In [19]:
start_time = datetime.now()
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:03:23.739249


In [7]:
# Print the Keyword in the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0, '0.090*"team" + 0.089*"game" + 0.046*"play" + 0.046*"year" + 0.028*"season" + 0.027*"fan" + 0.021*"nhl" + 0.018*"division" + 0.017*"boston" + 0.015*"lose"'), (1, '0.141*"space" + 0.038*"earth" + 0.035*"launch" + 0.031*"mission" + 0.030*"orbit" + 0.029*"nasa" + 0.025*"satellite" + 0.022*"mar" + 0.022*"plane" + 0.016*"map"'), (2, '0.072*"drive" + 0.046*"card" + 0.041*"mac" + 0.036*"driver" + 0.030*"cpu" + 0.026*"memory" + 0.022*"scsi" + 0.021*"chip" + 0.021*"machine" + 0.020*"device"'), (3, '0.037*"armenian" + 0.031*"soldier" + 0.029*"greek" + 0.027*"village" + 0.025*"jew" + 0.023*"turk" + 0.021*"muslim" + 0.019*"turkish" + 0.016*"occupy" + 0.016*"jewish"'), (4, '0.114*"image" + 0.077*"scan" + 0.063*"format" + 0.035*"brian" + 0.031*"nec" + 0.019*"specification" + 0.019*"finland" + 0.016*"hewlett_packard" + 0.014*"pd" + 0.010*"timing"'), (5, '0.067*"line" + 0.064*"subject" + 0.061*"organization" + 0.035*"write" + 0.030*"article" + 0.025*"university" + 0.023*"host" + 0.017*"get" + 0.0

Topic 0 is a represented as '0.048*"peter" + 0.036*"ice" + 0.024*"spring" + 0.022*"last_night" + 0.020*"minnesota" + 0.019*"speaker" + 0.017*"backup" + 0.017*"custom" + 0.013*"significance" + 0.011*"tour"'.

It means the top 10 keywords that contribute to this topic are: ‘peter’, ‘ice’, ‘spring’.. and so on and the weight of ‘peter’ on topic 0 is 0.048.

The weights reflect how important a keyword is to that topic.

### LDA with noun only

In [20]:
start_time = datetime.now()
lda_model_noun = gensim.models.ldamodel.LdaModel(corpus=corpus_noun,
                                           id2word=id2word_noun,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:02:23.717327


In [9]:
# Print the Keyword in the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0, '0.090*"team" + 0.089*"game" + 0.046*"play" + 0.046*"year" + 0.028*"season" + 0.027*"fan" + 0.021*"nhl" + 0.018*"division" + 0.017*"boston" + 0.015*"lose"'), (1, '0.141*"space" + 0.038*"earth" + 0.035*"launch" + 0.031*"mission" + 0.030*"orbit" + 0.029*"nasa" + 0.025*"satellite" + 0.022*"mar" + 0.022*"plane" + 0.016*"map"'), (2, '0.072*"drive" + 0.046*"card" + 0.041*"mac" + 0.036*"driver" + 0.030*"cpu" + 0.026*"memory" + 0.022*"scsi" + 0.021*"chip" + 0.021*"machine" + 0.020*"device"'), (3, '0.037*"armenian" + 0.031*"soldier" + 0.029*"greek" + 0.027*"village" + 0.025*"jew" + 0.023*"turk" + 0.021*"muslim" + 0.019*"turkish" + 0.016*"occupy" + 0.016*"jewish"'), (4, '0.114*"image" + 0.077*"scan" + 0.063*"format" + 0.035*"brian" + 0.031*"nec" + 0.019*"specification" + 0.019*"finland" + 0.016*"hewlett_packard" + 0.014*"pd" + 0.010*"timing"'), (5, '0.067*"line" + 0.064*"subject" + 0.061*"organization" + 0.035*"write" + 0.030*"article" + 0.025*"university" + 0.023*"host" + 0.017*"get" + 0.0

### Comparison: Coherence Score

In [10]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
coherence_model_lda_noun = CoherenceModel(model=lda_model_noun, texts=data_lemmatized_noun, dictionary=id2word_noun, coherence='c_v')
coherence_lda_noun = coherence_model_lda_noun.get_coherence()

print('\nCoherence Score for LDA with all: ', coherence_lda)
print('\nCoherence Score for LDA with noun only: ', coherence_lda_noun)


Coherence Score for LDA with all:  0.5431104817051807

Coherence Score for LDA with noun only:  0.501404199685498


There you have the difference of coherence score between two models are small.
Therefore, we can use the model with noun only because it greatly reduce the execution time. (16,004,176 to 8,940,795)

## Neural Coref

In [12]:
## Data preprocessing
# Neural Coreference
nlp = en_core_web_sm.load()
neuralcoref.add_to_pipe(nlp)

nlp.max_length = 1000000

data_nc = ' '.join(map(str, data))
data_nc = data_nc[:100000]
data_nc = nlp(data_nc)
data_nc = nlp(data_nc._.coref_resolved)

#Remove punctuations
data_words_nc = list(sent_to_words(data_nc))

# Creating Bigram
bigram_nc = gensim.models.Phrases(data_words_nc, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod_nc = gensim.models.phrases.Phraser(bigram_nc)

# Remove Stop Words
data_words_nc_nostops = remove_stopwords(data_words_nc)

# Form Bigrams
data_words_nc_bigrams = make_bigrams(data_words_nc_nostops)

# Do lemmatization keeping only noun
data_nc_lemmatized_noun = lemmatization(data_words_nc_bigrams, allowed_postags=['NOUN'])
data_nc_lemmatized_noun = [x for x in data_nc_lemmatized_noun if x] 

### LDA model with noun only using neural coref

In [13]:
# Create Dictionary
id2word_nc_noun = corpora.Dictionary(data_nc_lemmatized_noun)

# Create Corpus
texts_nc_noun = data_nc_lemmatized_noun

# Term Document Frequency
corpus_nc_noun = [id2word_nc_noun.doc2bow(text) for text in texts_nc_noun]

In [14]:
lda_model_nc_noun = gensim.models.ldamodel.LdaModel(corpus=corpus_nc_noun,
                                           id2word=id2word_nc_noun,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [15]:
coherence_model_lda_nc_noun = CoherenceModel(model=lda_model_nc_noun, texts=data_nc_lemmatized_noun, dictionary=id2word_nc_noun, coherence='c_v')
coherence_lda_nc_noun = coherence_model_lda_nc_noun.get_coherence()

print('\nCoherence Score for LDA with noun only using neural coref: ', coherence_lda_nc_noun)


Coherence Score for LDA with noun only using neural coref:  0.8218067004115035


Based on coherence score, we can see the model improved using neural coref.