In [4]:
import numpy as np
import pandas as pd
from pprint import pprint


# Regular Expressions Library
import re

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy for lemmatization
import spacy

# Import plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# nltk related libraries
import nltk
from nltk.corpus import stopwords

In [5]:
# Preparing stopwords
stopwords = stopwords.words('english')

# Adding new words into the stopwords l

In [6]:
# Loading in the newsgroup dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

In [None]:
# Target_names is the category that the email belongs to
# Target is the numerical representation that the email belongs to
pprint(df.target_names.unique())
df.head()

In [7]:
# Let's take a look of what's inside the contents column
pprint(df.iloc[0]['content'])

("From: lerxst@wam.umd.edu (where's my thing)\n"
 'Subject: WHAT car is this!?\n'
 'Nntp-Posting-Host: rac3.wam.umd.edu\n'
 'Organization: University of Maryland, College Park\n'
 'Lines: 15\n'
 '\n'
 ' I was wondering if anyone out there could enlighten me on this car I saw\n'
 'the other day. It was a 2-door sports car, looked to be from the late 60s/\n'
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition,\n'
 'the front bumper was separate from the rest of the body. This is \n'
 'all I know. If anyone can tellme a model name, engine specs, years\n'
 'of production, where this car is made, history, or whatever info you\n'
 'have on this funky looking car, please e-mail.\n'
 '\n'
 'Thanks,\n'
 '- IL\n'
 '   ---- brought to you by your neighborhood Lerxst ----\n'
 '\n'
 '\n'
 '\n'
 '\n')


In [8]:
# Converting the data in a list
data = df.content.values.tolist()

# Removing the emails with regular expression
data = [re.sub('\S*@\S*\s?', '', x) for x in data]

# Removing \n
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", y) for y in data]

  data = [re.sub('\S*@\S*\s?', '', x) for x in data]
  data = [re.sub('\s+', ' ', sent) for sent in data]


In [9]:
pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [10]:
# Using GenSim's simple_preproess function to remove punctuations
# And to change it into a list of list of individual words
# Each sublist is one 'document/sentence'
# deacc=True is a parameter that controls whether punctuations are removed
def sentence_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [11]:
data = list(sentence_to_words(data))
print(data[:2])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst'], ['from', 'guy', 'kuo', 'subje

In [12]:
# Building the bigram and trigram models
# Higher threshold fewer phrases.
### Go find out what each parameter do in detail ###
bigram = gensim.models.Phrases(data, min_count=5, threshold=100) 
trigram = gensim.models.Phrases(bigram[data], threshold=100)  

In [13]:
print(trigram[bigram[data[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [14]:
# Creating a function to remove stopwords, create bigrams & trigrams 
# and lemmatize the documents

# Stopwords function
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in texts]

# Bigrams function
def make_bigrams(texts):
    return [bigram[doc] for doc in texts]

# Trigrams function
def make_trigrams(texts):
    return [trigram[bigram[doc]] for doc in texts]

# Lemmatization function
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [15]:
# Removing stopwords
data = remove_stopwords(data)
pprint(data[:1])

[['wheres',
  'thing',
  'subject',
  'car',
  'nntp',
  'posting',
  'host',
  'rac',
  'wam',
  'umd',
  'edu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  'door',
  'sports',
  'car',
  'looked',
  'late',
  'early',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'mail',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst']]


In [16]:
# Forming bigrams
data = make_bigrams(data)
pprint(data[:1])

[['wheres',
  'thing',
  'subject',
  'car',
  'nntp_posting',
  'host',
  'rac_wam',
  'umd_edu',
  'organization',
  'university',
  'maryland_college',
  'park',
  'lines',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  'door',
  'sports',
  'car',
  'looked',
  'late',
  'early',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front_bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'mail',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst']]


In [17]:
# Lemmatize words
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [18]:
data = lemmatization(data, 
                     allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [19]:
data[:1]

[['where',
  's',
  'thing',
  'subject',
  'car',
  'nntp_poste',
  'host',
  'organization',
  'university',
  'maryland_college',
  'park',
  'line',
  'wonder',
  'anyone',
  'could',
  'enlighten',
  'car',
  'see',
  'day',
  'door',
  'sport',
  'car',
  'look',
  'late',
  'early',
  'call',
  'bricklin',
  'door',
  'really',
  'small',
  'addition',
  'front_bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'year',
  'production',
  'car',
  'make',
  'history',
  'info',
  'funky',
  'look',
  'car',
  'mail',
  'thank',
  'bring',
  'neighborhood',
  'lerxst']]

In [20]:
# Creating the dictionary of words
dict = corpora.Dictionary(data)
## dict.token2id ## - Use to check the unique ids of each word

In [21]:
# Creating the corpus
corpus = [dict.doc2bow(sent) for sent in data]
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 5), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1)]]


In [22]:
# Building the LDA model
# LDA is one of the models used for topic modelling
## Go check out what each hyperparameter does ##
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dict,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [23]:
# Printing out the topics in the topic model
pprint(lda_model.print_topics())

[(0,
  '0.140*"file" + 0.073*"image" + 0.048*"color" + 0.041*"scan" + '
  '0.038*"package" + 0.034*"format" + 0.031*"library" + 0.021*"input" + '
  '0.021*"load" + 0.021*"generate"'),
 (1,
  '0.064*"armenian" + 0.049*"greek" + 0.039*"turk" + 0.032*"turkish" + '
  '0.025*"turkey" + 0.016*"serdar_argic" + 0.015*"armenia" + 0.013*"massacre" '
  '+ 0.013*"genocide" + 0.013*"moslem"'),
 (2,
  '0.095*"space" + 0.028*"earth" + 0.027*"science" + 0.024*"launch" + '
  '0.022*"moon" + 0.021*"mission" + 0.020*"orbit" + 0.020*"nasa" + '
  '0.017*"satellite" + 0.015*"mar"'),
 (3,
  '0.682*"ax" + 0.051*"max" + 0.014*"family" + 0.012*"telnet" + '
  '0.009*"circuit" + 0.008*"intel" + 0.007*"blind" + 0.006*"honda" + '
  '0.005*"pointer" + 0.005*"travel"'),
 (4,
  '0.087*"patient" + 0.053*"md" + 0.045*"disease" + 0.032*"announcement" + '
  '0.030*"pittsburgh" + 0.030*"medical" + 0.029*"health" + 0.028*"treatment" + '
  '0.023*"zone" + 0.022*"detroit"'),
 (5,
  '0.086*"tape" + 0.065*"material" + 0.053*"co

In [24]:
# Compute Perplexity
# a measure of how good the model is. lower the better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, 
                                     texts=data, 
                                     dictionary=dict, 
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -14.046673725834745

Coherence Score:  0.5496813728375849


In [None]:
# 
def format_topics(idamodel=lda_model, corpus=corpus, texts=data):
    sent_topics_df = pd.DataFrame()

In [None]:
for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']