In [1]:
#!pip install pyLDAvis
#!pip install Spacy
#!pip install nltk
#!pip install gensim

In [None]:
#!conda update conda --yes

In [3]:
#!conda install Spacy -y

In [4]:
## import required libraries

import re
from pprint import pprint

## NumPy
import numpy as np
import pandas as pd

## MatplotLib
import matplotlib.pyplot as plt

## Gesim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import hdpmodel as HDPModel

## NLTK
import nltk
from nltk.corpus import stopwords

## Spacy
import spacy

## pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models

In [5]:
## loda data
nltk.download('stopwords')
nlp = spacy.load('en_core_web_sm',disable = ['parser', 'ner'])

#importing the Stopwords to use them

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use','for'])

#downloading the data
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset = 'train')

data = newsgroups_train.data
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]

## cleaning the text 
def tokeniz(sentences):

    for sentence in sentences:
         yield(gensim.utils.simple_preprocess(str(sentence), deacc = True))

processed_data = list(tokeniz(data))

  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
  data = [re.sub('\s+', ' ', sent) for sent in data]
[nltk_data] Downloading package stopwords to /Users/kowk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
## Building Bigram & Trigram Models
bigram      = gensim.models.Phrases(processed_data, min_count = 5, threshold = 100)
trigram     = gensim.models.Phrases(bigram[processed_data], threshold = 100)
bigram_mod  = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

##function to filter out stopwords
def remove_stopwords(texts):

    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


## function to create bigrams
def create_bigrams(texts):

    return [bigram_mod[doc] for doc in texts]

## function to create trigrams

def create_trigrams(texts):

    return [trigram_mod[bigram_mod[doc]] for doc in texts]


## function for lemmatization
def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):

    texts_op = [ ]
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_op.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

    return texts_op

## removing stopwords, creating bigrams and lemmatizing the text
data_wo_stopwords = remove_stopwords(processed_data)
data_bigrams = create_bigrams(data_wo_stopwords)
data_lemmatized = lemmatize(data_bigrams, allowed_postags = [ 'NOUN', 'ADJ', 'VERB'])


#printing the lemmatized data
print(data_lemmatized[:3])

#creating a dictionary
gensim_dictionary = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized

#building a corpus for the topic model
gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]

#printing the corpus we created above.
print(gensim_corpus[:3]) 

#we can print the words with their frequencies.
[[(gensim_dictionary[id], freq) for id, freq in cp] for cp in gensim_corpus[:4]] 

[['s', 'thing', 'car', 'nntp_poste', 'host', 'rac_wam', 'university', 'park', 'line', 'wonder', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'call', 'door', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'report', 'keyword', 'acceleration', 'clock', 'upgrade', 'article', 'line', 'nntp_poste', 'host', 'fair', 'number', 'brave', 'soul', 'upgrade', 'clock', 'oscillator', 'share', 'experience', 'poll', 'send', 'brief', 'message', 'detail', 'experience', 'procedure', 'top', 'speed', 'attain', 'cpu', 'rate', 'speed', 'add', 'card', 'adapter', 'heat_sink', 'hour', 'usage', 'day', 'floppy_disk', 'functionality', 'floppy', 'request', 'summarize', 'day', 'add', 'network', 'knowledge', 'base', 'do', 'clock', 'upgrade', 'answer', 'poll', 'tha

[[('addition', 1),
  ('body', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('day', 1),
  ('door', 2),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('park', 1),
  ('production', 1),
  ('rac_wam', 1),
  ('rest', 1),
  ('s', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('thank', 1),
  ('thing', 1),
  ('university', 1),
  ('wonder', 1),
  ('year', 1)],
 [('call', 2),
  ('day', 2),
  ('host', 1),
  ('line', 1),
  ('nntp_poste', 1),
  ('thank', 1),
  ('acceleration', 1),
  ('adapter', 1),
  ('add', 2),
  ('answer', 1),
  ('article', 1),
  ('attain', 1),
  ('base', 1),
  ('brave', 1),
  ('brief', 1),
  ('card', 1),
  ('clock', 4),
  ('cpu', 1),
  ('detail', 1),
  ('do', 1),
  ('experience', 2),
  ('fair', 1),
  ('final', 2),


In [7]:
#creating hdp model
hdp_model = HDPModel.HdpModel(corpus = gensim_corpus, id2word = gensim_dictionary)

#viewing topics
pprint(hdp_model.print_topics())

[(0,
  '0.010*line + 0.008*write + 0.007*get + 0.006*say + 0.006*article + '
  '0.005*people + 0.005*know + 0.005*make + 0.005*go + 0.005*organization'),
 (1,
  '0.015*line + 0.010*write + 0.008*organization + 0.007*article + 0.007*get + '
  '0.006*know + 0.006*nntp_poste + 0.005*host + 0.005*think + 0.005*go'),
 (2,
  '0.014*line + 0.011*write + 0.008*article + 0.007*organization + 0.007*get + '
  '0.006*say + 0.005*know + 0.005*people + 0.005*nntp_poste + 0.005*think'),
 (3,
  '0.760*ax + 0.049*max + 0.001*ei + 0.001*wm + 0.001*pl_pl + 0.001*qax + '
  '0.001*bhj_bhj + 0.000*wm_wm + 0.000*giz_giz + 0.000*tm'),
 (4,
  '0.010*line + 0.006*write + 0.006*get + 0.005*article + 0.005*organization + '
  '0.004*know + 0.004*nntp_poste + 0.004*host + 0.003*use + 0.003*go'),
 (5,
  '0.010*line + 0.006*get + 0.006*write + 0.005*organization + 0.004*article + '
  '0.004*nntp_poste + 0.004*host + 0.004*need + 0.003*go + 0.003*know'),
 (6,
  '0.006*line + 0.004*good + 0.004*write + 0.004*think + 0.