In [79]:
import numpy as np
import pandas as pd
import re, gensim, spacy, nltk

from nltk.corpus import stopwords

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


  and should_run_async(code)


In [80]:
# NLTK Stop words
# stop_words = stopwords.words('english')

nltk.download('words')
words = set(nltk.corpus.words.words())

  and should_run_async(code)
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [81]:
email_dataset = pd.read_csv("../csv-dataset/custom_email_dataset.csv") 
email_dataset.head()

  and should_run_async(code)


Unnamed: 0,DateTime,From,To,Subject,Message_body
0,2021-04-16 11:34:34+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Learn how to code video games [free 7-hour Uni...,[<p>Here are this week's five links that are w...
1,2021-04-09 19:15:54+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Two free Python courses for you this week – Dj...,[<p>Here are this week's five links that are w...
2,2021-04-02 14:59:32+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Learn to code APIs using Node.js [free 8-hour ...,[<p>Here are this week's five links that are w...
3,2021-03-26 17:00:08+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,40 JavaScript project ideas so you can learn b...,[<p>Here are this week's five links that are w...
4,2021-03-19 17:27:22+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Learn Data Structures and Algorithms [free 6-h...,[<p>Here are this week's five links that are w...


In [82]:
email_bodies=email_dataset.drop(['DateTime','From','To','Subject'], axis = 1)
email_bodies.head()

  and should_run_async(code)


Unnamed: 0,Message_body
0,[<p>Here are this week's five links that are w...
1,[<p>Here are this week's five links that are w...
2,[<p>Here are this week's five links that are w...
3,[<p>Here are this week's five links that are w...
4,[<p>Here are this week's five links that are w...


In [83]:
# Convert to list
data = email_bodies.Message_body
pprint(data[:1])


0    [<p>Here are this week's five links that are w...
Name: Message_body, dtype: object
  and should_run_async(code)


In [84]:
# Using regex expressions to get rid of distractions
# Remove <p> and</p> tags
data = [re.sub("<p[^>]*>", "", sent) for sent in data]
data = [re.sub("</?p[^>]*>", "", sent) for sent in data]

# Remove non alphanumeric and underscore values
data = [re.sub(r'[^\w]', ' ', sent) for sent in data]

# each document is in a list, convert to string
data = [''.join(str(sent)) for sent in data]


  and should_run_async(code)


In [85]:
data[0]

  and should_run_async(code)


' Here are this week s five links that are worth your time     1  Building video games can be just as much fun as playing them  And this in depth Unity 3D course for beginners will show you how to get started as a game developer  You ll learn how to install Unity  program game physics  animate your characters  code your enemy AI  and more   7 hour YouTube course   https   www freecodecamp org news game development for beginners unity course     2  As of 2021  more than 40  of all websites use WordPress  It s a relatively easy tool for building blogs  ecommerce sites  and more elaborate applications as well  This free course will show you how to host a WordPress site on the web  add custom features through plugins  and design it to look however you want   2 hour YouTube course   https   www freecodecamp org news how to make a website with wordpress     3  You may have heard about the branch of science called Game Theory  This tutorial will show you how Evolutionary Game Theory works in 

In [86]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))
pprint(data_words)

  'https',
  'www',
  'freecodecamp',
  'org',
  'ue',
  'ystttlxqo',
  'nnegosgnvb'],
 ['here',
  'are',
  'this',
  'week',
  'five',
  'links',
  'that',
  'are',
  'worth',
  'your',
  'time',
  'learn',
  'back',
  'end',
  'development',
  'with',
  'node',
  'js',
  'and',
  'express',
  'using',
  'this',
  'free',
  'in',
  'depth',
  'course',
  'hour',
  'watch',
  'https',
  'www',
  'freecodecamp',
  'org',
  'news',
  'learn',
  'express',
  'js',
  'in',
  'this',
  'complete',
  'course',
  'kevin',
  'got',
  'his',
  'first',
  'job',
  'as',
  'web',
  'developer',
  'when',
  'he',
  'was',
  'years',
  'old',
  'he',
  'shares',
  'his',
  'advice',
  'for',
  'how',
  'you',
  'can',
  'learn',
  'to',
  'code',
  'and',
  'get',
  'developer',
  'job',
  'too',
  'minute',
  'read',
  'https',
  'www',
  'freecodecamp',
  'org',
  'forum',
  'from',
  'es',
  'to',
  'esnext',
  'here',
  'every',
  'feature',
  'added',
  'to',
  'javascript',
  'since',
  'minu

In [87]:
# Define functions for stopwords, bigrams, trigrams and lemmatization

nlp = spacy.load("en_core_web_sm",disable=['parser', 'ner']) 
 
# Adding additional stopwords
new_stopwords= ['dear', 'thanks','regards', 'hello','hi', 'bye','goodbye', 'say', 'https',  'www', 'freecodecamp', 'org', 'news', 'five', 'links', 'worth', 'time', 'teach', 'read', 'email', 'week', 'minute', 'twitter', 'learn', 'course', 'quincy', 'larson', 'code', 'happy', 'ystttlxqo', 'nnegosgnvb', 'donate', 'build', 'teacher', 'hour', 'start', 'free', 'new', 'thing', 'use', 'tip']

for word in new_stopwords:
    nlp.Defaults.stop_words.add(word) 
    nlp.vocab[word].is_stop = True
    
def remove_stopwords_spacy(texts):
    return [[word.text for word in nlp(str(text)) if not word.is_stop] for text in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for           token in doc if token.pos_ in allowed_postags]))
    return texts_out

  and should_run_async(code)


In [88]:
# # Remove Stop Words
data_words_nostops = remove_stopwords_spacy(data_words)

# #remove words that only occur once to make process faster
# #all_tokens = sum(data_words_nostops, [])
# #tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
# #text_no_single_words = [[term for term in words if term not in tokens_once] for words in data_words_nostops]

#data_vocab_words=remove_non_vocab(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])


  and should_run_async(code)


In [89]:
print(data_lemmatized[0])

building video game fun play depth unity beginner start game developer install unity program game physics animate character enemy ai youtube game development beginner unity website wordpress relatively easy tool building blog ecommerce site elaborate application host wordpress site web add custom feature plugin design look want youtube website wordpress hear branch science call game theory tutorial evolutionary game theory work ecosystem simulation python good old fashion math introduction evolutionary game theory kubernete powerful devop tool manage software cloud haven hear year old say search find job opening mention kubernete lot company sergio recently pass linux foundation exam certify kubernete application developer share tip certify kubernete application developer dhawal update massive list course ivy league university online browsable list ivy league online course quick update make steady progress datum science curriculum expansion certification advanced mathematic computer sc

In [90]:
# Create Document-Word matrix

vectorizer = CountVectorizer(analyzer='word',       
                             min_df=1, # minimum reqd occurences of a word 
                             stop_words='english', # remove stop words
                             lowercase=True, # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}', # num chars > 3
                             # max_features=50000,  # max number of uniq words
                             )


  and should_run_async(code)


In [91]:
data_vectorized = vectorizer.fit_transform(data_lemmatized)

  and should_run_async(code)


In [92]:
# Build LDA Model with SK learn
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=5,# Number of topics
                                      max_iter=10,    # Max learning iterations
                                      learning_method='online',   
                                      random_state=100, # Random state
                                      batch_size=100,   # n docs in each learning iter
                                      evaluate_every = -1, # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,        # Use all available CPUs
                                      total_samples=1000000.0
                                     )

lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes


  and should_run_async(code)
LatentDirichletAllocation(batch_size=100, learning_method='online',
                          n_components=5, n_jobs=-1, random_state=100)


In [93]:
# Log Likelihood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -83347.76916579407
Perplexity:  1533.0499283651156
{'batch_size': 100,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 5,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}
  and should_run_async(code)


In [94]:
# Define Search Param
search_params = {'n_components': [3, 5, 7, 8], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)



  and should_run_async(code)


GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [3, 5, 7, 8]})

In [95]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 3}
Best Log Likelihood Score:  -19225.286663834242
Model Perplexity:  1562.4416797280355
  and should_run_async(code)


In [96]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
 color = 'green' if val > .1 else 'black'
 return 'color: {col}'.format(col=color)
def make_bold(val):
 weight = 700 if val > .1 else 400
 return 'font-weight: {weight}'.format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

  and should_run_async(code)


Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.0,1.0,0.0,1
Doc1,0.0,1.0,0.0,1
Doc2,0.0,1.0,0.0,1
Doc3,0.0,1.0,0.0,1
Doc4,0.0,1.0,0.0,1
Doc5,0.0,0.99,0.0,1
Doc6,0.0,1.0,0.0,1
Doc7,0.0,0.99,0.0,1
Doc8,0.0,0.99,0.0,1
Doc9,0.0,0.99,0.0,1


In [97]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

  and should_run_async(code)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,developer,aren,code,medium,game,turn,learn,react,watch,nonprofit
Topic 1,developer,datum,learn,code,app,web,email,turn,aren,share
Topic 2,developer,code,programming,python,computer,science,turn,aren,watch,email


In [98]:
Topics = ["Programming","Interviews/Job","Work"]
df_topic_keywords["Topics"]=Topics
df_topic_keywords


  and should_run_async(code)


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Topics
Topic 0,developer,aren,code,medium,game,turn,learn,react,watch,nonprofit,Programming
Topic 1,developer,datum,learn,code,app,web,email,turn,aren,share,Interviews/Job
Topic 2,developer,code,programming,python,computer,science,turn,aren,watch,email,Work


In [99]:
# Define function to predict topic for a given text document.
#nlp = spacy.load('en', disable=['parser', 'ner'])

def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization
# Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))
# Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Step 3: Vectorize transform
    mytext_4 = vectorizer.transform(mytext_3)
# Step 4: LDA Transform
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
    
    # Step 5: Infer Topic
    infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), -1]
    
    #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
    return infer_topic, topic, topic_probability_scores

# Predict the topic
mytext = ["Here are this week's five links that are worth your time: \
1. If you used the internet today, you probably used NGINX. It's a powerful web server that most major websites use to handle traffic. And freeCodeCamp just published a free full-length NGINX book that will show you how to use this web server tool for routing, reverse proxying, and even load balancing. (2 hour read): https://www.freecodecamp.org/news/the-nginx-handbook/ \
2. You can also learn the MERN Stack by building your own Yelp-like restaurant review site. MERN stands for MongoDB + Express + React + Node.js. Then in the second half of the course, you'll learn how to swap out your Node.js/Express back end in favor of Serverless Architecture. (3 hour YouTube course): https://www.freecodecamp.org/news/create-a-mern-stack-app-with-a-serverless-backend/\
3. Learn how to create your own 3D graphics using OpenGL. You'll work with polygons, textures, shaders, and other important rendering tools. (2 hour YouTube course): https://www.freecodecamp.org/news/how-to-create-3d-and-2d-graphics-with-opengl-and-cpp/\
4. If you're learning Python, I encourage you to bookmark this. Prolific teacher and developer Estefania walks you through dozens of Python syntax examples that all beginners should learn. Data structures, loops, exception handling, dependency inclusion – everything. (90 minute read): https://www.freecodecamp.org/news/python-code-examples-sample-script-coding-tutorial-for-beginners/\
5. And while you're expanding your Python skills, you can learn how to do back end web development using the popular Python Django framework. You'll build data visualization web apps using Pandas dataframes, Matplotlib, and Seaborn. You'll also work with PDF rendering and even base-64 encoding. (7 hour YouTube course): https://www.freecodecamp.org/news/learn-django-3-and-start-creating-websites-with-python/\
Finally, a quick update on freeCodeCamp's Data Science Curriculum Expansion. We're designing 12 new certifications that will teach advanced mathematics, computer science, and machine learning. So far 2,031 people have donated to ensure that our nonprofit has a big enough budget to hire experienced teachers. You can learn more and get involved here: https://www.freecodecamp.org/news/building-a-data-science-curriculum-with-advanced-math-and-machine-learning/\
Quote of the Week: “Anytime someone builds a little application that runs on a cell phone, there's something that goes on the server.” – James Gosling, creator of the Java programming language\
Happy coding.\
- Quincy Larson \
Teacher at https://www.freecodecamp.org\
I share useful things on Twitter at https://www.twitter.com/ossia \
If these emails aren't worth your time, you can turn them off: https://www.freecodecamp.org/ue/YSTttLxqo1nneGosGnvB"] 

infer_topic, topic, prob_scores = predict_topic(text = mytext)
print(topic)
print(infer_topic)

  and should_run_async(code)
['datum', 'learn', 'code', 'app', 'web', 'email', 'turn', 'aren', 'share', 'Interviews/Job']
Interviews/Job


In [100]:
def apply_predict_topic(text):
 text = [text]
 infer_topic, topic, prob_scores = predict_topic(text = text)
 return(infer_topic)

email_dataset["Topic_key_word"]= email_dataset['Message_body'].apply(apply_predict_topic)
email_dataset

  and should_run_async(code)


Unnamed: 0,DateTime,From,To,Subject,Message_body,Topic_key_word
0,2021-04-16 11:34:34+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Learn how to code video games [free 7-hour Uni...,[<p>Here are this week's five links that are w...,Interviews/Job
1,2021-04-09 19:15:54+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Two free Python courses for you this week – Dj...,[<p>Here are this week's five links that are w...,Interviews/Job
2,2021-04-02 14:59:32+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Learn to code APIs using Node.js [free 8-hour ...,[<p>Here are this week's five links that are w...,Interviews/Job
3,2021-03-26 17:00:08+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,40 JavaScript project ideas so you can learn b...,[<p>Here are this week's five links that are w...,Interviews/Job
4,2021-03-19 17:27:22+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Learn Data Structures and Algorithms [free 6-h...,[<p>Here are this week's five links that are w...,Interviews/Job
...,...,...,...,...,...,...
100,2019-01-25 05:45:19+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Harvard's CS50 Intro to Computer Science cours...,[<p>Here are this week's five links that are w...,Work
101,2019-01-17 17:45:13+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,How to build your own e-commerce website from ...,[<p>Here are this week's five links that are w...,Work
102,2019-01-10 17:19:18+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,The React Handbook - a massive free guide to b...,[<p>Here are this week's five links that are w...,Interviews/Job
103,2018-12-20 22:09:04+00:00,Quincy Larson <quincy@freecodecamp.org>,malvisbid@gmail.com,Learn React.js - a free 5-hour course for begi...,"[<p>This is my final ""links worth your time"" l...",Interviews/Job


In [103]:
email_dataset.groupby('Topic_key_word').nunique()

  and should_run_async(code)


Unnamed: 0_level_0,DateTime,From,To,Subject,Message_body
Topic_key_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Interviews/Job,62,2,1,62,62
Work,43,1,1,43,43
