In [1]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)


  from imp import reload
  from .autonotebook import tqdm as notebook_tqdm
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
import pandas as pd

# Converting the dataset to pandas DataFrame and renaming the columns 
df = pd.read_csv('Habitat_Post_level_Merged.csv')
df['Post Message'] = df['Post Message'].astype(str)

#Removing emojies from text
#Refrence 1 : https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
#Refrence 2 : https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['Post Message'].apply(call_emoji_free)

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

df = df[["Post Message" , "emoji_free_tweets", "url_free_tweets"]]
df.head()

Unnamed: 0,Post Message,emoji_free_tweets,url_free_tweets
0,,,
1,Join our team as a full-time ReStore Truck Dri...,Join our team as a full-time ReStore Truck Dri...,Join our team as a full-time ReStore Truck Dri...
2,Research has shown that when single women emba...,Research has shown that when single women emba...,Research has shown that when single women emba...
3,"Today, on International Women's Day, we celebr...","Today, on International Women's Day, we celebr...","Today, on International Women's Day, we celebr..."
4,"Clouds come floating into my life, no longer t...","Clouds come floating into my life, no longer t...","Clouds come floating into my life, no longer t..."


In [3]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
# nlp = spacy.load("en_core_web_trf")
import en_core_web_trf
nlp = en_core_web_trf.load()



In [4]:
"""
Import Gensim and Wordcloud to use their stopwords as well and use the combined stopwords of ALL as the variable:
ALL_STOP_WORDS
"""
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

# View df
df

Unnamed: 0,Post Message,emoji_free_tweets,url_free_tweets,tokens
0,,,,[nan]
1,Join our team as a full-time ReStore Truck Dri...,Join our team as a full-time ReStore Truck Dri...,Join our team as a full-time ReStore Truck Dri...,"[join, team, full-time, restore, truck, driver..."
2,Research has shown that when single women emba...,Research has shown that when single women emba...,Research has shown that when single women emba...,"[research, shown, single, women, embark, conve..."
3,"Today, on International Women's Day, we celebr...","Today, on International Women's Day, we celebr...","Today, on International Women's Day, we celebr...","[today,, international, women's, day,, celebra..."
4,"Clouds come floating into my life, no longer t...","Clouds come floating into my life, no longer t...","Clouds come floating into my life, no longer t...","[clouds, come, floating, life,, longer, carry,..."
...,...,...,...,...
408,A sense of connectedness and a willingness to ...,A sense of connectedness and a willingness to ...,A sense of connectedness and a willingness to ...,"[sense, connectedness, willingness, actively, ..."
409,COVID-19 has forced everyone to deeply conside...,COVID-19 has forced everyone to deeply conside...,COVID-19 has forced everyone to deeply conside...,"[covid-19, forced, deeply, consider, fundament..."
410,Everyone deserves a decent and affordable plac...,Everyone deserves a decent and affordable plac...,Everyone deserves a decent and affordable plac...,"[deserves, decent, affordable, place, live.]"
411,"We build strength, stability, and self-relianc...","We build strength, stability, and self-relianc...","We build strength, stability, and self-relianc...","[build, strength,, stability,, self-reliance, ..."


In [5]:
# Refrence 4 : https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column

# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
# df[['original_tweet', 'lemmas_back_to_text']]
df




Unnamed: 0,Post Message,emoji_free_tweets,url_free_tweets,tokens,tokens_back_to_text,lemmas,lemmas_back_to_text
0,,,,[nan],,[nan],
1,Join our team as a full-time ReStore Truck Dri...,Join our team as a full-time ReStore Truck Dri...,Join our team as a full-time ReStore Truck Dri...,"[join, team, full-time, restore, truck, driver...",join team full-time restore truck driver! pers...,"[join, team, time, restore, truck, driver, per...",join team time restore truck driver person ser...
2,Research has shown that when single women emba...,Research has shown that when single women emba...,Research has shown that when single women emba...,"[research, shown, single, women, embark, conve...",research shown single women embark conventiona...,"[research, show, single, woman, embark, conven...",research show single woman embark conventional...
3,"Today, on International Women's Day, we celebr...","Today, on International Women's Day, we celebr...","Today, on International Women's Day, we celebr...","[today,, international, women's, day,, celebra...","today, international women's day, celebrate st...","[today, international, woman, day, celebrate, ...",today international woman day celebrate strong...
4,"Clouds come floating into my life, no longer t...","Clouds come floating into my life, no longer t...","Clouds come floating into my life, no longer t...","[clouds, come, floating, life,, longer, carry,...","clouds come floating life, longer carry rain u...","[cloud, come, float, life, long, carry, rain, ...",cloud come float life long carry rain usher st...
...,...,...,...,...,...,...,...
408,A sense of connectedness and a willingness to ...,A sense of connectedness and a willingness to ...,A sense of connectedness and a willingness to ...,"[sense, connectedness, willingness, actively, ...",sense connectedness willingness actively help ...,"[sense, connectedness, willingness, actively, ...",sense connectedness willingness actively help ...
409,COVID-19 has forced everyone to deeply conside...,COVID-19 has forced everyone to deeply conside...,COVID-19 has forced everyone to deeply conside...,"[covid-19, forced, deeply, consider, fundament...",covid-19 forced deeply consider fundamental im...,"[covid-19, force, deeply, consider, fundamenta...",covid-19 force deeply consider fundamental imp...
410,Everyone deserves a decent and affordable plac...,Everyone deserves a decent and affordable plac...,Everyone deserves a decent and affordable plac...,"[deserves, decent, affordable, place, live.]",deserves decent affordable place live.,"[deserve, decent, affordable, place, live]",deserve decent affordable place live
411,"We build strength, stability, and self-relianc...","We build strength, stability, and self-relianc...","We build strength, stability, and self-relianc...","[build, strength,, stability,, self-reliance, ...","build strength, stability, self-reliance shelter.","[build, strength, stability, self, reliance, s...",build strength stability self reliance shelter


In [8]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

# View those tokens (the 4th column)
df

  tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
  tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $


Unnamed: 0,Post Message,emoji_free_tweets,url_free_tweets,tokens,tokens_back_to_text,lemmas,lemmas_back_to_text,lemma_tokens
0,,,,[nan],,[nan],,[nan]
1,Join our team as a full-time ReStore Truck Dri...,Join our team as a full-time ReStore Truck Dri...,Join our team as a full-time ReStore Truck Dri...,"[join, team, full-time, restore, truck, driver...",join team full-time restore truck driver! pers...,"[join, team, time, restore, truck, driver, per...",join team time restore truck driver person ser...,"[join, team, time, restore, truck, driver, per..."
2,Research has shown that when single women emba...,Research has shown that when single women emba...,Research has shown that when single women emba...,"[research, shown, single, women, embark, conve...",research shown single women embark conventiona...,"[research, show, single, woman, embark, conven...",research show single woman embark conventional...,"[research, show, single, woman, embark, conven..."
3,"Today, on International Women's Day, we celebr...","Today, on International Women's Day, we celebr...","Today, on International Women's Day, we celebr...","[today,, international, women's, day,, celebra...","today, international women's day, celebrate st...","[today, international, woman, day, celebrate, ...",today international woman day celebrate strong...,"[today, international, woman, day, celebrate, ..."
4,"Clouds come floating into my life, no longer t...","Clouds come floating into my life, no longer t...","Clouds come floating into my life, no longer t...","[clouds, come, floating, life,, longer, carry,...","clouds come floating life, longer carry rain u...","[cloud, come, float, life, long, carry, rain, ...",cloud come float life long carry rain usher st...,"[cloud, come, float, life, long, carry, rain, ..."
...,...,...,...,...,...,...,...,...
408,A sense of connectedness and a willingness to ...,A sense of connectedness and a willingness to ...,A sense of connectedness and a willingness to ...,"[sense, connectedness, willingness, actively, ...",sense connectedness willingness actively help ...,"[sense, connectedness, willingness, actively, ...",sense connectedness willingness actively help ...,"[sense, connectedness, willingness, actively, ..."
409,COVID-19 has forced everyone to deeply conside...,COVID-19 has forced everyone to deeply conside...,COVID-19 has forced everyone to deeply conside...,"[covid-19, forced, deeply, consider, fundament...",covid-19 forced deeply consider fundamental im...,"[covid-19, force, deeply, consider, fundamenta...",covid-19 force deeply consider fundamental imp...,"[covid-19, force, deeply, consider, fundamenta..."
410,Everyone deserves a decent and affordable plac...,Everyone deserves a decent and affordable plac...,Everyone deserves a decent and affordable plac...,"[deserves, decent, affordable, place, live.]",deserves decent affordable place live.,"[deserve, decent, affordable, place, live]",deserve decent affordable place live,"[deserve, decent, affordable, place, live]"
411,"We build strength, stability, and self-relianc...","We build strength, stability, and self-relianc...","We build strength, stability, and self-relianc...","[build, strength,, stability,, self-reliance, ...","build strength, stability, self-reliance shelter.","[build, strength, stability, self, reliance, s...",build strength stability self reliance shelter,"[build, strength, stability, self, reliance, s..."


In [9]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

2449


In [10]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

1138


In [11]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

# 4 Topics:

In [12]:
# Instantiating a LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=4, id2word=id2word, workers=12, passes=5)

In [13]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [14]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [15]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
home matthews volunteer family thank health time great community repair

------ Topic 1 ------
home build community family matthews habitat help repair program year

------ Topic 2 ------
housing habitat affordable home humanity family help work matthews build

------ Topic 3 ------
home community habitat build gift housing live match good americans



In [17]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)



Perplexity:  -6.695508571480972

Coherence Score:  0.3729640989598583


# 3 Topics:

In [19]:
# Instantiating a LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=3, id2word=id2word, workers=12, passes=5)

In [20]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [21]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [22]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
build habitat home family housing help donate humanity learn affordable

------ Topic 1 ------
housing community home affordable matthews habitat income need build household

------ Topic 2 ------
home habitat family matthews help humanity repair place build live



In [23]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)



Perplexity:  -6.62849647317972

Coherence Score:  0.3744647840670063


# 6 Topics:

In [30]:
# Instantiating a LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=6, id2word=id2word, workers=12, passes=5)

In [31]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [32]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [33]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
home community family housing affordable repair health homeownership habitat improve

------ Topic 1 ------
housing home matthews application family community affordable hour place nan

------ Topic 2 ------
home housing year program repair gift visit increase match need

------ Topic 3 ------
habitat matthews housing help build family home humanity community learn

------ Topic 4 ------
volunteer build work community construction thank home matthews church team

------ Topic 5 ------
home habitat family humanity world place matthews build happy live



In [34]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)



Perplexity:  -6.7918476618903085

Coherence Score:  0.4585735703598634


# 9 Topics:

In [35]:
# Instantiating a LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=9, id2word=id2word, workers=12, passes=5)

In [36]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [37]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [38]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
habitat build black community donate humanity housing family home dream

------ Topic 1 ------
home community habitat build housing old matthews gift time opportunity

------ Topic 2 ------
housing application affordable matthews habitat community humanity homeownership income year

------ Topic 3 ------
housing affordable access nan increase year family house day economic

------ Topic 4 ------
home habitat family humanity build world place decent work live

------ Topic 5 ------
housing matthews habitat family affordable humanity greater restore build home

------ Topic 6 ------
home help family old place americans need matthews habitat program

------ Topic 7 ------
home volunteer family build matthews church 💚 thank dollar housing

------ Topic 8 ------
home repair program help critical day learn good family community



In [39]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)



Perplexity:  -6.861622561935722

Coherence Score:  0.38750917719363787


# 7 Topics:

In [40]:
# Instantiating a LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=7, id2word=id2word, workers=12, passes=5)

In [41]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [42]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [43]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
home build community affordable family housing habitat decent help need

------ Topic 1 ------
housing family household home homeownership cost pandemic year income black

------ Topic 2 ------
home matthews family habitat application repair housing cost volunteer affordable

------ Topic 3 ------
home family program help habitat matthews build life repair nan

------ Topic 4 ------
habitat matthews humanity home community old learn greater housing work

------ Topic 5 ------
habitat build humanity volunteer world day home help happy know

------ Topic 6 ------
housing home church matthews thank habitat americans community help new



In [44]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)



Perplexity:  -6.767383419788621

Coherence Score:  0.41252684039245147
