In [139]:
import pandas as pd
import random

import string 
import re
import wordcloud
from PIL import Image
from os import path

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [215]:
# Tokens dropped before topic modeling: NLTK's English stopwords, every ASCII
# punctuation character, and a handful of extra quote/dash/arrow tokens.
eng_stopwords_and_punct = [
    *stopwords.words('english'),
    *string.punctuation,
    '“', '”', "''", '""', '...', '``', '—', '->', '’ ’',
]

# TweetTokenizer lowercases the tweet text, strips usernames/handles and
# shortens runs of repeated characters inside words.
tweet_tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

lemmatizer = WordNetLemmatizer()

# NOTE(review): not referenced by the cells visible here (the LDA model below
# is built with n_components=7) — confirm which topic count is intended.
desired_num_topics = 5


def clean_tokenize_lemmatize_tweet(tweet):
    '''
    Apply text cleaning, tokenization then lemmatization on the given tweet.

    tweet -- the tweet text as a single string.

    Returns the list of lemmatized tokens that remain after URLs, a leading
    retweet marker, '#' symbols, number-bearing words, stopwords and
    punctuation have been removed.

    When this function is passed to CountVectorizer as `tokenizer`, the
    vectorizer's default `lowercase=True` lowercases the text *before* it gets
    here, so the retweet marker has to be matched case-insensitively.
    '''
    # remove urls
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    # remove a leading RT marker ("RT " or, after lowercasing, "rt ")
    tweet = re.sub(r'^RT\s+', '', tweet, flags=re.IGNORECASE)
    # remove the # symbol (the hashtag word itself is kept)
    tweet = re.sub('#', '', tweet)
    # remove words containing numbers
    tweet = re.sub(r'\w*\d\w*', '', tweet)
    # tokenize (lowercases, strips handles, shortens repeated characters)
    tokens = tweet_tokenizer.tokenize(tweet)
    # remove stopwords and punctuation
    tokens = [token for token in tokens if token not in eng_stopwords_and_punct]
    # lemmatize last, once only content tokens remain
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def split_content_by_sentiment(region_df):
    '''
    Split a region's tweets into three 'content' Series by sentiment label.

    region_df -- DataFrame with a 'sentiment' column and a 'content' column.

    Returns a list ordered [positive, negative, neutral]; rows carrying any
    other sentiment value are left out of all three.
    '''
    return [
        region_df.loc[region_df['sentiment'] == label, 'content']
        for label in ('positive', 'negative', 'neutral')
    ]

# show top words per topic
def show_top_words_per_topic(topcount, fitted_lda, count_vectorizer):
    '''
    Print the `topcount` highest-weighted vocabulary terms of every topic.

    topcount         -- number of terms to list per topic; a value <= 0 lists
                        no terms (only the header is printed).
    fitted_lda       -- fitted model exposing `components_`, iterated as one
                        weight row per topic.
    count_vectorizer -- fitted vectorizer whose feature names are indexed by
                        the positions in each weight row.

    Terms are printed in ascending weight order, so the strongest term of a
    topic is the last one in its list.
    '''
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per printed word, and coerce to
    # plain str so the printed list looks the same for list or ndarray results.
    feature_names = [str(name) for name in get_names()]

    for index, topic in enumerate(fitted_lda.components_):
        print(f'THE TOP {topcount} WORDS FOR TOPIC #{index}')
        # argsort()[-0:] would select every term, so guard non-positive counts.
        top_indices = topic.argsort()[-topcount:] if topcount > 0 else []
        print([feature_names[i] for i in top_indices])
        print('\n')

In [216]:
# Load the vetted tweets in this cell so it runs on a fresh kernel; with the
# load commented out, `vetted_usa_tweets` only existed as leftover kernel state.
vetted_usa_tweets = pd.read_csv('../data/vetted_usa_tweets.csv')

# One frame per value of the 'region' column.
northeast_tweets = vetted_usa_tweets[vetted_usa_tweets['region'] == 'Northeast']
southeast_tweets = vetted_usa_tweets[vetted_usa_tweets['region'] == 'Southeast']
southwest_tweets = vetted_usa_tweets[vetted_usa_tweets['region'] == 'Southwest']
west_tweets = vetted_usa_tweets[vetted_usa_tweets['region'] == 'West']
midwest_tweets = vetted_usa_tweets[vetted_usa_tweets['region'] == 'Midwest']
general_usa_tweets = vetted_usa_tweets[vetted_usa_tweets['region'] == 'General USA']

In [202]:
# Every *_sentiment_series is a list of three 'content' Series ordered
# [positive, negative, neutral], so index 1 holds the negative tweets.
(
    ne_sentiment_series,
    se_sentiment_series,
    sw_sentiment_series,
    w_sentiment_series,
    mw_sentiment_series,
    gen_us_sentiment_series,
) = map(
    split_content_by_sentiment,
    (
        northeast_tweets,
        southeast_tweets,
        southwest_tweets,
        west_tweets,
        midwest_tweets,
        general_usa_tweets,
    ),
)

Before performing the topic modeling, I'll need to clean this text. 
For the text classifier that detects sentiment, punctuation and capitalization could actually be useful to the classifier.
In topic modeling they are not, so I'm going to apply the following text processing before I apply LDA.

Text Cleaning
* Change to lowercase  (TweetTokenizer handles this)
* Remove RT and # (will need a regex)
* Remove urls (will need a regex)
* Remove stopwords and punctuation (NLTK English stopwords plus `string.punctuation`)
* Perform lemmatization (applied last, after all other cleaning)


In [236]:
# Count 2- to 4-grams over the cleaned, lemmatized tokens. A custom tokenizer
# replaces the default token regex, so token_pattern is set to None to stop
# scikit-learn warning that the pattern will not be used.
cv = CountVectorizer(
    max_df=0.95,
    min_df=2,
    tokenizer=clean_tokenize_lemmatize_tweet,
    token_pattern=None,
    ngram_range=(2, 4),
)
lda = LatentDirichletAllocation(n_components=7, random_state=42, verbose=1)

# Unsupervised learning...no need to do a test/train split here ;)
# NOTE(review): despite the `_ne` suffix, this matrix is built from
# w_sentiment_series[1], i.e. the West region's negative tweets — confirm
# which region is intended before renaming or reusing it.
dtm_ne = cv.fit_transform(w_sentiment_series[1])
print(cv.vocabulary_)


8, 'get online teaching': 489, 'hmu anytime need': 561, 'study hmu anytime need': 1251, 'hmu anytime need help': 562, 'online school': 854, 'school online': 1119, 'homework due': 582, 'email com': 367, 'homework due essay': 583, 'due dm': 317, 'dissertation pay': 277, 'research pay': 1103, 'pay discussion': 933, 'discussion pay': 271, 'class ..': 154, 'essay pay homework': 396, 'due essay due': 319, 'essay due dm': 382, 'english pay dissertation': 376, 'pay dissertation pay': 936, 'dissertation pay research': 278, 'pay research pay': 990, 'pay discussion pay': 934, 'due essay due dm': 320, 'pay essay pay homework': 948, 'essay pay homework pay': 397, 'english pay dissertation pay': 377, 'pay dissertation pay research': 937, 'dissertation pay research pay': 279, 'would affect': 1384, 'took online': 1327, '😭 😭': 1468, 'took online class': 1328, '😭 😭 😭': 1469, 'one class': 808, 'pay write': 1001, 'pay philosophy': 978, 'philosophy pay': 1015, 'someone pay write': 1169, 'pay write pay': 10

In [237]:
# Fitting might take a while if you are dealing with a large number of documents!
lda.fit(dtm_ne)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


LatentDirichletAllocation(n_components=7, random_state=42, verbose=1)

In [238]:
# Print the 25 highest-weighted n-grams for each topic of the fitted LDA model.
show_top_words_per_topic(topcount=25, fitted_lda=lda, count_vectorizer=cv)

THE TOP 25 WORDS FOR TOPIC #0
['accounting pay economics pay', 'assignment pay paper due', 'assignment pay paper', 'pay assignment pay paper', 'pay accounting pay', 'pay project', 'class online', 'essay due', 'pay essay pay', 'paper due', 'pay online', 'pay assignment', 'pay online class', 'pay assignment pay', 'research paper pay', 'paper pay', 'assignment pay', 'essay pay', 'pay online class pay', 'due pay', 'research paper', 'class pay', 'online class pay', 'pay essay', 'online class']


THE TOP 25 WORDS FOR TOPIC #1
['pay business', 'essay pay', 'business law', 'school year', 'kid school', 'pay business law', 'teaching online', 'assignment homework', 'online distance', "i've never", 'history psychology', '’ going', 'get sick', 'last semester', 'online class assignment', 'class assignment', 'go back', 'research paper', 'term paper', 'essay due', 'remote learning', 'first day', 'high school', 'distance learning', 'online class']


THE TOP 25 WORDS FOR TOPIC #2
['go school', 'know sch