# This notebook will process text from the comments dataset 

Run LDA model

Run KMeans Clustering model

In [1]:
import nltk
import numpy as np 
import pandas as pd 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
import re
from collections import Counter
import emoji

In [2]:
main_comments = pd.read_csv('data/main_comments.csv')
main_comments.head()

Unnamed: 0,author,body,created_utc
0,guesswhatihate,#DIPPING,1607957482
1,k12nmonky,NIO and SPCE ftw,1607957482
2,tnmtnmtnm,Flat,1607957482
3,everlastingdeath,Lol what happened to the limit up everyone's b...,1607957481
4,ZakkuTurner,"Jan 22 2021, 40.5",1607957481


In [3]:
main_comments.shape

(7491333, 3)

In [4]:
corpus = []
for i in range(len(main_comments)):
    corpus.append([main_comments.body[i]])

# main functions for processing the text of each post

In [5]:
# several of the processing functions were copied from the NLP lecture


# the toked function:
# removes newline characters
# removes apostrophes, easier to just remove apostrophes and leave naked contractions rather than expand them
    # there's only like what a handful of common contractions, they can be words for this as far as i'm concerned
# removes links that have the format 'https://etc'
# removes special characters
# replaces any number in the string with num
# returns this split stripped and joined back together to remove multiple blanks
# runs this through the nltk RegexTokenizer modified to handle emojis, which splits individual emojis
    # so that three rocketships is 3 occurrences of one rocketship, etc. instead of its own separate character
# finally translates emojis from their emoji symbol to a word representation of it so that ngrams captures these
tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
link_regex = re.compile(r'(https://)[a-z./0-9?=&;-]*')
numbers = re.compile(r'([0-9],?)+')
apostrophe = re.compile(r"\'|’") # heads up for both versions of the apostrophe!! dammit ' and ’
newline = re.compile(r'(\\n)')
specials = re.compile(r'[-+~`:;@=\'’"”#“$|,%/&^•·)(\]\[\*\\?!\._]')
def toked(doc):
    modified = newline.sub(' ', doc)
    modified = apostrophe.sub('', modified)
    modified = link_regex.sub('', modified)
    modified = specials.sub('', modified)
    modified = numbers.sub('', modified)
    modified = ' '.join(modified.split()).strip()
    modified = tokenizer.tokenize(modified)
    
    translated = []
    for word in modified:
        if word in emoji.UNICODE_EMOJI_ENGLISH:
            translated.append('emoji_' + emoji.UNICODE_EMOJI_ENGLISH[word])
        else:
            translated.append(word)
    
    translated = ' '.join(translated)
    modified = ' '.join(specials.sub('', translated).split()).strip()
            
    return modified


# lemmatize
def lemmatize(doc):
    # get the words in the document
    words = re.findall("\w+|[^\w\s]", doc)
    # get the parts of speech
    pos_tokens = nltk.pos_tag(words)
    
    # process_word(*token) uses star args to supply both word and part of speech to process_word
    # for token in pos_tokens - we want to do this for every token in the document
    
    return ' '.join([process_word(*token) for token in pos_tokens])


def process_word(word, pos):
    lemmatizer = WordNetLemmatizer()
    pos_dict = {'J':'a', 'V':'v', 'N':'n', 'R': 'r'}
    if pos[0] in pos_dict:
        return lemmatizer.lemmatize(word.lower(), pos_dict[pos[0]])
    else:
        return lemmatizer.lemmatize(word.lower())
    

# extract n-grams
def get_ngrams(doc):
    # build vectorizer and analyzer
    vectorizer = CountVectorizer(ngram_range=(2,3), preprocessor=None, tokenizer=None).build_analyzer()
    # return the n-grams of size 2 and 3 without stop words in this case
    return vectorizer(doc, stop_words=stopwords.words('english'))


# remove stopwords
my_additional_stop_words =  ('im', '', 'ud', 'pc', 'ampxb', 'unum', 'numc', 'unuma', 'unumd', 'num', 'numm', 'numk', 'xnumb', 'amp', 'im', 'ampxnumb', 'numlt', 'numnd', "ampxnumb")
stop_words = text.ENGLISH_STOP_WORDS #.union(my_additional_stop_words)
real_words = ''
def remove_stop_words(words):
#     words_split = words.split()
#     for word in words.split
    return [word for word in words.split() if word not in stop_words]

## function for doing all the text processing at once

In [6]:
# full processing workflow, removing stop words before generating ngrams
def full_processing(doc):
    tokens = toked(str(doc)) # returns a string
    words = lemmatize(tokens) # returns a string
    words = remove_stop_words(words) # returns a list
    ngrams = get_ngrams(' '.join(words)) # returns a list
    return words + ngrams

In [7]:
%%time


# the processed doc 
processed = []
for doc in corpus:
    processed.append(full_processing(doc))

vectorizer = CountVectorizer(stop_words=None, 
                             max_df=0.7, 
                             min_df=2, 
                             max_features=2500,
                             tokenizer=None,
                             lowercase=False, 
                             preprocessor=None,
                             analyzer=lambda x: x)
#                              ngram_range=(1,3))

X = vectorizer.fit_transform(processed)
features = vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_components=10, random_state=42, learning_method='online', n_jobs=-1)
lda.fit(X)
print(lda.components_.shape)
top_components = lda.components_.argsort()[:,-1:-11:-1]

# top_components.shape
for i, v in enumerate(top_components):
    print(i, [features[i] for i in v])

(10, 2500)
0 ['buy', 'hold', 'money', 'sell', 'im', 'today', 'dip', 'good', 'lose', 'happen']
1 ['gme', 'youre', 'yolo', 'flair', 'come', 'fund', 'question', 'bot', 'action', 'change']
2 ['emojigemstone', 'delete', 'emojieggplant', 'emojiraisinghands', 'emojieggplant emojieggplant', 'emojieggplant emojieggplant emojieggplant', 'hold', 'emojigemstone emojiraisinghands', 'robinhood', 'emojigemstone emojigemstone']
3 ['stock', 'like', 'u', 'look', 'just', 'market', 'think', 'people', 'company', 'big']
4 ['emojirocket', 'emojirocket emojirocket', 'emojirocket emojirocket emojirocket', 'nok', 'emojigemstone', 'gme', 'love', 'man', 'emojigorilla', 'check']
5 ['amc', 'retard', 'way', 'people', 'long', 'play', 'yes', 'yeah', 'im', 'new']
6 ['share', 'buy', 'price', 'right', 'know', 'sell', 'just', 'week', 'dont', 'try']
7 ['short', 'need', 'time', 'lol', 'day', 'squeeze', 'gonna', 'thats', 'trade', 'trading']
8 ['use', 'bb', 'post', 'tsla', 'tesla', 'best', '⣿', 'comment', 'dude', 'crash']
9 [