In [1]:
#Imports libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Imports libraries for text cleaning and manipulation
import nltk
import re
import collections
import string
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

#Imports libraries for modeling and evaluation
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, recall_score, accuracy_score, make_scorer


#Imports dataset
# NOTE(review): the path has no file extension -- confirm the file on disk is really named this way.
# NOTE(review): the frame displayed further down shows an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index; presumably index_col=0 is wanted here -- TODO confirm.
df = pd.read_csv('../data/small_merged_chats')

In [2]:
#Drops every row whose 'body' (the chat text) is missing, since the cleaning functions below run regexes on it
df = df.dropna(subset = ['body'])


def ad(chat):
    """Report whether a chat message appears to contain a link.

    A message counts as an ad when it matches either the bare ``.com``
    domain pattern or anything starting with ``http``.

    Parameters
    ----------
    chat : str
        Raw chat message.

    Returns
    -------
    bool
        True if either link pattern is found, otherwise False.
    """
    #Bare domains such as "example.com" or "www.com"
    has_domain = re.search(r'www\.[a-z]?\.?(com)+|[a-z]+\.(com)', chat)
    #Explicit http/https URLs
    has_url = re.search(r'http\S+', chat)
    #Both checks must contribute; previously the second search overwrote the first
    return bool(has_domain or has_url)

#Flags every chat that contains a link, then keeps only the rows that were not flagged
df['is_ad'] = df.body.apply(ad)
df = df[df['is_ad'].eq(False)]


def emoji_shorten(chat, stems=('pog', 'lul', 'kappa')):
    """Collapse emote variants to their lowercase stem.

    Any whole word that starts with one of ``stems`` (case-insensitive),
    e.g. ``PogChamp`` or ``LULW``, is replaced by the stem itself.
    Surrounding whitespace is left untouched so neighbouring words are
    not glued together.

    Parameters
    ----------
    chat : str
        Chat message to normalise.
    stems : iterable of str, optional
        Emote stems to collapse; defaults to pog, lul and kappa.

    Returns
    -------
    str
        The message with every matching emote replaced by its stem.
    """
    for stem in stems:
        #\b anchors keep the match to whole words; \w* swallows the emote suffix
        chat = re.sub(r'(?i)\b' + re.escape(stem) + r'\w*\b', stem, chat)
    return chat

#Adds a 'chats' column holding each message after emoji_shorten has been applied to it
df['chats'] = df['body'].apply(emoji_shorten)


#Removes the raw text, the helper flag and the metadata columns that are not used from here on
df = df.drop(columns = [
    'body',
    'commenter_id',
    'is_ad',
    'created_at',
    'offset',
    'twitch_chat',
    'emotes',
])

In [3]:
#Displays the cleaned frame to check which columns remain after the drop
df

Unnamed: 0,channel_id,video_id,chats
0,66691674,264485130,wazupp
1,66691674,264485130,Yo
2,66691674,264485130,What up
3,66691674,264485130,yes!!!
4,66691674,264485130,Wassgud
...,...,...,...
103926,60056333,266069120,ello
103927,60056333,266069120,eyyy
103928,60056333,266069120,Kingrichard
103929,60056333,266069120,"BrokeBack My belly is fat, my brain has delay,..."


In [4]:
#Translates part-of-speech tags into the tags the lemmatizer understands
def pos_replace(treebank_tag):
    """Map a POS tag to the matching wordnet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and
    adverb respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    #Unrecognised tags are treated as nouns
    return wordnet.NOUN
    
#Builds the list of punctuation tokens to strip, leaving out the symbols worth keeping
keep_punct = ['?', '!', '@', ',', '.']
punct = [symbol for symbol in string.punctuation if symbol not in keep_punct]

#Swaps every non-ASCII character (e.g. emojis with no plain-text form) for a space
def remove_junk(tweet):
    """Return ``tweet`` with each character whose code point is 128 or above replaced by a space."""
    cleaned = []
    for char in tweet:
        cleaned.append(char if ord(char) < 128 else ' ')
    return ''.join(cleaned)

def chat_tokenizer(doc):
    """Turn one chat message into a list of lemmatized tokens.

    Steps: blank out non-ASCII characters, tokenize with NLTK's
    TweetTokenizer (handles stripped), drop the punctuation tokens listed
    in ``punct``, POS-tag what is left and lemmatize each token using the
    POS returned by ``pos_replace``.
    """
    #Tokenizes the ASCII-only text with the tweet tokenizer, as chats resemble tweets
    tokenizer = TweetTokenizer(strip_handles = True)
    tokens = tokenizer.tokenize(remove_junk(doc))
    #Leaves out the punctuation that should not be kept
    kept_tokens = [token for token in tokens if token not in punct]
    #Tags each token, then lemmatizes it with the lemmatizer-compatible POS
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos_replace(tag))
            for token, tag in pos_tag(kept_tokens)]

In [5]:
#Replaces each chat string with its list of lemmatized tokens
#NOTE: not idempotent -- re-running this cell would feed token lists back into chat_tokenizer
df['chats'] = df['chats'].apply(chat_tokenizer)

In [8]:
import gensim.corpora as corpora
import gensim

# Tokenized chats: one list of tokens per message
texts = df.chats

# Create Dictionary from the tokenized chats
id2word = corpora.Dictionary(texts)

# Create Corpus: the bag-of-words form of every document
corpus = list(map(id2word.doc2bow, texts))

In [9]:
num_topics = 3
# Fixed seed so a Restart & Run All trains from the same starting point instead of a fresh random one
lda_seed = 42
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=lda_seed)
# Applies the trained model to the corpus (nothing is printed here)
doc_lda = lda_model[corpus]

In [None]:
import os
import pickle 
import pyLDAvis
# pyLDAvis 3.x renamed its gensim adapter module to `gensim_models`; fall back to the old name
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
# `os` has to be imported in this cell: it was used here without ever being imported
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))


LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)

#For saving/loading models
# CAUTION: pickle.load executes arbitrary code -- only load files this notebook wrote itself

# with open(LDAvis_data_filepath, 'wb') as f:
#     pickle.dump(LDAvis_prepared, f)

# with open(LDAvis_data_filepath, 'rb') as f:
#     LDAvis_prepared = pickle.load(f)
#pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')

LDAvis_prepared