In [8]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

import gensim
from gensim import corpora
import pickle

In [10]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [11]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

In [12]:
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [13]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [14]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [16]:
chat_file = pd.read_csv('all-posts-public-main-chatroom/freecodecamp_casual_chatroom.csv', encoding = 'utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
chats = chat_file.text

In [19]:
chats_tokenized = []
# preprocessing - WILL TAKE 30+ MINS

for chat in chats:
    tokens = prepare_text_for_lda(str(chat))
    chats_tokenized.append(tokens)

In [21]:
# 5-10 mins
dictionary = corpora.Dictionary(chats_tokenized)
corpus = [dictionary.doc2bow(text) for text in chats_tokenized]

pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [None]:
dictionary = load_from_text('dictionary.gensim')

In [23]:
# WILL TAKE A WHILE
# took 1-2 hours
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=5)
ldamodel.save('model5t_5p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.031*"think" + 0.028*"would" + 0.022*"thanks" + 0.021*"really"')
(1, '0.401*"SCREEN_NAME" + 0.023*"function" + 0.019*"return" + 0.016*"star2"')
(2, '0.019*"though" + 0.016*"something" + 0.015*"maybe" + 0.014*"try"')
(3, '0.129*"SCREEN_NAME" + 0.051*"sparkle" + 0.037*"point" + 0.026*"thumbsup"')
(4, '0.024*"stuff" + 0.019*"going" + 0.019*"people" + 0.018*"thing"')


In [24]:
# WILL TAKE A WHILE
# took ?? hours (stsarted 11:40 pm, not done by 1:30am)
NUM_TOPICS = 15
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=5)
ldamodel.save('model15t_5p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.144*"really" + 0.105*"going" + 0.090*"start" + 0.074*"cookie"')
(1, '0.105*"though" + 0.076*"first" + 0.075*"learn" + 0.047*"course"')
(2, '0.085*"thanks" + 0.082*"function" + 0.068*"return" + 0.056*"never"')
(3, '0.058*"image" + 0.051*"everyone" + 0.051*"enough" + 0.038*"style"')
(4, '0.172*"sparkle" + 0.090*"thumbsup" + 0.089*"brownie" + 0.085*"send"')
(5, '0.860*"SCREEN_NAME" + 0.025*"star2" + 0.013*"error" + 0.011*"thank"')
(6, '0.084*"better" + 0.074*"look" + 0.052*"seem" + 0.046*"sorry"')
(7, '0.109*"people" + 0.067*"project" + 0.066*"freecodecamp" + 0.035*"create"')
(8, '0.133*"point" + 0.083*"right" + 0.060*"actually" + 0.034*"coffee"')
(9, '0.060*"using" + 0.058*"maybe" + 0.045*"pretty" + 0.043*"problem"')
(10, '0.147*"think" + 0.137*"would" + 0.066*"something" + 0.047*"things"')
(11, '0.130*"stuff" + 0.100*"thing" + 0.031*"wanna" + 0.031*"company"')
(12, '0.056*"check" + 0.055*"anything" + 0.051*"always" + 0.039*"every"')
(13, '0.059*"write" + 0.048*"getting" + 0.047*"

In [25]:
# WILL TAKE A WHILE
# took ~3 hours 
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=10)
ldamodel.save('model5t_10p_4w.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.542*"SCREEN_NAME" + 0.024*"thanks" + 0.009*"great" + 0.007*"thank"')
(1, '0.069*"sparkle" + 0.050*"point" + 0.036*"thumbsup" + 0.035*"brownie"')
(2, '0.026*"think" + 0.023*"would" + 0.022*"people" + 0.020*"something"')
(3, '0.020*"function" + 0.017*"return" + 0.013*"change" + 0.012*"question"')
(4, '0.023*"really" + 0.013*"stuff" + 0.012*"things" + 0.011*"would"')
