# LDA Slack Messages

Background reading: [How does LDA work? I'll explain using emoji](https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d)

## Create and preprocess data

In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import nltk
import ssl
from nltk import word_tokenize

# The NLTK downloader was failing with an SSL error on the author's machine.
# Work around it by making unverified HTTPS contexts the default, when the
# running Python exposes that hook.
# NOTE(review): this swaps the process-wide default HTTPS context factory on
# the ssl module, so it is not limited to the NLTK downloads below.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# NLTK resources fetched up front for the helpers defined in this cell.
for _resource in ('wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# Single stemmer instance shared by the lemmatize helpers below.
stemmer = SnowballStemmer("english")


def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb and return the stem of the result.

    ``text`` is passed to ``WordNetLemmatizer().lemmatize`` with ``pos='v'``;
    the lemma is then passed to the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def lemmatize(text):
    """Lemmatize and stem every token in ``text``.

    Each token is lemmatized as a verb (``pos='v'``) with WordNet, and the
    lemma is then stemmed with the module-level ``stemmer``.

    Args:
        text: Iterable of token strings (``preprocess`` passes a list).

    Returns:
        list: One stemmed string per input token, in input order.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return [stemmer.stem(lemmatizer.lemmatize(word, pos='v')) for word in text]

def keep_nouns_and_verbs(text):
    """Return only the tokens that NLTK tags as a noun or a verb.

    ``text`` is handed to ``nltk.pos_tag``; a token is kept when the first
    two characters of its tag are ``NN`` or ``VB``. Input order is preserved.
    """
    wanted_prefixes = ('NN', 'VB')
    kept = []
    for token, tag in nltk.pos_tag(text):
        if tag[:2] in wanted_prefixes:
            kept.append(token)
    return kept

# Tokenize, filter, and lemmatize a single message
def preprocess(text):
    """Turn one raw message into a list of stemmed noun/verb tokens.

    Steps, in order:
      1. tokenize with ``gensim.utils.simple_preprocess``;
      2. drop gensim stop words and tokens of three characters or fewer;
      3. keep only nouns and verbs (``keep_nouns_and_verbs``);
      4. lemmatize and stem what is left (``lemmatize``).

    Returns the resulting list, which may be empty.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    candidates = [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]
    return lemmatize(keep_nouns_and_verbs(candidates))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/JoeSkimmons/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/JoeSkimmons/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Preprocessing

In [2]:
# Preprocess every line of the message dump, keeping only the messages that
# still contain at least one token afterwards.
processed_msgs = []

# `with` closes the file handle as soon as the loop finishes (or raises);
# iterating a bare open(...) left it open.
with open('../data/general_text.txt', 'r') as messages_file:
    for line in messages_file:
        tokens = preprocess(line.strip())
        if tokens:
            print(tokens)
            processed_msgs.append(tokens)

print(processed_msgs)

['background', 'slide', 'evolut', 'idea']
['think']
['hour', 'problem', 'solut']
['plan', 'minut', 'hour', 'revis', 'speak', 'today', 'talk', 'product', 'user']
['give', 'advic', 'shruti', 'say']
['list', 'serv', 'ventur', 'capit', 'recommend', 'reach']
['case', 'question', 'guy', 'experi', 'employ', 'want', 'employe', 'know', 'work']
['preval']
['employe', 'compani']
['visibl', 'option', 'post']
['idea']
['satisfi', 'confidenti', 'requir']
['depart', 'want', 'know', 'work']
['put', 'look', 'mess']
['form', 'group']
['think', 'connect', 'linkedin', 'boss', 'connect', 'network']
['connect', 'network', 'work', 'give', 'visibl', 'permiss', 'peopl', 'connect']
['sure']
['think', 'superior', 'form', 'group', 'want', 'group', 'fee']
['exampl', 'compani', 'group']
['group', 'superior']
['think', 'twitter', 'fee', 'peopl', 'group', 'work', 'peopl', 'post', 'receiv', 'feedback', 'smiley', 'face', 'network', 'base', 'there', 'intra', 'compani', 'engag']
['auto', 'generat', 'post', 'differenti', 

['come', 'visit']
['work', 'homework', 'today', 'tomorrow', 'want', 'reserv', 'room', 'tomorrow']
['saturday', 'need', 'time', 'midtown']
['saturday', 'reserv', 'need', 'scienc', 'engin', 'room', 'friday', 'reserv', 'scienc', 'engin', 'room', 'want']
['meet', 'butler']
['thank', 'reserv', 'work']
['hope', 'secur', 'let']
['pitch', 'deck', 'summari', 'decid', 'product', 'imag', 'diagram', 'need', 'market', 'competitor']
['email', 'tonight']
['connect', 'come', 'sach', 'need', 'draft', 'descript', 'slide', 'request', 'draft', 'weekend', 'weekend', 'start', 'get', 'environ', 'know', 'credit']
['educ', 'account', 'credit', 'cloud', 'comput', 'data', 'class']
['go', 'need', 'credit']
['professor', 'mention', 'credit', 'multipl', 'account']
['mention', 'alia', 'account']
['rough', 'draft', 'compani', 'overview', 'drive', 'folder', 'sourc', 'sale', 'stat', 'sell', 'power', 'hubspot', 'salesforc', 'competitor', 'look', 'insight', 'squar', 'accord', 'sheila', 'gulati']
['need', 'credit']
['inst

['farm', 'compani', 'know', 'person', 'lead', 'group', 'email', 'pictur', 'blog', 'submit']
['think', 'rest', 'map']
['thank', 'post', 'piazza', 'instructor', 'sourav']
['resolv', 'pictur', 'tell', 'copi']
['link', 'doc']
['person']
['look']
['know']
['need', 'name']
['blog', 'feel', 'paparazzi']
['think', 'cross', 'race', 'effect', 'keep', 'recogn']
['name', 'illustr']
['guy', 'day', 'get']
['even', 'time', 'check', 'piazza', 'updat', 'meet', 'suppos', 'happen']
['tomorrow', 'night', 'tri', 'conquer', 'mountain', 'work', 'build', 'begin', 'haha']
['agre', 'question', 'custom']
['googl', 'right']
['send', 'video']
['question', 'custom', 'one']
['titl']
['compil', 'list', 'send', 'custom', 'confirm', 'version', 'pitch', 'deck', 'file']
['review', 'mroe', 'send']
['exampl', 'contact', 'slide', 'version']
['ethan', 'dunn', 'master', 'scienc', 'student', 'columbia', 'univers', 'cours', 'build', 'technolog', 'startup', 'class', 'identifi', 'busi', 'need', 'solv', 'team', 'decid', 'custom', 

['blog', 'post', 'includ', 'thing', 'lesson', 'learn', 'class', 'practic', 'pitch', 'photo', 'class']
['take', 'instruct']
['present', 'pictur']
['pictur']
['pictur', 'person', 'pitch', 'class']
['tomorrow', 'night']
['pictur', 'begin', 'peopl', 'present', 'tell', 'differ', 'sure']
['peopl', 'tomorrow', 'time']
['overlap']
['googl', 'organ', 'structur']
['think', 'sentiment', 'student', 'pitch', 'pic']
['mention', 'team']
['put', 'pictur']
['team']
['googl', 'say', 'organ', 'flat']
['note', 'pitch', 'yesterday']
['class', 'yesterday']
['know', 'team', 'pitch']
['team']
['rememb', 'team', 'biolyt']
['forth', 'team', 'face', 'recognit']
['pitch', 'post', 'piazza']
['team', 'biolyt', 'say', 'intend', 'analyz', 'data', 'paper', 'determin', 'one', 'approv', 'money', 'compani']
['shruti', 'ask', 'question', 'compani', 'go', 'answer', 'go', 'target', 'medium', 'compani']
['forth', 'team', 'attent', 'face', 'recognit', 'idea', 'go', 'time']
['sorri', 'kind', 'present', 'work', 'today', 'week',

['guess', 'happen', 'word', 'email', 'tonight']
['rithvik', 'usernam']
['meet']
['respons', 'go']
['number', 'addon', 'crunchbas', 'think', 'preview', 'scroll', 'crunchbas', 'articl']
['crunchbas', 'entri', 'number', 'check']
['deal', 'evalu', 'right']
['think', 'week', 'date']
['thank']
['think', 'push', 'teach', 'creat', 'train', 'rememb', 'rachel', 'say', 'detect', 'email', 'meet', 'provid', 'valu']
['feedback', 'learn', 'content', 'want', 'avoid']
['busi', 'learn', 'need', 'expertis', 'lead', 'develop', 'process']
['shruti', 'gandhi', 'keep', 'say', 'train', 'defin', 'develop', 'skill', 'entri', 'account', 'execut', 'feedback', 'provid', 'learn', 'sdrs']
['add', 'question', 'question', 'custom', 'document']
['blog', 'post', 'tonight', 'tomorrow', 'night', 'valentin']
['night', 'weekend']
['post', 'question', 'studi', 'piazza', 'that', 'hear']
['post', 'exist', 'think', 'person', 'post', 'cours']
['look', 'join', 'noon']
['guy', 'email', 'morn', 'realiz', 'send', 'night']
['problem'

['messag', 'send']
['compani', 'sale', 'survey']
['form']
['content', 'follow']
['ryan', 'columbia', 'student', 'compani', 'columbia', 'busi', 'school', 'startup', 'work', 'project', 'incentiv', 'sale', 'team', 'seek', 'advic', 'potenti', 'custom', 'mind', 'complet', 'survey', 'thank']
['ryan']
['skip', 'term', 'sale', 'food', 'startup', 'think', 'target', 'right']
['sell', 'compani', 'compani']
['agre', 'harm', 'contact', 'compani', 'email', 'research', 'requir', 'survey', 'question']
['send', 'email', 'right']
['yeah', 'case', 'vari', 'size', 'data', 'depend', 'custom']
['phone', 'reach', 'schedul', 'want', 'email', 'survey', 'issu', 'survey', 'respons', 'date']
['send', 'email', 'show', 'send', 'email', 'compani', 'wonder', 'send', 'email']
['media', 'send', 'survey', 'link', 'group', 'send', 'messag', 'friend', 'ask', 'survey', 'respons', 'rate']
['email', 'type', 'communic', 'tool', 'wonder', 'convent']
['time', 'address', 'compani', 'compani']
['thank', 'advic']
['question', 'que

## Create Dictionary out of training data

In [5]:
# Map every token seen in the preprocessed messages to an integer id.
dictionary = gensim.corpora.Dictionary(processed_msgs)

# Prune the vocabulary in place. Per gensim's filter_extremes, tokens that
# appear in fewer than `no_below` documents, or in more than the `no_above`
# fraction of documents, are dropped.
dictionary.filter_extremes(no_below=5, no_above=0.6)

print(dictionary)

# Bag-of-words vector for each message, in the same order as processed_msgs.
bow_corpus = list(map(dictionary.doc2bow, processed_msgs))

Dictionary(272 unique tokens: ['idea', 'slide', 'think', 'hour', 'problem']...)


## Create LDA model

In [13]:
# Number of topics to fit.
# NOTE(review): the cell previously left `num_topics` blank, which is a syntax
# error. 10 is inferred from the recorded output of the unseen-sentence cell
# (it lists topics up to index 9) — confirm this is the intended value.
NUM_TOPICS = 10

# Train the LDA model on the bag-of-words corpus using 2 worker processes and
# 4 passes over the corpus.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=NUM_TOPICS,
                                       id2word=dictionary,
                                       passes=4,
                                       workers=2)

# Print each topic id together with its weighted top words.
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.092*"send" + 0.078*"email" + 0.040*"messag" + 0.034*"sale" + 0.033*"respons" + 0.031*"media" + 0.022*"compani" + 0.021*"peopl" + 0.019*"data" + 0.018*"note"


Topic: 1 
Words: 0.052*"rachel" + 0.035*"yeah" + 0.035*"say" + 0.028*"base" + 0.028*"user" + 0.025*"account" + 0.025*"today" + 0.024*"meet" + 0.022*"think" + 0.021*"go"


Topic: 2 
Words: 0.066*"post" + 0.050*"team" + 0.049*"class" + 0.045*"compani" + 0.045*"blog" + 0.030*"deal" + 0.026*"meet" + 0.025*"thing" + 0.021*"check" + 0.018*"startup"


Topic: 3 
Words: 0.142*"think" + 0.031*"talk" + 0.024*"group" + 0.023*"salesforc" + 0.022*"connect" + 0.022*"problem" + 0.021*"point" + 0.020*"mention" + 0.020*"valu" + 0.019*"want"


Topic: 4 
Words: 0.058*"know" + 0.043*"product" + 0.037*"look" + 0.033*"slide" + 0.033*"help" + 0.033*"feel" + 0.031*"think" + 0.029*"need" + 0.021*"pitch" + 0.021*"analysi"


Topic: 5 
Words: 0.095*"channel" + 0.085*"join" + 0.065*"link" + 0.033*"go" + 0.031*"linkedin" + 0.030*"person" + 0

## Visualize Results

In [14]:
# Visualize the fitted topics with pyLDAvis.
# NOTE(review): the imports live in this cell rather than the top import cell;
# newer pyLDAvis releases may expose this module under a different name
# (gensim_models) — confirm against the installed version.
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
# Build the visualization data from the trained model, the bag-of-words corpus,
# and the dictionary they were built from.
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary)
# Turn on notebook rendering, then show the prepared visualization.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Try to group an unseen sentence

In [121]:
unseen_document = "We should put that in the pitch deck"

# Run the sentence through the same preprocessing as the training messages and
# convert it to a bag-of-words vector over the training dictionary.
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# Topic scores for the sentence, highest score first.
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: -1 * pair[1])

# One line per topic: its score, its id, and its five top words.
report_line = "Score: {}\t Topic {}: {}"
for index, score in ranked_topics:
    print(report_line.format(score, index, lda_model.print_topic(index, 5)))

Score: 0.6999968886375427	 Topic 7: 0.071*"pitch" + 0.054*"know" + 0.029*"send" + 0.028*"deck" + 0.027*"slide"
Score: 0.03333539515733719	 Topic 3: 0.054*"look" + 0.025*"time" + 0.025*"contact" + 0.024*"thing" + 0.019*"employe"
Score: 0.03333362936973572	 Topic 1: 0.059*"question" + 0.048*"post" + 0.036*"blog" + 0.035*"note" + 0.035*"say"
Score: 0.033333443105220795	 Topic 9: 0.070*"sale" + 0.052*"think" + 0.048*"send" + 0.043*"idea" + 0.041*"manag"
Score: 0.0333334356546402	 Topic 0: 0.079*"peopl" + 0.065*"data" + 0.031*"linkedin" + 0.025*"compani" + 0.023*"connect"
Score: 0.0333334356546402	 Topic 2: 0.049*"compani" + 0.032*"email" + 0.024*"call" + 0.019*"person" + 0.019*"think"
Score: 0.0333334356546402	 Topic 4: 0.099*"channel" + 0.094*"work" + 0.087*"join" + 0.053*"tonight" + 0.050*"email"
Score: 0.0333334356546402	 Topic 5: 0.076*"meet" + 0.054*"sale" + 0.030*"compani" + 0.028*"tuesday" + 0.025*"need"
Score: 0.0333334356546402	 Topic 6: 0.083*"thank" + 0.075*"think" + 0.070*"soun