# LDA Slack Messages

https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d

## Create and preprocess data

In [65]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import nltk
import ssl
from nltk import word_tokenize

# used to download nltk stuff and get around error I was getting 
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

stemmer = SnowballStemmer("english")


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def lemmatize(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def keep_nouns(text):
    text = text.split(" ")
    tags = nltk.pos_tag(text)
    nouns = [word for word,pos in tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    
    return nouns

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    # remove stop words and smol words
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    
    # remove all but nouns
    nouns = keep_nouns(result)
    
    # lemmatize! the nouns
    lemmatized_nouns = lemmatize(nouns)
            
    return result

this is a test sentence full of nouns and verb
['THis', 'test', 'sentence', 'nouns', 'verbs']


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/JoeSkimmons/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/JoeSkimmons/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Preprocessing

In [62]:
# iterate through all given messages and preprocess them
processed_msgs = []

for l in open('../data/general_text.txt', 'r'):
    l = l.strip()
    p = preprocess(l)
    if p:
        print(p)
        processed_msgs.append(p)

print(processed_msgs)

['background', 'slide', 'evolut', 'idea']
['think', 'right']
['hour', 'problem', 'solut']
['plan', 'minut', 'hour', 'revis', 'speak', 'today', 'interest', 'talk', 'product', 'user']
['give', 'good', 'advic', 'shruti', 'say']
['list', 'serv', 'columbia', 'ventur', 'capit', 'recommend', 'reach']
['case', 'question', 'guy', 'experi', 'employ', 'want', 'employe', 'know', 'youv', 'work']
['person', 'havent', 'preval']
['employe', 'compani']
['visibl', 'option', 'post']
['idea']
['satisfi', 'confidenti', 'requir']
['depart', 'want', 'know', 'work']
['put', 'fee', 'look', 'like', 'mess']
['true']
['form', 'group']
['think', 'connect', 'linkedin', 'probabl', 'connect', 'boss', 'meaningless', 'connect', 'network']
['mayb', 'connect', 'network', 'work', 'like', 'give', 'visibl', 'permiss', 'peopl', 'connect']
['sure']
['think', 'better', 'superior', 'form', 'group', 'want', 'group', 'fee']
['exampl', 'compani', 'group']
['group', 'customiz', 'superior']
['dont', 'think', 'like', 'linkedin', 'int

['actual', 'help', 'tone', 'phrase', 'tricki', 'hard', 'googl', 'learn']
['feel', 'free', 'tell', 'express', 'inappropri', 'improv']
['second', 'sentenc', 'think', 'better', 'phrase', 'target', 'pain', 'point']
['thought', 'complet', 'correspond', 'mockup', 'better', 'lead', 'manag', 'enhanc', 'autom', 'reach', 'follow', 'custom', 'inform', 'collect', 'analysi']
['think', 'post', 'piazza', 'find', 'help', 'clear', 'thing']
['like', 'email', 'student', 'columbia', 'univers', 'class', 'build', 'technolog', 'startup', 'team', 'work', 'product', 'improv', 'organ', 'custom', 'interact', 'sale', 'develop', 'repres', 'sdrs', 'like', 'discuss', 'idea', 'summar', 'attach', 'document', 'thank']
['subject', 'chang']
['look', 'good']
['preliminari', 'research', 'potenti', 'competitor', 'stand', 'similar', 'idea', 'product', 'saleshack', 'featur', 'pretti', 'close', 'attent']
['look', 'competit', 'analysi', 'outreach', 'acquir', 'saleshack', 'month']
['avail', 'meet', 'help', 'assembl', 'pitch', 'd

['status', 'decid', 'color']
['hope', 'haven', 'contact', 'connect']
['thank', 'see', 'overlap', 'split', 'thing', 'region', 'necessari']
['avoid', 'make', 'duplic', 'contact']
['sure', 'linkedin', 'algorithm', 'mayb', 'send', 'messag', 'long', 'overlap']
['draft', 'messag', 'linkedin', 'hope', 'person', 'profession']
['teddi']
['ryan', 'engin', 'student', 'columbia', 'univers', 'add', 'linkedin', 'day', 'read', 'profil', 'sale', 'experi', 'help', 'team', 'especi', 'experi', 'salesforc', 'work', 'product', 'artifici', 'intellig', 'improv', 'sale', 'approach', 'current', 'tri', 'feedback', 'idea', 'interest', 'attach', 'product', 'mockup']
['nice', 'like', 'introduc', 'swap', 'sentenc', 'start', 'read', 'profil', 'team', 'work', 'product', 'feel', 'like', 'profil', 'reach']
['thank', 'think', 'sequenc']
['manag', 'draft', 'follow', 'messag', 'basic', 'sdrs', 'sure', 'tone']
['manag']
['ryan', 'engin', 'student', 'columbia', 'univers', 'add', 'linkedin', 'day', 'read', 'profil', 'sale', 

['check', 'compani', 'time', 'idea']
['good', 'come', 'matrix', 'checklist', 'checklist', 'suitabl', 'give', 'tri', 'promot', 'comprehens', 'multipl', 'dimens', 'product', 'want', 'sure', 'hard', 'start']
['think', 'dimens', 'choos']
['competitor', 'rank']
['yeah', 'exact', 'question', 'give', 'larg', 'number', 'compani', 'consid', 'competitor', 'best', 'comparison', 'similar', 'comprens', 'product', 'product', 'dimension', 'nich', 'product']
['exact', 'type', 'competitor', 'look', 'social', 'media', 'platform', 'sale', 'team', 'train', 'softwar']
['good', 'question', 'ask', 'sale', 'manag', 'hear', 'similar']
['yeah', 'good', 'idea', 'talk', 'friend', 'week', 'mention', 'manag', 'look', 'potenti', 'altern', 'salesforc', 'think', 'go', 'frame', 'product', 'primarili', 'train', 'tool', 'oppos', 'simpli', 'make', 'live', 'easier', 'deal', 'flow', 'effect']
['long', 'messag']
['join', 'channel']
['join', 'channel']
['join', 'channel']
['make', 'sens', 'think', 'look', 'product', 'train', 

['posit', 'feedback', 'get', 'center', 'social', 'media', 'aspect', 'advantag', 'team', 'understand', 'field', 'user']
['intern', 'social', 'media', 'platform', 'limit', 'sale', 'avenu', 'compar', 'scan', 'social', 'media', 'hire']
['like', 'compani', 'year', 'like', 'leav', 'soon']
['sure', 'softwar', 'mention']
['sorri', 'signal', 'subway', 'interest', 'send', 'email', 'explain', 'discoveri', 'work']
['scan', 'social', 'media', 'hire', 'good', 'idea', 'face', 'data', 'issu', 'data']
['like', 'issu']
['opinion', 'time', 'draft', 'outlin', 'say', 'close', 'lookin']
['issu', 'discuss', 'think', 'inform', 'data', 'issu']
['interest', 'make', 'direct']
['professor', 'firm', 'potenti', 'design', 'partner', 'investor']
['ryan', 'kind', 'work', 'work']
['interest', 'internship', 'opportun', 'ask', 'mention', 'tool', 'express', 'product', 'ask', 'share', 'idea', 'group', 'want', 'disrupt', 'direct', 'mention', 'have', 'difficulti', 'good', 'opportun']
['current', 'work', 'simpli', 'draft', 'o

['reach', 'linkedin', 'email', 'attach', 'version', 'mockup']
['schedul', 'interest', 'invit', 'influenc', 'futur']
['beta', 'user', 'slack', 'channel']
['goal', 'total', 'sdrs', 'sale', 'manag']
['share', 'reach', 'email', 'slack']
['want', 'optim', 'qualiti', 'inform', 'call', 'tomorrow', 'go', 'send', 'list', 'question', 'want', 'answer', 'tonight', 'think', 'import', 'send', 'collat', 'start', 'googl', 'drive', 'know']
['piazza', 'post', 'team', 'verif', 'case', 'check', 'check', 'mintyy']
['thank']
['submit', 'right']
['submit', 'midterm', 'pitch', 'deck']
['cool', 'pass', 'tonight']
['guy', 'look', 'like', 'go', 'stick', 'boston', 'storm', 'origin', 'plan', 'afternoon', 'car', 'have', 'troubl', 'snow', 'doesnt', 'look', 'like', 'abl', 'class', 'meet', 'follow']
['problem', 'justin', 'stay', 'safe']
['check', 'suppos', 'blog', 'post', 'week', 'pictur', 'work', 'chanc', 'class', 'right', 'time', 'littl', 'late']
['check', 'mention', 'come', 'guidelin', 'piazza', 'post', 'shouldn', 

['thank', 'advic']
['add', 'question', 'question']
['quick', 'read', 'sale', 'rep', 'think']
['mention', 'recommend', 'price', 'negoti', 'use', 'teach', 'sale', 'rep']
['timelin', 'displac', 'think', 'depend', 'sale', 'manag', 'decid']
['clarifi', 'think', 'entir', 'sale', 'team', 'autom']
['sorri', 'run', 'late', 'later']
['say', 'sale', 'team', 'manag', 'think', 'have', 'account', 'execut', 'handl', 'lead', 'start', 'finish', 'function', 'today', 'substitut', 'tech']
['head', 'butler', 'abl', 'meet']
['join', 'channel']
['join', 'channel']
['speak', 'professor', 'advis', 'contact', 'sale', 'oper', 'team', 'sale', 'manag', 'sdrs', 'mention', 'industri', 'trend', 'sale', 'manag', 'sale', 'oper', 'team', 'look', 'displac', 'sdrs', 'tech', 'list', 'preliminari', 'idea', 'say', 'lead', 'forecast', 'viabl', 'space', 'look', 'overrun', 'mean', 'includ', 'sale', 'manag', 'sale', 'oper', 'team', 'market', 'research', 'mean', 'focus', 'gamifi', 'train', 'data', 'process', 'lead', 'forecast', '

## Create Dictionary out of training data

In [45]:
dictionary = gensim.corpora.Dictionary(processed_msgs)

dictionary.filter_extremes(no_below=3, no_above=0.6)

print(dictionary)

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_msgs]

Dictionary(334 unique tokens: ['idea', 'slide', 'hour', 'plan', 'product']...)


## Create LDA model

In [46]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 10, 
                                   id2word = dictionary,                                    
                                   passes = 15,
                                   workers = 2)

for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.069*"deal" + 0.045*"customers" + 0.041*"value" + 0.038*"right" + 0.035*"works" + 0.028*"evaluation" + 0.027*"sales" + 0.027*"talk" + 0.021*"competitor" + 0.021*"companies"


Topic: 1 
Words: 0.058*"questions" + 0.055*"google" + 0.049*"notes" + 0.040*"sounds" + 0.039*"slides" + 0.038*"mintyy" + 0.032*"pitch" + 0.032*"problem" + 0.027*"yeah" + 0.025*"drive"


Topic: 2 
Words: 0.099*"time" + 0.079*"post" + 0.069*"blog" + 0.056*"class" + 0.051*"tuesday" + 0.041*"help" + 0.036*"posts" + 0.022*"group" + 0.021*"accounts" + 0.018*"wait"


Topic: 3 
Words: 0.089*"work" + 0.073*"pitch" + 0.041*"look" + 0.041*"tomorrow" + 0.040*"deck" + 0.031*"guys" + 0.029*"night" + 0.025*"tonight" + 0.025*"class" + 0.024*"check"


Topic: 4 
Words: 0.086*"data" + 0.063*"product" + 0.048*"linkedin" + 0.043*"messages" + 0.035*"company" + 0.034*"need" + 0.030*"things" + 0.028*"message" + 0.020*"hope" + 0.020*"companies"


Topic: 5 
Words: 0.081*"meet" + 0.079*"today" + 0.063*"media" + 0.038*"sale

## Try to group an unseen sentence

In [47]:
unseen_document = "We should put that in the pitch deck"

bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# print(lda_model[bow_vector])

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic {}: {}".format(score, index, lda_model.print_topic(index, 5)))

Score: 0.6999949812889099	 Topic 3: 0.089*"work" + 0.073*"pitch" + 0.041*"look" + 0.041*"tomorrow" + 0.040*"deck"
Score: 0.033336807042360306	 Topic 1: 0.058*"questions" + 0.055*"google" + 0.049*"notes" + 0.040*"sounds" + 0.039*"slides"
Score: 0.0333341620862484	 Topic 8: 0.130*"team" + 0.042*"sales" + 0.038*"companies" + 0.032*"ideas" + 0.031*"emails"
Score: 0.033333513885736465	 Topic 6: 0.132*"sales" + 0.111*"people" + 0.042*"managers" + 0.032*"company" + 0.028*"video"
Score: 0.03333344683051109	 Topic 0: 0.069*"deal" + 0.045*"customers" + 0.041*"value" + 0.038*"right" + 0.035*"works"
Score: 0.033333417028188705	 Topic 2: 0.099*"time" + 0.079*"post" + 0.069*"blog" + 0.056*"class" + 0.051*"tuesday"
Score: 0.033333417028188705	 Topic 4: 0.086*"data" + 0.063*"product" + 0.048*"linkedin" + 0.043*"messages" + 0.035*"company"
Score: 0.033333417028188705	 Topic 5: 0.081*"meet" + 0.079*"today" + 0.063*"media" + 0.038*"salesforce" + 0.030*"tomorrow"
Score: 0.033333417028188705	 Topic 7: 0.10