### The Process

In [1]:
# We pick the number of topics ahead of time, even if we're not sure what the topics are.
# Each document is represented as a distribution over topics.
# Each topic is represented as a distribution over words.

In [2]:
# Probability Distribution
# Every document is a distribution of topics
# Every topic is a distribution of words

In [3]:
# Goal: You want LDA to learn the topic mix in each document, and the word mix in each topic

# Choose the number of topics you think there are in your corpus
# K = 2

# Randomly assign each word in each document to one of two topics

# Go through every word and its topic assignment in each document. Look at (1) how often the topic occurs in the document
# and (2) how often the word occurs in the topic overall. Based on this info, assign the word a new topic.

# Go through multiple iterations of this. Eventually the topics will start making sense.

In [4]:
# Input: Document-Term Matrix, Number of topics, Number of iterations

# Gensim will go through the process of finding the best word distribution
# for each topic and the best topic distribution for each document.

### Text Cleaning

In [18]:
import pandas as pd
import numpy as np

# import the necessary libraries for LDA with Gensim
from gensim import matutils, models
import scipy.sparse

In [21]:
# df = pd.read_csv('New_Data.csv', usecols = ['all_features'], low_memory = True)
# df.head()

In [8]:
# Convert columns to string
# pre_text = df.squeeze()
# text = ' '.join(pre_text)
# print(text)

In [17]:
import spacy
from spacy.lang.en import English

# A blank English pipeline is sufficient: `tokenize` only relies on the
# tokenizer and on lexical token attributes (`orth_`, `like_url`, `lower_`).
# The former `spacy.load('en_core_web_sm')` call loaded the full statistical
# model and threw the result away, so it is no longer executed.
parser = English()

In [22]:
# Clean text and return a list of tokens
def tokenize(text):
    """Split ``text`` into a list of normalised token strings.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by the
    placeholder ``'URL'``, tokens starting with ``'@'`` are replaced by
    ``'SCREEN_NAME'``, and every other token is lower-cased.
    """
    def _normalise(tok):
        # Placeholder substitution takes precedence over lower-casing.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [None]:
# Use NLTK's WordNet to find the meanings of words, synonyms, antonyms, and more.
# Use word lemmatizer to get the root word.

In [23]:
import nltk
# Fetch the WordNet corpus data; the lemma helpers defined below
# (`wn.morphy`, `WordNetLemmatizer`) read from it.
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the base form ``wn.morphy`` finds for ``word``.

    Falls back to ``word`` unchanged when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` using a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [None]:
# Filter out stopwords

In [24]:
# Make sure the NLTK stop-word lists are available locally, then keep the
# English list as a set so membership checks during token filtering are cheap.
nltk.download('stopwords')
en_stop = {stop_word for stop_word in nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [None]:
# Define a function that prepares the text for topic modelling

In [90]:
# def prepare_text_for_lda(text):
#     tokens = tokenize(text)
#     tokens = [token for token in tokens if len(token) > 4]
#     tokens = [token for token in tokens if token not in en_stop]
#     tokens = [get_lemma(token) for token in tokens]
#     return tokens
# def prepare_text_for_lda(text):
#     new_tokens = []
#     for token in tokenize(text):
#         if (len(token) > 4 and token not in en_stop):
#             new_tokens.append(get_lemma(token))
#     return new_tokens

# Returns a list of tokens
def prepare_text_for_lda(text):
    """Tokenize ``text`` and return the lemmatized tokens kept for LDA.

    A token is kept only when it is longer than four characters and is not
    in ``en_stop``; every kept token is passed through ``get_lemma``.

    Returns:
        list: the lemmatized tokens, in their original order.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop
    ]


# Backward-compatible alias: the function used to be defined only under this
# misspelled name, while the data-loading cell calls `prepare_text_for_lda`
# (a NameError on a fresh kernel).
prepare_text_for_lds = prepare_text_for_lda

In [91]:
# Open our data and read it in line by line. Each line is prepared for LDA; a random ~1% sample of lines is printed and added to a list.

In [96]:
import random

# Fixed seed so the sampled documents (and everything trained on them) are
# reproducible across Restart & Run All.
SAMPLING_SEED = 42
# A line is kept when the random draw exceeds this value, i.e. roughly 1% of lines.
SAMPLE_THRESHOLD = .99

sampler = random.Random(SAMPLING_SEED)
text_data = []
# NOTE(review): the file is read as raw text lines rather than parsed as CSV,
# so a leading field can fuse with the first word (the recorded output contains
# tokens such as '14,"stay'). Consider the `csv` module once the column layout
# is confirmed.
with open('dataset.csv', encoding='utf-8') as f:
    for line in f:
        # Draw first so the comparatively expensive tokenizer only runs on
        # the lines that are actually kept.
        if sampler.random() > SAMPLE_THRESHOLD:
            tokens = prepare_text_for_lda(line)
            print(tokens)
            text_data.append(tokens)

['14,"stay', 'clear', 'company', 'people', 'struggle', 'survive', 'toxic', 'environment', 'toxic', 'culture', 'primarily', 'leadership', 'employee', 'value', 'recognition', 'appreciation', 'support', 'disposable', 'widget', 'discard']
['46,great', 'opportunity', 'stable', 'company', 'benefit', 'growth']
['57,embedded', 'software', 'engineer', 'environment', 'competitive', 'learn', 'things', 'project', 'assign', 'working', 'hours', 'project', 'assignment']
['73,neutral', 'globally', 'company', 'master', 'level', 'politics', 'departmental', 'performance', 'rather', 'organization']
['132,good', 'technology', 'level', 'active', 'safety', 'leadership', 'active', 'safety', 'leadership', 'level', 'strong', 'engineer', 'working', 'level', 'leaders', 'compromise', 'organization']
['329,"great', 'place', 'develop', 'personally', 'professionally', 'familiar', 'atmosphere', 'visible', 'hierarchy', 'office', 'collaborative', 'people', 'energy', 'organizational', 'approach', 'management', 'continuou

### LDA with Gensim

In [94]:
# First create a dictionary from the data, then convert to bag-of-words corpus and save the dictionary and corpus for future use.

In [95]:
from gensim import corpora
import pickle

# Map every unique token to an integer id, then encode each document as a
# bag-of-words using that mapping.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artifacts for later cells. The context manager guarantees the
# pickle file is flushed and closed (the bare `open(...)` handle was leaked).
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [72]:
# Define the number of topics you want LDA to find

In [97]:
import gensim

NUM_TOPICS = 5
# Fixed seed so repeated runs on the same corpus yield the same topics.
LDA_RANDOM_STATE = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_RANDOM_STATE,
)
ldamodel.save('model5.gensim')

# Print each topic together with its five most significant words.
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.087*"people" + 0.047*"benefit" + 0.047*"interest" + 0.047*"holiday" + 0.047*"better"')
(1, '0.078*"everything" + 0.078*"332,good" + 0.078*"balace" + 0.078*"going" + 0.078*"nothing"')
(2, '0.019*"hire" + 0.019*"abreast" + 0.019*"match" + 0.019*"technology" + 0.019*"execution"')
(3, '0.069*"balance" + 0.069*"nothing" + 0.069*"culture" + 0.069*"143,"best" + 0.069*"diversity"')
(4, '0.053*"culture" + 0.053*"hire" + 0.029*"147,"aptiv" + 0.029*"management" + 0.029*"decent"')


### PYLDAVIS

In [48]:
# Used to interpret the topics in a topic model that has been fit to a corpus of text data
# Package extracts information from a fitted LDA topic model to inform an interactive web-based visualisation.

In [50]:
# Reload the artifacts written by the earlier cells of this notebook.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# this notebook produced itself. The context manager closes the handle that
# the bare `open(...)` call used to leak.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

# Requires the `pyLDAvis` package in the kernel's environment; if it is
# missing, the import below raises ModuleNotFoundError.
import pyLDAvis
try:
    # Newer pyLDAvis releases ship the gensim adapter as `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases expose the same adapter as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis

lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

ModuleNotFoundError: No module named 'pyLDAvis'