### The Process

In [2]:
# We pick the topics ahead of time even if we're not sure what the topics are
# Each document is represented as a distribution over topics
# Each document is presented as a distribution over topics.
# Each topic is represented as a distribution of words

In [3]:
# Probability Distribution
# Every docuement is a distribution of topics
# Every topic is a distribution of words

In [4]:
# Goal: You want LDA to learn the topic mix in each document, and the word mix in each topic

# Choose the number of topics you think there are in your corpus
# K = 2

# Randomly assign each word in each document to one of two topics

# Go through every word and its topic assignment in each document. Look at (1) how often the topic occurs in the document\
# and (2) how often the word occurs in the topic overall. Based on this info, assign the word a new topic.

# Go thorugh multiple iterations of this. Eventually the topics will start making sense.

In [5]:
# Input: Document-Term Matrix, Number of topics, Number of iterations

# Gensim will go through the process of finding the best word distribution\ 
# for each topic and best topic distribution for each document

### Text Cleaning

In [6]:
import pandas as pd
import numpy as np

# import the necessary libraries for LDA with Gensim
from gensim import matutils, models
import scipy.sparse



In [7]:
# df = pd.read_csv('New_Data.csv', usecols = ['all_features'], low_memory = True)
# df.head()

In [8]:
# Convert columns to string
# pre_text = df.squeeze()
# text = ' '.join(pre_text)
# print(text)

In [9]:
import spacy
spacy.load('en_core_web_sm')
from spacy.lang.en import English
parser = English()

In [10]:
# Clean text and return a list of tokens
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [11]:
# Use NLTK's wordnet to find the meaning of words, synonoms, antonyms, and more.
# Use word lemmatizer to get the root word.

In [12]:
import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Filter out stopwords

In [14]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Define a fucntion that prepares the text for topic modelling

In [19]:
# def prepare_text_for_lda(text):
#     tokens = tokenize(text)
#     tokens = [token for token in tokens if len(token) > 4]
#     tokens = [token for token in tokens if token not in en_stop]
#     tokens = [get_lemma(token) for token in tokens]
#     return tokens
# def prepare_text_for_lda(text):
#     new_tokens = []
#     for token in tokenize(text):
#         if (len(token) > 4 and token not in en_stop):
#             new_tokens.append(get_lemma(token))
#     return new_tokens

# retuns a list -  list is full of token 
def prepare_text_for_lda(text):
    return [get_lemma(token) for token in tokenize(text) if (len(token) > 4 and token not in en_stop)]

In [20]:
# Open our data read it in line by line. For each line, prepare text for LDA, then add to a list

In [21]:
import random
text_data = []
with open('dataset.csv', encoding='utf-8') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['139,stock', 'market', 'motivate', 'compensation', 'adequate', 'company', 'company', 'value', 'employee', 'terminate']
['146,good', 'benefit', 'benefit', 'company', 'care', 'employee', 'balance', 'overwork']
['171,"working', 'outstanding', 'technology', 'product', 'portfolio', 'really', 'ahead', 'competition', 'quality', 'offer', 'mobility', 'market', 'money', 'costs', 'company', 'executive', 'management', 'willing', 'whatever', 'take', 'people', 'profit', 'disgust', 'disguise', 'think', 'owner', 'guilty', 'working', 'please', 'reply', 'email', 'saturday', 'fire', 'moment', 'accept', 'furlough', 'bother', 'calling', 'working', 'commit', 'great', 'company', 'treat', 'people', 'trash', 'advance', 'company', 'treat', 'garbage', 'value', 'capability', 'space', 'mistake', 'always', 'someone', 'blame', 'course', 'fire']
['176,great', 'place', 'fresher', 'automobile', 'industry', 'flexible', 'working', 'hours', 'graduate', 'student', 'ready', 'presentation']
['211,"benefits', 'benefit', 'con

### LDA with Gensim

In [22]:
# First create a dictionary from the data, then convert to bag-of-words corpus and save the dictionary and corpus for future use.

In [23]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [24]:
# Define the number of topics you want LDA to find

In [25]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.050*"company" + 0.050*"place" + 0.050*"great" + 0.027*"package" + 0.027*"offer"')
(1, '0.049*"employee" + 0.049*"company" + 0.027*"compensation" + 0.027*"motivate" + 0.027*"terminate"')
(2, '0.036*"salary" + 0.036*"balance" + 0.036*"444,"worst" + 0.036*"work" + 0.036*"future"')
(3, '0.054*"benefit" + 0.029*"graduate" + 0.029*"hours" + 0.029*"working" + 0.029*"176,great"')
(4, '0.041*"company" + 0.028*"working" + 0.028*"treat" + 0.028*"fire" + 0.028*"people"')


### PYLDAVIS

In [26]:
# Used to interpret the topics in a topic model that has been fit to a corpus of text data
# Package extracts information from a fitted LDA topic model to inform an interactive web-based visualisation.

In [27]:
pip install pyLDAvis

Collecting pyLDAvis
  Using cached pyLDAvis-3.3.1-py2.py3-none-any.whl
Collecting scikit-learn
  Using cached scikit_learn-0.24.2-cp38-cp38-win_amd64.whl (6.9 MB)
Collecting sklearn
  Using cached sklearn-0.0-py2.py3-none-any.whl
Collecting funcy
  Using cached funcy-1.16-py2.py3-none-any.whl (32 kB)
Collecting numexpr
  Using cached numexpr-2.7.3-cp38-cp38-win_amd64.whl (93 kB)
Collecting future
  Using cached future-0.18.2-py3-none-any.whl
Collecting pandas>=1.2.0
  Using cached pandas-1.2.4-cp38-cp38-win_amd64.whl (9.3 MB)
Installing collected packages: scikit-learn, sklearn, pandas, numexpr, future, funcy, pyLDAvis
Note: you may need to restart the kernel to use updated packages.
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\ahmad\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python38\\site-packages\\sklearn\\.libs\\vcomp140.dll'
Check the permissions.



In [100]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

ModuleNotFoundError: No module named 'pyLDAvis'