### The Process

In [1]:
# We pick the topics ahead of time even if we're not sure what the topics are
# Each document is represented as a distribution over topics.
# Each topic is represented as a distribution over words.

In [2]:
# Probability Distribution
# Every document is a distribution of topics
# Every topic is a distribution of words

In [3]:
# Goal: You want LDA to learn the topic mix in each document, and the word mix in each topic

# Choose the number of topics you think there are in your corpus
# K = 2

# Randomly assign each word in each document to one of two topics

# Go through every word and its topic assignment in each document. Look at (1) how often the topic occurs in the document
# and (2) how often the word occurs in the topic overall. Based on this info, assign the word a new topic.

# Go through multiple iterations of this. Eventually the topics will start making sense.

In [4]:
# Input: Document-Term Matrix, Number of topics, Number of iterations

# Gensim will go through the process of finding the best word distribution
# for each topic and the best topic distribution for each document

### Text Cleaning

In [5]:
import pandas as pd
import numpy as np
# Bind `plt` to the pyplot interface; the top-level `matplotlib` package
# does not expose the plotting functions (`plt.plot`, `plt.subplots`, ...).
import matplotlib.pyplot as plt
import seaborn as sns

# import the necessary libraries for LDA with Gensim
from gensim import matutils, models
import scipy.sparse



In [6]:
# df = pd.read_csv('New_Data.csv', usecols = ['all_features'], low_memory = True)
# df.head()

In [7]:
# Convert columns to string
# pre_text = df.squeeze()
# text = ' '.join(pre_text)
# print(text)

In [8]:
import spacy
# Load the small English pipeline. The returned object is not stored, so this
# call matters only through its side effects.
# NOTE(review): presumably kept as a check that the model is installed —
# confirm, otherwise the call can be dropped.
spacy.load('en_core_web_sm')
from spacy.lang.en import English
# `parser` is the object that `tokenize` calls to split raw text into tokens.
parser = English()

In [9]:
# Clean text and return a list of tokens
def tokenize(text):
    """Split ``text`` into a list of normalised token strings.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is replaced by its lowercase form.
    """
    def _normalise(token):
        # Placeholder checks come first; anything else is lowercased.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        _normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]

In [10]:
# Use NLTK's WordNet to find the meanings of words, synonyms, antonyms, and more.
# Use word lemmatizer to get the root word.

In [11]:
import nltk
# Fetch the WordNet corpus used by the lemma helpers defined below
# (the captured output shows it is skipped when already up to date).
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` unchanged when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Filter out stopwords

In [13]:
# Fetch NLTK's stopword corpus (the captured output shows it is skipped when
# already up to date).
nltk.download('stopwords')
# English stopwords as a set, used by `prepare_text_for_lda` for its
# `token not in en_stop` filter.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Define a function that prepares the text for topic modelling

In [15]:
# def prepare_text_for_lda(text):
#     tokens = tokenize(text)
#     tokens = [token for token in tokens if len(token) > 4]
#     tokens = [token for token in tokens if token not in en_stop]
#     tokens = [get_lemma(token) for token in tokens]
#     return tokens
# def prepare_text_for_lda(text):
#     new_tokens = []
#     for token in tokenize(text):
#         if (len(token) > 4 and token not in en_stop):
#             new_tokens.append(get_lemma(token))
#     return new_tokens

def prepare_text_for_lda(text):
    """Return the list of lemmatized tokens of ``text`` kept for LDA.

    Tokens come from ``tokenize``. A token is kept only if it is longer than
    three characters and is not in ``en_stop``; each kept token is passed
    through ``get_lemma``.
    """
    lemmas = []
    for token in tokenize(text):
        # Skip short tokens and stopwords before lemmatizing.
        if len(token) <= 3 or token in en_stop:
            continue
        lemmas.append(get_lemma(token))
    return lemmas

In [16]:
# Open our data and read it in line by line. For each line, prepare the text for LDA; a random sample of roughly 1% of the lines is printed and added to the list.

In [17]:
import random

# A line is kept only when its random draw exceeds this threshold, i.e.
# roughly 1% of the lines in the file end up in the corpus.
SAMPLE_THRESHOLD = .99
# Fixed seed so the same lines are sampled on every Restart & Run All.
SAMPLE_SEED = 42

# Dedicated generator: seeding it does not disturb the global `random` state.
sampler = random.Random(SAMPLE_SEED)

# NOTE(review): each raw line is tokenized as-is, so CSV structure leaks into
# the tokens (the printed output contains tokens such as '16,"cutting').
# Consider parsing the file with the `csv` module once the column layout is
# confirmed.
text_data = []
with open('dataset.csv', encoding='utf-8') as f:
    for line in f:
        # One draw per line, taken before tokenizing so the ~99% of lines
        # that are discarded never pay the tokenization cost.
        if sampler.random() <= SAMPLE_THRESHOLD:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

['16,"cutting', 'edge', 'automotive', 'technology', 'better', 'average', 'benefit', 'environment', 'change', 'become', 'software', 'development', 'company', 'focus', 'next', 'generation', 'automotive', 'solution', 'brain', 'nervous', 'system', 'automobile', 'great', 'people', 'work', 'demand', 'management', 'team', 'do', 'environment', 'constant', 'change', 'turnover', 'knowledge', 'gap']
['65,strong', 'tier', 'metro', 'detroit', 'team', 'average', 'young', 'communication', 'efficient']
['123,"lots', 'bureaucracy', 'well', 'none', 'make', 'money', 'worth', 'lots', 'bureaucracy']
['225,"best', 'company', 'automotive', 'offer', 'good', 'package', 'employee', 'nice', 'place', 'work', 'supportive', 'staff', 'nothing', 'mention', 'right']
['248,"algorithm', 'developer', 'great', 'manager', 'work', 'life', 'balance', 'frequent', 'management', 'change', 'poor', 'tech', 'lead']
['285,senior', 'technical', 'leader', 'ownership', 'work', 'life', 'balance', 'visibility', 'learning', 'safety', 'sa

### LDA with Gensim

In [18]:
# First create a dictionary from the data, then convert to bag-of-words corpus and save the dictionary and corpus for future use.

In [19]:
import pickle

from gensim import corpora

# Build the token <-> id mapping from the sampled documents, then encode each
# document as a bag-of-words using that mapping.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts for later reuse. The context manager guarantees the
# pickle file is flushed and closed even if dumping fails.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [20]:
# Define the number of topics you want LDA to find

In [28]:
import gensim

NUM_TOPICS = 5
# Fixed seed so repeated runs on the same corpus yield the same topics.
LDA_RANDOM_STATE = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=5,
    random_state=LDA_RANDOM_STATE,
)
# Derive the file name from NUM_TOPICS so the saved model always matches the
# configured topic count (still 'model5.gensim' for NUM_TOPICS = 5).
ldamodel.save(f'model{NUM_TOPICS}.gensim')

# Show the five highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Each printed topic is represented as a distribution over words
#

(0, '0.012*"good" + 0.012*"balance" + 0.012*"work" + 0.012*"salary" + 0.012*"life"')
(1, '0.063*"bureaucracy" + 0.035*"average" + 0.035*"team" + 0.034*"detroit" + 0.034*"money"')
(2, '0.050*"work" + 0.027*"salary" + 0.027*"automotive" + 0.027*"company" + 0.027*"balance"')
(3, '0.042*"good" + 0.042*"management" + 0.042*"work" + 0.042*"change" + 0.029*"life"')
(4, '0.013*"automotive" + 0.013*"environment" + 0.013*"change" + 0.013*"generation" + 0.013*"next"')


### PYLDAVIS

In [22]:
# Used to interpret the topics in a topic model that has been fit to a corpus of text data
# Package extracts information from a fitted LDA topic model to inform an interactive web-based visualisation.

In [26]:
# dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# corpus = pickle.load(open('corpus.pkl', 'rb'))
# lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

# import pyLDAvis.gensim_models
# lda_display = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary, sort_topics=False)
# pyLDAvis.display(lda_display)

### Gensim word vector visualisation of various word vectors

In [41]:
import numpy as np

# Get the interactive Tools for Matplotlib
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA


ModuleNotFoundError: No module named 'sklearn.utils'