## Some notes on LDA
Latent Dirichlet Allocation

Unsupervised learning

Assumptions:
- Documents with similar topics use similar groups of words
- Latent topics can then be found by searching for groups of words that frequently occur together in documents across the corpus
- Documents are probability distributions over latent topics
- Topics are probability distributions over words

(notes from NLP w Python on Udemy)

## Prepare data

In [3]:
import pandas as pd
#df = pd.read_csv('Data/GD_INS_df_wlabels_wsentiments.csv')
#rerun with combined reviews
#df = pd.read_csv('Data/collapsed_df_wlabels_wsentiments.csv')
#df = pd.read_csv('Data/1collapsed_allcompanies_wsentiments.csv')
#df = pd.read_csv('glassdoor_reviews_top100.csv')
import pickle
df = pd.read_pickle("./1collapsed_allcompanies_wsentiments.pkl")


In [4]:
company_names = df.company.str.lower()
company_names = company_names.unique()
customize_stop_words = company_names.tolist()
customize_stop_words.append('also')
customize_stop_words.append('na')
customize_stop_words.append('austin')
customize_stop_words.append('expedia')

customize_stop_words

['360training',
 '3m',
 'amd',
 'at&t',
 'accruent',
 'adobe',
 'amazon',
 'apple',
 'applied materials',
 'arm holdings',
 'atlassian',
 'bankvue',
 'bazaarvoice',
 'bigcommerce',
 'bioware',
 'blackbaud',
 'cirrus logic',
 'cisco systems',
 'dell',
 'dropbox',
 'ebay',
 'electronic arts',
 'epicore',
 'facebook',
 'forcepoint',
 'freescale',
 'gsdm',
 'gemalto',
 'global foundries',
 'google',
 'hewlett packard enterprises',
 'hostgator',
 'hotschedules',
 'ibm',
 'indeed',
 'informatica',
 'instacart',
 'intel',
 'invenio solutions',
 'khoros',
 'lawnstarter',
 'luminex',
 'main street hub',
 'mood media',
 'national instruments',
 'netsync network solutions',
 'nvidia',
 'oracle',
 'paypal',
 'pivot3',
 'plainview',
 'polycom',
 'procore technologies',
 'protect america',
 'q2 software',
 'qualcomm',
 'rackspace',
 'retailmenot',
 'sailpoint',
 'salesforce',
 'samsung austin semiconductor',
 'signpost',
 'silicon labs',
 'solarwinds',
 'sparefoot',
 'spiceworks',
 'stitch fix',
 't

In [5]:
from sklearn.feature_extraction import text 

my_stop_words = text.ENGLISH_STOP_WORDS.union(customize_stop_words)

In [6]:
import re
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def clean_comments(text):
    text = re.sub(r'[?|!|\'|"|#]',r'',text)
    text = re.sub(r'[.|,|)|(|\|/]',r' ',text)
    text = text.replace("@","").replace("&","").replace(":","").replace("\"","").replace("/","").replace("''","")
    text = text.replace("â€œ","").replace("â€˜","").replace("*","").replace("’","").replace("\n","").replace("$","")
    text = text.lower()
    
    if not bool(set(text.split()) & set(my_stop_words) or set(text.split())):
        return None
    
    new_text = ""
    for word in text.split():
        tmp_word = re.sub('[^a-z A-Z]+', ' ', word)
        new_text += tmp_word + " "
    new_text = new_text.strip()

    return new_text

def lemmat_list(words):
    new_words = []
    for word in words:
        new_words.append(lemmatizer.lemmatize(word))
    return new_words

#apply cleaning functions
df['cons_cleaned'] = df['cons'].apply(clean_comments)
df['cons_lemmats'] = df['cons_cleaned'].apply(lemmat_list)

df['pros_cleaned'] = df['pros'].apply(clean_comments)
df['pros_lemmats'] = df['pros_cleaned'].apply(lemmat_list)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words=my_stop_words, ngram_range=(1, 3))

In [8]:
dtm = cv.fit_transform(df['cons_cleaned'])

  'stop_words.' % sorted(inconsistent))


In [9]:
df.to_pickle("./1collapsed_allcompanies_wsentiments_cleaned.pkl")
#df.to_csv('1glassdoor_reviews_top100_cleaned.csv')

## LDA

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

In [38]:
LDA = LatentDirichletAllocation(n_components=3,random_state=42)

In [39]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

### get the vocabulary

In [163]:
#len(cv.get_feature_names())

In [162]:
#this is just random words selected from the list of words
#import random
#for i in range(10):
#    random_word_id = random.randint(0,len(cv.get_feature_names()))
#    print(cv.get_feature_names()[random_word_id])

### get the topics

In [13]:
LDA.components_

array([[0.50006432, 0.50003303, 0.50005645, ..., 0.50004114, 0.50007038,
        0.5000733 ],
       [3.49993568, 2.49996697, 9.49994355, ..., 3.49995885, 6.49992962,
        5.4999267 ]])

In [14]:
single_topic = LDA.components_[0]

In [15]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([23706, 17268, 14243,  2835,  9705,  4583, 24920,  1855, 24999,
       19180])

In [40]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')
    


THE TOP 15 WORDS FOR TOPIC #0
['time difficult', 'maximum', 'sales territories', 'outgrowing', 'summer months', 'know good', 'hardly work', 'paid hours', 'integrate', 'taxed', 'upgrading', 'bureaucracy dont', 'got good', 'far average', 'pay frequent', 'fruits', 'new programs', 'senior reps', 'schools', 'hours tough']


THE TOP 15 WORDS FOR TOPIC #1
['training', 'youre', 'product', 'hours', 'think', 'life', 'way', 'office', 'bad', 'need', 'teams', 'know', 'place', 'change', 'things', 'leadership', 'lack', 'years', 'culture', 'sales']


THE TOP 15 WORDS FOR TOPIC #2
['place', 'reviews', 'turnover', 'salary', 'training', 'main street', 'main', 'want', 'month', 'street', 'hub', 'way', 'months', 'product', 'cold', 'day', 'youll', 'business', 'youre', 'sales']




## Attaching  Topic Labels to Reviews

In [41]:
dtm.shape

(78, 25150)

In [42]:
topic_results = LDA.transform(dtm)

In [43]:
topic_results.shape

(78, 3)

In [44]:
topic_results[:,0]

array([8.31002527e-05, 2.09696796e-04, 3.55659410e-04, 2.98658572e-04,
       1.03168393e-04, 1.65264428e-04, 2.75245391e-04, 3.44253109e-04,
       2.82464051e-04, 2.59879916e-04, 1.55488971e-04, 1.19436832e-04,
       1.59976703e-04, 1.11198237e-04, 1.45366907e-04, 1.82802546e-04,
       1.85153187e-04, 3.06349803e-04, 3.84659135e-04, 1.70854862e-04,
       3.62686400e-04, 2.21751627e-04, 2.16784680e-04, 1.73084054e-04,
       1.32108443e-04, 2.98176739e-05, 1.52787958e-04, 1.96288188e-04,
       1.95861131e-04, 1.66686740e-04, 2.31193418e-04, 8.36207031e-05,
       1.06626196e-04, 3.69770174e-04, 1.14637624e-04, 1.74096401e-04,
       1.93886163e-04, 3.15627962e-04, 2.19135937e-04, 1.10084907e-04,
       8.59699628e-04, 1.56363508e-04, 2.98176739e-05, 1.51200197e-04,
       1.75373153e-04, 8.89719239e-04, 2.06433510e-04, 3.12378400e-04,
       2.86499779e-04, 2.15592048e-04, 1.48068275e-04, 1.91298284e-04,
       1.74505411e-04, 1.62154416e-04, 1.28630496e-04, 4.04018942e-04,
      

In [45]:
df['con_lda_0'] = topic_results[:,0]
df['con_lda_1'] = topic_results[:,1]
df['con_lda_2'] = topic_results[:,2]
#df['con_lda_3'] = topic_results[:,3]

In [46]:
topic_results[15].round(2) 

array([0., 1., 0.])

In [47]:
df['cons_cleaned'][15]

'work life balance is usually really great  sometimes it can be stressful when projects are due in a short amount of time but that is rare  interdepartmental communication could improve in some areas technical debt and legacy products that need to be maintained and carefully contemplated with next gen projects a few growing pains integrating sales teams and new products like most other tech companies growing at the same speed there is ever evolving change and need for process improvement be prepared to work hard be equally prepared to be rewarded for that hard work managing through legacy of a   year old company transforming a company means some decisions are better than others we are improving in our ability to learn and adjust and move emps have limited visibility to long term company vision sometimes work can be tough emotionally there are a lot of customers with some heavy missions some more so than others however blackbaud does do a good job encouraging employee participation and 

In [150]:
df['cons_cleaned'].head()

0    the most tenure someone has there that is not ...
1    dont expect to get paid well and there has be ...
2    long hours and high expectations are the norm ...
3    things are changing so fast within apple now t...
4    theres a general departure from the original n...
Name: cons_cleaned, dtype: object

In [48]:
topic_results[1].argmax()

1

In [49]:
df['Cons Topic'] = topic_results.argmax(axis=1)

In [50]:
df

Unnamed: 0,company,cons,Overall,Paid Time Off,Ability Telecommute,Flexible Hours,Social Environment,Coworkers,Wellness Initiatives,Mentorship,...,pros_negative,pros_positive,cons_cleaned,cons_lemmats,pros_cleaned,pros_lemmats,con_lda_0,con_lda_1,con_lda_2,Cons Topic
0,360Training,The most tenure someone has there that is not ...,1.9,2.2,2.8,3.0,2.4,2.0,1.8,1.2,...,0.040,0.246,the most tenure someone has there that is not ...,"[t, h, e, , m, o, s, t, , t, e, n, u, r, e, ...",there are no pros with the exception of a coup...,"[t, h, e, r, e, , a, r, e, , n, o, , p, r, ...",0.000083,0.999830,0.000087,1
1,3m,Modernization of lab buildings as they are vas...,3.3,3.7,3.0,3.2,2.7,3.4,3.7,3.0,...,0.020,0.403,modernization of lab buildings as they are vas...,"[m, o, d, e, r, n, i, z, a, t, i, o, n, , o, ...",collaboration come as you are diversity a cele...,"[c, o, l, l, a, b, o, r, a, t, i, o, n, , c, ...",0.000210,0.999574,0.000216,1
2,AMD,the usual cons - slow to execute I didn't feel...,3.6,4.0,3.6,4.0,3.3,3.9,3.4,2.7,...,0.015,0.410,the usual cons slow to execute i didnt feel ...,"[t, h, e, , u, s, u, a, l, , c, o, n, s, , ...",good engineering environment to work in as a c...,"[g, o, o, d, , e, n, g, i, n, e, e, r, i, n, ...",0.000356,0.999279,0.000365,1
3,AT&T,Lack of training in product. Customers suffer...,2.9,3.4,2.1,2.3,2.2,3.2,2.2,2.7,...,0.042,0.456,lack of training in product customers suffer b...,"[l, a, c, k, , o, f, , t, r, a, i, n, i, n, ...",good benefits and pay some company discounts g...,"[g, o, o, d, , b, e, n, e, f, i, t, s, , a, ...",0.000299,0.999393,0.000309,1
4,Accruent,Don't expect to get paid well and there has be...,3.2,4.6,4.0,4.0,3.9,3.8,3.6,3.2,...,0.025,0.316,dont expect to get paid well and there has bee...,"[d, o, n, t, , e, x, p, e, c, t, , t, o, , ...",the corporate culture is pretty good and the e...,"[t, h, e, , c, o, r, p, o, r, a, t, e, , c, ...",0.000103,0.999790,0.000107,1
5,Adobe,There are no cons to working at Adobe. No cons...,3.7,4.5,3.7,4.0,3.5,3.5,4.0,3.2,...,0.018,0.353,there are no cons to working at adobe no cons ...,"[t, h, e, r, e, , a, r, e, , n, o, , c, o, ...",adobe is a truly inspiring place to work the b...,"[a, d, o, b, e, , i, s, , a, , t, r, u, l, ...",0.000165,0.999666,0.000169,1
6,Amazon,Long hours and high expectations are the norm....,3.0,3.5,2.3,2.5,2.6,3.3,2.1,2.5,...,0.044,0.348,long hours and high expectations are the norm ...,"[l, o, n, g, , h, o, u, r, s, , a, n, d, , ...",fast promotion great culture locations worldwi...,"[f, a, s, t, , p, r, o, m, o, t, i, o, n, , ...",0.000275,0.999443,0.000282,1
7,Apple,things are changing so fast within Apple. Now ...,3.5,3.9,2.5,2.7,3.0,4.0,4.0,2.7,...,0.036,0.506,things are changing so fast within apple now d...,"[t, h, i, n, g, s, , a, r, e, , c, h, a, n, ...",the benefits and the flexibility apple is real...,"[t, h, e, , b, e, n, e, f, i, t, s, , a, n, ...",0.000344,0.999300,0.000355,1
8,Applied Materials,Horrible autocratic management and arbitrary t...,2.5,2.5,2.8,2.5,2.1,2.8,3.6,1.7,...,0.036,0.448,horrible autocratic management and arbitrary t...,"[h, o, r, r, i, b, l, e, , a, u, t, o, c, r, ...",salary nothing much else to say reasonable pay...,"[s, a, l, a, r, y, , n, o, t, h, i, n, g, , ...",0.000282,0.999429,0.000289,1
9,Arm Holdings,Nothing major that I have come across so far. ...,,,,,,,,,...,0.012,0.388,nothing major that i have come across so far d...,"[n, o, t, h, i, n, g, , m, a, j, o, r, , t, ...",only months in but i am finding people are f...,"[o, n, l, y, , , , m, o, n, t, h, s, , i, ...",0.000260,0.999475,0.000265,1


In [51]:
df['Cons Topic'] 

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    2
26    1
27    1
28    1
29    1
     ..
48    1
49    1
50    1
51    1
52    1
53    1
54    1
55    1
56    1
57    1
58    1
59    1
60    1
61    1
62    1
63    1
64    1
65    1
66    1
67    1
68    1
69    1
70    1
71    1
72    1
73    1
74    1
75    1
76    1
77    1
Name: Cons Topic, Length: 78, dtype: int64

## report for positive reviews

In [52]:
dtm = cv.fit_transform(df['pros_cleaned'])

In [62]:
LDA = LatentDirichletAllocation(n_components=3,random_state=42)

In [63]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [64]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')
    
#for individuals:
#topic 0 about culuture/environment
#topic 1 about worklife balance

#companies:
#topic 0 about atmosphere and culture
#topic 1 about flexibility worklife blanace
#topic 3 about company being good (leaderships and mission)

THE TOP 15 WORDS FOR TOPIC #0
['excellent', 'home', 'products', 'good work', 'learn', 'technology', 'growth', 'best', 'career', 'great benefits', 'flexible', 'work life balance', 'life balance', 'culture', 'work life']


THE TOP 15 WORDS FOR TOPIC #1
['life balance', 'snacks', 'work life', 'product', 'make', 'flexible', 'best', 'smart', 'office', 'amazing', 'lunch', 'food', 'perks', 'free', 'culture']


THE TOP 15 WORDS FOR TOPIC #2
['amazing', 'just', 'day', 'growth', 'product', 'leadership', 'snacks', 'sales', 'ive', 'hard', 'office', 'best', 'free', 'make', 'culture']




In [65]:
topic_results = LDA.transform(dtm)
topic_results[21].round(2) 

array([0.79, 0.  , 0.21])

In [66]:
df['pros_cleaned'][21]

'awesome co workers relaxed environment flexible schedule supportive management unbelievable benefits coffee machines cereal bars luxurious accommodations awesome jobpassionate peoplegreat benefitslove the products that we put out especially fifa great transparency from executives great team to work with nice co workers to your face generally ok atmosphere great benefits great people building a wide array of different types of gaming experiences plenty of room for learning and growth its a video game companyyou meet some great peoplephysical environment is pleasant awesome company great benefits i like my boss   good people good environment flexible hours you know what youre working on and its usually a big name title so you know people will know the game youve put your work into video games were awesome to play wonderful inspiring and talented people interesting challenges industry leading technical resources recognizable brand great way to learn industry standards   they give you gam

In [67]:
df['Pros Topic'] = topic_results.argmax(axis=1)

In [68]:
df['pro_lda_0'] = topic_results[:,0]
df['pro_lda_1'] = topic_results[:,1]
df['pro_lda_2'] = topic_results[:,2]

In [69]:
df['Pros Topic'] 

0     0
1     0
2     0
3     0
4     2
5     0
6     0
7     0
8     0
9     0
10    1
11    2
12    0
13    2
14    0
15    0
16    2
17    0
18    0
19    1
20    0
21    0
22    0
23    1
24    0
25    2
26    2
27    0
28    0
29    1
     ..
48    0
49    2
50    2
51    0
52    2
53    2
54    2
55    0
56    0
57    1
58    0
59    0
60    0
61    1
62    0
63    1
64    2
65    2
66    1
67    2
68    0
69    0
70    1
71    0
72    0
73    2
74    0
75    0
76    1
77    0
Name: Pros Topic, Length: 78, dtype: int64

In [70]:
df.columns

Index(['company', 'cons', 'Overall', 'Paid Time Off', 'Ability Telecommute',
       'Flexible Hours', 'Social Environment', 'Coworkers',
       'Wellness Initiatives', 'Mentorship', 'Maternity Leave',
       'Family Growth Support', 'Employer Responsiveness',
       'Learning Opportunities', 'Management Opportunities',
       'Equal Opportunities', 'Salary Satisfaction',
       'Female Representation in Leadership', 'Number of Ratings', 'pros',
       'advice_to_management', 'overall_rating', 'Career Opportunities',
       'Compensation and Benefits', 'Culture & Values', 'Work/Life Balance',
       'con_scores', 'cons_compound', 'cons_negative', 'cons_positive',
       'pro_scores', 'pros_compound', 'pros_negative', 'pros_positive',
       'cons_cleaned', 'cons_lemmats', 'pros_cleaned', 'pros_lemmats',
       'con_lda_0', 'con_lda_1', 'con_lda_2', 'Cons Topic', 'Pros Topic',
       'pro_lda_0', 'pro_lda_1', 'pro_lda_2'],
      dtype='object')

In [71]:
#df.to_csv('Data/1collapsed_allcompanies_wTopics.csv')
df.to_pickle("./1collapsed_allcompanies_wTopics.pkl")