# Lift Calculations

## Import Libraries

In [1]:
import pandas as pd

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\conol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import spacy
from nltk.util import ngrams
from collections import Counter
nlp=spacy.load('en_core_web_sm')

In [5]:
import re
import numpy as np
import pandas as pd
from pprint import pprint


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



In [6]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

## Import Data

In [7]:
reviews_df = pd.read_csv('all_reviews.csv')
reviews_df

Unnamed: 0.1,Unnamed: 0,company,pros,cons
0,0,Uber,"Hours, pay, management and environment",I wouldn’t say I had any cons working at this job
1,1,Uber,Work your very own schedule,Sometimes the pay can be unbalanced
2,2,Uber,"Uber stock, good benefits, good experience","Bad commission structure, limited upside"
3,3,Uber,Decent pay for the work you do.,The customer service department doesnt take th...
4,4,Uber,Smart people\r\nLots of resources\r\nGreat com...,Fast-moving org - sometimes a little too fast ...
...,...,...,...,...
9663,5894,"VMware, Inc.",Was a great place to make good money and be pa...,Dell has had too heavy of an influence on the ...
9664,5895,"VMware, Inc.","work life balance, good to work with managers",easy to get some fantastic experience
9665,5896,"VMware, Inc.",The culture and values of this company makes i...,No cons to share best company to become a part...
9666,5897,"VMware, Inc.",Great campus and several onsite gyms,Hard to find affordable housing near campus.


## Data Prep

In [8]:
#Tokenize comment
reviews_df['con_token']=reviews_df['cons'].map(lambda x:nlp(x))
reviews_df['pro_token']=reviews_df['pros'].map(lambda x:nlp(x))

In [9]:
#frequency count of single words
pro=dict()
for row in range(len(reviews_df)):
    text=reviews_df['pro_token'].iloc[row]
    for word in text:
        if word.is_punct:
            continue
        
        if word.text.lower() in pro:
            pro[word.text.lower()]+=1
        else:
            pro[word.text.lower()]=1


In [10]:
con=dict()
for row in range(len(reviews_df)):
    text=reviews_df['pro_token'].iloc[row]
    for word in text:
        if word.is_punct:
            continue
        
        if word.text.lower() in con:
            con[word.text.lower()]+=1
        else:
            con[word.text.lower()]=1

In [11]:
# STEP 2: create dataframe from dictionary
con_count = pd.DataFrame.from_dict(con, orient = 'index')
con_count.columns=['count']
con_count=con_count.sort_values(by='count',ascending=False)

# STEP 2: create dataframe from dictionary
pro_count = pd.DataFrame.from_dict(con, orient = 'index')
pro_count.columns=['count']
pro_count=pro_count.sort_values(by='count',ascending=False)

In [13]:
con_count.to_csv('con_count.csv')
pro_count.to_csv('pro_count.csv')

## Lift

In [15]:
diversitypro = ('diveristy', 'inclusive','diverse')
environmentpro = ('room','space','location','food','environment','office', 'perks', 'technoolgy', 'food', 'tech','environment')
culturepro = ('positive','helpful','family','atmosphere','community','coworkers','company', 'people', 'culture', 'team','fun','care','friendly','colleagues','supportive','support','remote','values','cares','mission','collaborative','competitive','staff','caring')
worklifepro = ('vacation','life','balance','flexible','hours','flexibility','schedule','pto')
paypro = ('money','401k','insurance','pay', 'salary','compensation','stock','benefits','bonuses','options')
careergrowthpro = ('impact','opportunity','growth','opportunity','career','grow','growing','research')
leadershippro = ('vision','leaders','leadership','management','managers','ceo','manager')
personaldevpro = ('skills','driven','training','resources','success','learning','projects','experience','learn','challenging','resources','development')

In [18]:
def replace(comment):
    new_comment = []
    for word in comment:
        if word.is_punct:
            continue
        elif word.is_stop:
            continue
        elif word.text.lower() in diversitypro:
            word='diversity'
            new_comment.append(word)
        elif word.text.lower() in environmentpro:
            word='environment'
            new_comment.append(word)
        elif word.text.lower() in culturepro:
            word='culture'
            new_comment.append(word)
        elif word.text.lower() in worklifepro:
            word='worklife'
            new_comment.append(word)
        elif word.text.lower() in paypro:
            word='pay'
            new_comment.append(word)
        elif word.text.lower() in careergrowthpro:
            word='growth'
            new_comment.append(word)
        elif word.text.lower() in leadershippro:
            word='leadership'
            new_comment.append(word)
        elif word.text.lower() in personaldevpro:
            word='personaldevelopment'
            new_comment.append(word)
        else: 
            word = word.text.lower()
            new_comment.append(word)
        
    return(' '.join(new_comment))

reviews_df['pro_replace']=reviews_df['pro_token'].map(replace)
reviews_df['con_replace']=reviews_df['con_token'].map(replace)

In [19]:
featuresnames = ('diversity','environment','culture','worklife','pay','growth','leadership','personaldevelopment')

In [21]:
firm = pd.DataFrame(reviews_df['company'].value_counts())
firmname = firm.index

In [25]:
#Tokenize comment
reviews_df['con_replace_token']=reviews_df['con_replace'].map(lambda x:nlp(x))
reviews_df['pro_replace_token']=reviews_df['pro_replace'].map(lambda x:nlp(x))

In [30]:
pro_attributes = []


for row in range(len(reviews_df)):
    text=reviews_df['pro_replace_token'].iloc[row]
    attributelist = list()
    for word in text:
        if word.is_punct:
            continue
        if word.text in featuresnames:
            attributelist.append(word.text)
    pro_attributes.append(attributelist)
pro_attributes

[['worklife', 'pay', 'leadership', 'environment'],
 ['worklife'],
 ['pay', 'pay', 'personaldevelopment'],
 ['pay'],
 ['culture', 'personaldevelopment', 'pay', 'environment', 'culture'],
 ['pay'],
 ['culture',
  'growth',
  'growth',
  'culture',
  'culture',
  'personaldevelopment',
  'culture',
  'culture',
  'pay',
  'environment'],
 ['pay', 'culture'],
 [],
 ['worklife', 'pay'],
 ['pay'],
 ['worklife', 'worklife', 'worklife', 'worklife'],
 ['culture', 'culture'],
 ['pay'],
 [],
 ['culture', 'environment'],
 ['worklife'],
 ['worklife', 'worklife', 'pay'],
 ['growth'],
 ['worklife'],
 ['pay', 'growth', 'culture'],
 ['pay'],
 ['pay'],
 [],
 ['culture', 'pay', 'culture'],
 ['personaldevelopment', 'pay', 'culture', 'growth', 'culture'],
 ['culture', 'pay'],
 ['pay'],
 ['worklife',
  'worklife',
  'worklife',
  'worklife',
  'pay',
  'pay',
  'pay',
  'culture',
  'culture',
  'culture',
  'personaldevelopment'],
 ['pay', 'worklife', 'culture'],
 ['worklife', 'worklife'],
 ['culture', 'gr

In [31]:
con_attributes = []


for row in range(len(reviews_df)):
    text=reviews_df['con_replace_token'].iloc[row]
    attributelist = list()
    for word in text:
        if word.is_punct:
            continue
        if word.text in featuresnames:
            attributelist.append(word.text)
    con_attributes.append(attributelist)
con_attributes

[[],
 ['pay'],
 [],
 [],
 [],
 ['worklife'],
 ['culture', 'culture', 'culture', 'personaldevelopment'],
 ['pay', 'culture'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['pay', 'pay'],
 [],
 [],
 ['culture'],
 ['culture'],
 [],
 ['culture'],
 [],
 [],
 [],
 ['environment', 'culture'],
 ['leadership', 'worklife'],
 ['pay'],
 ['leadership', 'personaldevelopment'],
 [],
 ['worklife', 'worklife'],
 [],
 ['culture'],
 ['pay', 'pay'],
 [],
 [],
 [],
 ['pay', 'culture'],
 ['pay'],
 ['pay'],
 ['leadership'],
 ['worklife'],
 [],
 ['leadership'],
 ['pay'],
 ['leadership'],
 ['growth', 'culture', 'culture', 'culture', 'culture'],
 ['culture', 'culture', 'personaldevelopment'],
 ['culture'],
 ['pay', 'culture'],
 [],
 ['culture', 'worklife', 'worklife', 'pay'],
 [],
 ['leadership'],
 ['worklife', 'pay'],
 [],
 ['culture'],
 [],
 ['worklife'],
 ['culture'],
 ['culture', 'culture', 'culture'],
 ['pay'],
 [],
 [],
 ['pay'],
 ['pay'],
 ['environment', 'growth'],
 [],
 [],
 ['pay'],
 ['pay'],
 [

In [32]:
reviews_df['pro_attributes'] = pro_attributes
reviews_df['con_attributes'] = con_attributes
features = reviews_df[reviews_df['pro_attributes'].map(lambda d:len(d))>0]
features = reviews_df[reviews_df['con_attributes'].map(lambda d:len(d))>0]
features

Unnamed: 0.1,Unnamed: 0,company,pros,cons,con_token,pro_token,pro_replace,con_replace,con_replace_token,pro_replace_token,attributes,pro_attributes,con_attributes
1,1,Uber,Work your very own schedule,Sometimes the pay can be unbalanced,"(Sometimes, the, pay, can, be, unbalanced)","(Work, your, very, own, schedule)",work worklife,pay unbalanced,"(pay, unbalanced)","(work, worklife)",[worklife],[worklife],[pay]
5,5,Uber,"You can make a sizable pay with Uber, that's i...","Driving can be stressful, so I would recommend...","(Driving, can, be, stressful, ,, so, I, would,...","(You, can, make, a, sizable, pay, with, Uber, ...",sizable pay uber willing work,driving stressful recommend divide driving tim...,"(driving, stressful, recommend, divide, drivin...","(sizable, pay, uber, willing, work)",[pay],[pay],[worklife]
6,6,Uber,I love working at Uber (corporate)! I think so...,Looks like we're going to head into hybrid wor...,"(Looks, like, we, 're, going, to, head, into, ...","(I, love, working, at, Uber, (, corporate, ), ...",love working uber corporate think culture nerv...,looks like going head hybrid work 50 think cul...,"(looks, like, going, head, hybrid, work, 50, t...","(love, working, uber, corporate, think, cultur...","[culture, growth, growth, culture, culture, pe...","[culture, growth, growth, culture, culture, pe...","[culture, culture, culture, personaldevelopment]"
7,7,Uber,Money and time and fun and,Pay and people and time and,"(Pay, and, people, and, time, and)","(Money, and, time, and, fun, and)",pay time culture,pay culture time,"(pay, culture, time)","(pay, time, culture)","[pay, culture]","[pay, culture]","[pay, culture]"
18,18,Uber,Good wlb and growth cilture,Pay bad stock kept going down,"(Pay, bad, stock, kept, going, down)","(Good, wlb, and, growth, cilture)",good wlb growth cilture,pay bad pay kept going,"(pay, bad, pay, kept, going)","(good, wlb, growth, cilture)",[growth],[growth],"[pay, pay]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9660,5891,"VMware, Inc.",Very focused on individual growth. Sustainable...,You have to really want and enjoy the job of p...,"(You, have, to, really, want, and, enjoy, the,...","(Very, focused, on, individual, growth, ., Sus...",focused individual growth sustainable pace liv...,want enjoy job professional services mgmt spen...,"(want, enjoy, job, professional, services, mgm...","(focused, individual, growth, sustainable, pac...","[growth, personaldevelopment, culture, leaders...","[growth, personaldevelopment, culture, leaders...",[personaldevelopment]
9661,5892,"VMware, Inc.","wfh options, mentorship options, competitive p...",There is constantly changing leadership,"(There, is, constantly, changing, leadership)","(wfh, options, ,, mentorship, options, ,, comp...",wfh pay mentorship pay culture pay health pay,constantly changing leadership,"(constantly, changing, leadership)","(wfh, pay, mentorship, pay, culture, pay, heal...","[pay, pay, culture, pay, pay]","[pay, pay, culture, pay, pay]",[leadership]
9663,5894,"VMware, Inc.",Was a great place to make good money and be pa...,Dell has had too heavy of an influence on the ...,"(Dell, has, had, too, heavy, of, an, influence...","(Was, a, great, place, to, make, good, money, ...",great place good pay growth brand great leader...,dell heavy influence culture vmware goes start...,"(dell, heavy, influence, culture, vmware, goes...","(great, place, good, pay, growth, brand, great...","[pay, growth, leadership]","[pay, growth, leadership]",[culture]
9664,5895,"VMware, Inc.","work life balance, good to work with managers",easy to get some fantastic experience,"(easy, to, get, some, fantastic, experience)","(work, life, balance, ,, good, to, work, with,...",work worklife worklife good work leadership,easy fantastic personaldevelopment,"(easy, fantastic, personaldevelopment)","(work, worklife, worklife, good, work, leaders...","[worklife, worklife, leadership]","[worklife, worklife, leadership]",[personaldevelopment]


In [35]:
def count_firm_atb_pair(opinion,firm, attribute):
    count = 0
    for i in range(len(features)):
        if ((firm == features['company'].iloc[i]) and (attribute in features[opinion].iloc[i])):
            count += 1
    return count

In [36]:
def count_firm(firm):
    count = 0
    for i in range(len(reviews_df)):
        if (firm == reviews_df['company'].iloc[i]):
            count += 1
    return count

In [37]:
def count_attribute(opinion,attribute):
    count = 0
    for i in range(len(features)):
        if (attribute in features[opinion].iloc[i]):
            count += 1
    return count

In [38]:
def get_lift_atb (opinion, firm, attribute):
    numerator = len(reviews_df) * count_firm_atb_pair(opinion,firm, attribute)
    denominator = count_firm(firm) * count_attribute(opinion,attribute)
    return numerator / denominator 

In [39]:
pro_list = []
for i in range(len(firmname)):
    firm_a = firmname[i]
    i_list = []
    for j in range(len(featuresnames)):
        attribute = featuresnames[j]
        lift = get_lift_atb('pro_attributes',firm_a, attribute)
        i_list.append(lift)
    pro_list.append(i_list)

In [40]:
con_list = []
for i in range(len(firmname)):
    firm_a = firmname[i]
    i_list = []
    for j in range(len(featuresnames)):
        attribute = featuresnames[j]
        lift = get_lift_atb('con_attributes',firm_a, attribute)
        i_list.append(lift)
    con_list.append(i_list)

In [41]:
pro_lift = pd.DataFrame(pro_list)
pro_lift.index = firmname
pro_lift.columns = featuresnames
pro_lift

Unnamed: 0,diversity,environment,culture,worklife,pay,growth,leadership,personaldevelopment
Booz Allen Hamilton Inc.,0.000000,0.540112,1.000532,1.313024,0.957727,0.678116,0.968953,1.743553
"AirBnb, Inc.",0.878909,1.530317,1.029119,0.898385,0.856913,0.290621,0.861292,0.610243
The Broad Institute,1.757818,1.710354,1.257812,0.552852,1.159353,1.549980,0.861292,1.569197
National Aeronautics and Space Administration,0.439455,0.900186,0.543146,0.691065,0.352847,0.290621,0.215323,0.958954
Zynga,0.439455,1.890391,1.086292,0.552852,1.411387,0.968737,0.215323,0.871776
...,...,...,...,...,...,...,...,...
Ford Motor Company,0.878909,0.180037,0.628906,0.829278,0.907320,0.387495,0.000000,0.523066
General Electric Company,0.000000,1.800372,0.929066,0.345533,0.756100,0.484369,0.807461,1.307665
"Doma Holdings, Inc.",1.126807,0.692451,0.952888,0.708785,0.775487,1.490365,1.656330,1.117662
Verizon Communications Inc.,0.000000,0.900186,0.667021,1.382130,2.184289,0.645825,1.076615,1.452961


In [44]:
pro_lift.sort_values(by='diversity',ascending= False)

Unnamed: 0,diversity,environment,culture,worklife,pay,growth,leadership,personaldevelopment
"Affirm, Inc.",4.394545,1.980410,1.515092,0.829278,1.814640,1.840601,1.507261,1.830730
Unity Technologies,3.515636,1.170242,1.429332,0.967491,1.260167,2.034349,2.153229,1.220487
Intel Corporation,3.417980,0.400083,0.730547,0.998205,1.120148,1.184012,0.956991,1.065504
Johns Hopkins University Applied Physics Laboratory,3.076182,1.350279,0.829012,1.727663,1.562607,1.937475,0.322984,1.394842
Intuit Inc.,2.929697,1.600331,1.683435,0.614280,1.120148,3.121487,2.751349,2.227873
...,...,...,...,...,...,...,...,...
Fidelity Management and Research LLC,0.000000,0.450093,0.914772,0.898385,1.209760,1.065611,0.861292,0.610243
Cloud BigData Technologies Group,0.000000,0.090019,0.457386,0.207320,0.252033,0.193747,0.322984,0.523066
"Patagonia, Inc.",0.000000,1.080223,1.086292,0.552852,1.008133,0.000000,0.322984,0.435888
Southwest Airlines Co.,0.000000,0.720149,0.943359,1.243917,1.915454,0.871864,0.322984,0.348711


In [42]:
con_lift = pd.DataFrame(con_list)
con_lift.index = firmname
con_lift.columns = featuresnames
con_lift

Unnamed: 0,diversity,environment,culture,worklife,pay,growth,leadership,personaldevelopment
Booz Allen Hamilton Inc.,0.84807,1.139087,0.980925,0.645969,1.417029,0.878909,0.641859,1.289067
"AirBnb, Inc.",1.69614,1.139087,1.137873,0.322984,0.322052,0.781253,1.123253,1.189908
The Broad Institute,0.84807,1.423859,1.020162,0.968953,1.932312,1.953131,0.802324,2.181497
National Aeronautics and Space Administration,0.00000,1.993402,0.392370,0.430646,0.901746,0.585939,0.401162,1.289067
Zynga,1.69614,1.139087,1.608718,0.645969,0.322052,0.488283,1.845344,1.090749
...,...,...,...,...,...,...,...,...
Ford Motor Company,0.00000,1.139087,0.941688,1.291938,0.515283,0.585939,0.641859,0.396636
General Electric Company,0.00000,0.711929,0.980925,1.884076,1.127182,0.976566,1.002905,0.495795
"Doma Holdings, Inc.",0.00000,0.730184,0.804862,0.000000,0.330310,1.252007,0.411448,1.271269
Verizon Communications Inc.,0.00000,0.474620,1.307900,1.435486,1.073507,0.976566,2.139530,0.661060


In [45]:
con_lift.sort_values(by='diversity',ascending= False)

Unnamed: 0,diversity,environment,culture,worklife,pay,growth,leadership,personaldevelopment
Healthcare Catalyst,5.088421,1.423859,1.922614,1.184276,1.610260,1.464848,2.005809,2.379815
Paylocity Corporation,5.088421,0.854315,1.491006,2.045568,1.352618,1.660162,1.925577,2.379815
American Express Company,4.711501,1.582065,0.915530,0.956991,1.574476,0.651044,0.980618,0.661060
Fractal Analytics Inc,4.240351,1.067894,1.275203,1.211192,1.288208,1.586919,2.206390,1.983179
"Equinix, Inc.",4.240351,0.996701,1.177110,0.430646,0.450873,1.074222,1.925577,0.793272
...,...,...,...,...,...,...,...,...
"The MathWorks, Inc.",0.000000,1.708630,1.137873,0.430646,1.932312,1.367192,0.641859,1.189908
National Institutes of Health,0.000000,0.569543,0.549318,0.322984,0.966156,0.683596,0.401162,0.694113
Intuit Inc.,0.000000,0.791033,1.525884,1.315862,0.500970,1.302088,0.980618,1.322120
"Etsy, Inc.",0.000000,1.708630,1.883377,1.291938,1.030566,1.269535,2.246506,1.388226


In [49]:
firmname == 'SouthernCompany'

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False])