In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
from imageio import imread, imwrite
from PIL import Image
from scipy.misc import imsave
import matplotlib.pyplot as plt

stopwords = set(STOPWORDS) 

# Load Data File

In [2]:
datafile = 'employee_reviews_clean.csv'
dataset = pd.read_csv(datafile)
dataset.shape

(58844, 21)

In [110]:
with open('positive-words.txt') as infile:
    word_pos = [l[:-1] for l in infile if not l.startswith(';')]
with open('negative-words.txt',encoding = "ISO-8859-1") as infile:
    word_neg = [l[:-1] for l in infile if not l.startswith(';')]    
print(len(word_pos))
print(len(word_neg))

2007
4784


# Split Data (company-wise and rating-wise)

In [3]:
# Company wise data split
facebook = dataset[dataset['company'] == 'facebook']
netflix = dataset[dataset['company'] == 'netflix']
google = dataset[dataset['company'] == 'google']
apple = dataset[dataset['company'] == 'apple']
amazon = dataset[dataset['company'] == 'amazon']
microsoft = dataset[dataset['company'] == 'microsoft']
print('facebook: ', facebook.size)
print('netflix:  ', netflix.size)
print('google:   ', google.size)
print('apple:    ', apple.size)
print('amazon:   ', amazon.size)
print('microsoft:', microsoft.size)
print()

# Rating wise data split
five = dataset[dataset['overall-ratings'] == 5]
four = dataset[dataset['overall-ratings'] == 4]
three = dataset[dataset['overall-ratings'] == 3]
two = dataset[dataset['overall-ratings'] == 2]
one = dataset[dataset['overall-ratings'] == 1]
print('five:     ', five.size)
print('four:     ', four.size)
print('three:    ', three.size)
print('two:      ', two.size)
print('one:      ', one.size)

facebook:  27405
netflix:   10710
google:    141708
apple:     235767
amazon:    523551
microsoft: 296583

five:      438186
four:      408660
three:     226254
two:       89649
one:       72975


# Word Cloud Related Functions

In [154]:
def generate_word_frequency_dict(column,sw,n=2000):
    word_count = defaultdict(int)
    for r in column:
        if not isinstance(r,str): continue
        for w in r.split():
            word_count[w] += 1
    counts = [(word_count[w],w) for w in word_count]
    counts.sort()
    counts.reverse()
    words = {}
    for x in counts[:n]:
        if x[1] in sw or x[1].isdigit(): continue
        words[x[1]] = x[0]
    #words = [x[1] for x in counts[:2000]]
    return words

In [123]:
def generate_setiment_wf_dict(wf_d,direction):
    words = {}
    for w in wf_d:
        if w in direction:
            words[w] = wf_d[w]
    return words

In [275]:
def generate_word_cloud(d,mask,font,bgc,fc,outf):
        
    wordcloud = WordCloud(background_color=bgc,
                          mask=mask,
                          stopwords=STOPWORDS,
                          scale=4,
                          #max_font_size=200,
                          #min_font_size=5,
                          #max_words=100,
                          width=2000,
                          height=2000,
                          font_path=font)
    wordcloud.generate_from_frequencies(d)
    wordcloud.recolor(color_func=fc)
    wordcloud.to_file(outf)
    
def generate_word_cloud_default(d,outf,bgc='black'):
    wordcloud = WordCloud(background_color=bgc, scale=4)
    wordcloud.generate_from_frequencies(d)
    wordcloud.to_file(outf)

# Facebook

In [125]:
# Facebook utility
fb_sw = ['facebook','fb','lot','lots','also','thing','things','company','really']
fb_dir = 'fb/'
fb_mask = imread(fb_dir+"fb.png")
fb_color = ImageColorGenerator(imread(fb_dir+'fb_color.png'))
fb_font = fb_dir+'facebook-letter-faces.ttf'

In [126]:
# Facebook pros
wf_pros_facebook = generate_word_frequency_dict(facebook['pros_clean'],fb_sw)
swf_pros_facebook = generate_setiment_wf_dict(wf_pros_facebook,word_pos)
fb_pros_outf = fb_dir+'wc_pros_fb.png'
generate_word_cloud(swf_pros_facebook,fb_mask,fb_font,"white",fb_color,fb_pros_outf)

In [127]:
# Facebook cons
wf_cons_facebook = generate_word_frequency_dict(facebook['cons_clean'],fb_sw)
swf_cons_facebook = generate_setiment_wf_dict(wf_cons_facebook,word_neg)
fb_cons_outf = fb_dir+'wc_cons_fb.png'
generate_word_cloud(swf_cons_facebook,fb_mask,fb_font,"white",fb_color,fb_cons_outf)

In [134]:
# Facebook summary
wf_sum_facebook = generate_word_frequency_dict(facebook['summary_clean'],fb_sw)
fb_sum_outf = fb_dir+'wc_summary_fb.png'
generate_word_cloud(wf_sum_facebook,fb_mask,fb_font,"white",fb_color,fb_sum_outf)

# Netflix

In [161]:
# Netflix utility
nf_sw = ['netflix','company','cons']
nf_dir = 'netflix/'
nf_mask = imread(nf_dir+'netflix.png')
nf_color = ImageColorGenerator(imread(nf_dir+'netflix_color.png'))
nf_font = nf_dir+'netflix_font.otf'

In [162]:
# Netflix pros
wf_pros_netflix = generate_word_frequency_dict(netflix['pros_clean'], nf_sw)
swf_pros_netflix = generate_setiment_wf_dict(wf_pros_netflix,word_pos)
nf_pros_outf = nf_dir+'wc_pros_netflix.png'
generate_word_cloud(swf_pros_netflix,nf_mask,nf_font,"black",nf_color,nf_pros_outf)

In [163]:
# Netflix cons
wf_cons_netflix = generate_word_frequency_dict(netflix['cons_clean'], nf_sw,5000)
swf_cons_netflix = generate_setiment_wf_dict(wf_cons_netflix,word_neg)
nf_cons_outf = nf_dir+'wc_cons_netflix.png'
generate_word_cloud(swf_cons_netflix,nf_mask,nf_font,"black",nf_color,nf_cons_outf)

In [167]:
# Netflix summary
wf_sum_netflix = generate_word_frequency_dict(netflix['summary_clean'], nf_sw)
#swf_sum_netflix = generate_setiment_wf_dict(wf_sum_netflix,word_neg+word_pos)
nf_sum_outf = nf_dir+'wc_summary_netflix.png'
generate_word_cloud(wf_sum_netflix,nf_mask,nf_font,"black",nf_color,nf_sum_outf)

# Google

In [193]:
# Google utility
gg_sw = ['google','company','really','lot','lots','cons']
gg_dir = 'google/'
gg_mask = imread(gg_dir+'google.png')
gg_color = ImageColorGenerator(imread(gg_dir+'google_color.png'))
gg_font = gg_dir+'GoogleSans-Regular.ttf'

In [191]:
# Google pros
wf_pros_google = generate_word_frequency_dict(google['pros_clean'], gg_sw)
#swf_pros_google = generate_setiment_wf_dict(wf_pros_google,word_pos)
gg_pros_outf = gg_dir+'wc_pros_google.png'
generate_word_cloud(wf_pros_google,gg_mask,gg_font,"white",gg_color,gg_pros_outf)

In [194]:
# Google cons
wf_cons_google = generate_word_frequency_dict(google['cons_clean'], gg_sw)
swf_cons_google = generate_setiment_wf_dict(wf_cons_google,word_neg)
gg_cons_outf = gg_dir+'wc_cons_google.png'
generate_word_cloud(swf_cons_google,gg_mask,gg_font,"white",gg_color,gg_cons_outf)

In [195]:
# Google Summary
wf_sum_google = generate_word_frequency_dict(google['summary_clean'], gg_sw)
gg_sum_outf = gg_dir+'wc_summary_google.png'
generate_word_cloud(wf_sum_google,gg_mask,gg_font,"white",gg_color,gg_sum_outf)

# Apple

In [214]:
# Apple utility
ap_sw = ['apple','company','pros','cons','really','lot','lots']
ap_dir = 'apple/'
ap_mask = imread(ap_dir+'apple.png')
ap_color = ImageColorGenerator(imread(ap_dir+'apple_color.png'))
ap_font = ap_dir+'SF-Pro-Display-Regular.otf'

In [215]:
# Apple pros
wf_pros_apple = generate_word_frequency_dict(apple['pros_clean'], ap_sw)
#swf_pros_apple = generate_setiment_wf_dict(wf_pros_apple,word_pos)
ap_pros_outf = ap_dir+'wc_pros_apple.png'
generate_word_cloud(wf_pros_apple,ap_mask,ap_font,"white",ap_color,ap_pros_outf)

In [216]:
# Apple cons
wf_cons_apple = generate_word_frequency_dict(apple['cons_clean'], ap_sw)
swf_cons_apple = generate_setiment_wf_dict(wf_cons_apple,word_neg)
ap_cons_outf = ap_dir+'wc_cons_apple.png'
generate_word_cloud(swf_cons_apple,ap_mask,ap_font,"white",ap_color,ap_cons_outf)

In [217]:
# Apple summary
wf_sum_apple = generate_word_frequency_dict(apple['summary_clean'], ap_sw)
ap_sum_outf = ap_dir+'wc_summary_apple.png'
generate_word_cloud(wf_sum_apple,ap_mask,ap_font,"white",ap_color,ap_sum_outf)

# Amazon

In [218]:
# Amazon utility
am_sw = ['amazon','company','pros','cons','really','lot','lots']
am_dir = 'amazon/'
am_mask = imread(am_dir+'amazon.png')
am_color = ImageColorGenerator(imread(am_dir+'amazon_color.png'))
am_font = am_dir+'AmazonEmber_Bd.ttf'

In [220]:
# Amazon pros
wf_pros_amazon = generate_word_frequency_dict(amazon['pros_clean'], am_sw)
swf_pros_amazon = generate_setiment_wf_dict(wf_pros_amazon,word_pos)
am_pros_outf = am_dir+'wc_pros_amazon.png'
generate_word_cloud(swf_pros_amazon,am_mask,am_font,"black",am_color,am_pros_outf)

In [221]:
# Amazon cons
wf_cons_amazon = generate_word_frequency_dict(amazon['cons_clean'], am_sw)
swf_cons_amazon = generate_setiment_wf_dict(wf_cons_amazon,word_neg)
am_cons_outf = am_dir+'wc_cons_amazon.png'
generate_word_cloud(swf_cons_amazon,am_mask,am_font,"black",am_color,am_cons_outf)

In [222]:
# Amazon summary
wf_sum_amazon = generate_word_frequency_dict(amazon['summary_clean'], am_sw)
am_sum_outf = am_dir+'wc_summary_amazon.png'
generate_word_cloud(wf_sum_amazon,am_mask,am_font,"black",am_color,am_sum_outf)

# Microsoft

In [254]:
# Microsoft utility
ms_sw = ['microsoft','company','pros','cons','really','lot','lots']
ms_dir = 'microsoft/'
ms_mask = imread(ms_dir+'microsoft.png')
ms_color = ImageColorGenerator(imread(ms_dir+'microsoft_color4.png'))
ms_font = ms_dir+'Microsoft_Logo.ttf'

In [255]:
# Microsoft pros
wf_pros_microsoft = generate_word_frequency_dict(microsoft['pros_clean'], ms_sw)
swf_pros_microsoft = generate_setiment_wf_dict(wf_pros_microsoft,word_pos)
ms_pros_outf = ms_dir+'wc_pros_microsoft.png'
generate_word_cloud(swf_pros_microsoft,ms_mask,ms_font,"black",ms_color,ms_pros_outf)

In [262]:
# Microsoft cons
wf_cons_microsoft = generate_word_frequency_dict(microsoft['cons_clean'], ms_sw)
swf_cons_microsoft = generate_setiment_wf_dict(wf_cons_microsoft,word_neg)
ms_cons_outf = ms_dir+'wc_cons_microsoft.png'
generate_word_cloud(swf_cons_microsoft,ms_mask,ms_font,"black",ms_color,ms_cons_outf)

In [259]:
# Microsoft summary
wf_sum_microsoft = generate_word_frequency_dict(microsoft['summary_clean'], ms_sw)
ms_sum_outf = ms_dir+'wc_summary_microsoft.png'
generate_word_cloud(wf_sum_microsoft,ms_mask,ms_font,"black",ms_color,ms_sum_outf)

# Ratings

In [280]:
# Rating
rating_based_dir = 'rating/'
r_sw = ['company','amazon','netflix','google','microsoft','apple','really','lot','lots','work']
# 4 and 5 stars rating
wf_good = generate_word_frequency_dict(list(five['summary_clean']) + list(four['summary_clean']), r_sw+word_neg)
#swf_good = generate_word_frequency_dict(wf_good,word_pos)
good_outf = rating_based_dir + 'wc_good_ratings.png'
generate_word_cloud_default(wf_good,good_outf,'white')
# 1 and 2 stars rating
wf_bad = generate_word_frequency_dict(list(one['summary_clean']) + list(two['summary_clean']), r_sw+word_pos)
#swf_bad = generate_word_frequency_dict(wf_bad,word_neg)
bad_outf = rating_based_dir + 'wc_bad_ratings.png'
generate_word_cloud_default(wf_bad,bad_outf)

### TODO: Refactor codes to be more efficient
### TODO: Some word clouds use setiment and some do not (need double check for word cloud quality)