# Preparation: library loading

In [1]:
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from scipy.misc import imread

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.pipeline import Pipeline
from sklearn import base

from joblib import dump, load
import pickle
import dill

In [3]:
# NLP toolkits: NLTK supplies the stopword list and the tokenizers, spaCy the model
# whose Doc.similarity() is used by the recommendation step.
import nltk
import re

# Fetch the NLTK resources read below (stopword corpus and the tokenizer data).
nltk.download('stopwords');
nltk.download('punkt');

import spacy
# NOTE(review): 'en' is a spaCy shortcut name; presumably it has to become a full
# package name such as 'en_core_web_sm' on spaCy >= 3 -- confirm against the installed version.
nlp = spacy.load('en');

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Seed the stopword list with NLTK's English set, then append filler tokens and
# a few company names that are also passed to the vectorizers as stop words.
stopwords = nltk.corpus.stopwords.words('english')
stopwords += ['the', 'i', 'you', 'a', 'c', 'slu', 'amazon', 'google', 'apple', 'microsoft', 'company']

In [5]:
def tokenization(text):
    '''
    Split one review comment into lowercase alphabetic word tokens, dropping stopwords.

    INPUT: a single user comment, eg. 'Work hard, have fun, make history. Be proud of our self, the job we are doing.'
    OUTPUT: tokenized comment (i.e, single words), eg. ['work','hard','fun','make','history',...]
            (words found in the module-level `stopwords` list, such as 'have', are removed)

    NOTE:
    nltk.sent_tokenize: this gives a list of sentences
    nltk.word_tokenize: this gives a list of word/punctuation tokens for one sentence
    '''
    # Snapshot the module-level list as a set so each membership test is O(1)
    # instead of a linear scan per word.
    blocked = set(stopwords)

    filtered_tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            token = word.lower()
            if token in blocked:
                continue
            # Keep purely alphabetic tokens only. The previous pattern '^[a-z|A-Z]+$'
            # put '|' inside the character class, so a bare '|' token slipped through.
            if re.fullmatch(r'[a-z]+', token):
                filtered_tokens.append(token)
    return filtered_tokens

### Dump & Load

In [121]:
# Dump
# Persist the keyword and topic dictionaries. The `with` blocks close each file
# even when pickling raises, which the manual open()/close() pairs did not.
with open("Keyword.pkl", "wb") as keyword:
    pickle.dump(KW_dict, keyword)

with open("Topic.pkl", "wb") as topic:
    pickle.dump(Topic_dict, topic)

In [128]:
# Persist the keyword dictionary of the hand-picked company subset; the file is
# closed by the context manager even if pickling fails.
with open("Keyword_s.pkl", "wb") as keyword_s:
    pickle.dump(KW_dict_s, keyword_s)

In [47]:
# Load
# NOTE: pickle.load can execute arbitrary code -- only load files written by this notebook.
# The context managers close the handles that `pickle.load(open(...))` left open.
with open("Keyword.pkl", "rb") as keyword_file:
    KW_dict = pickle.load(keyword_file)

with open("Topic.pkl", "rb") as topic_file:
    Topic_dict = pickle.load(topic_file)

In [48]:
# Spot-check: display the keyword string stored for one company.
KW_dict['google']

'amazing balance benefits best culture environment food free free food good great life nice opportunities pay people perks place salary smart smart people work working'

In [49]:
# Spot-check: display the topic list stored for one company.
Topic_dict['amazon']

['Topic 0: work culture',
 'Topic 1: good pay',
 'Topic 2: people get',
 'Topic 3: benefits great']

# Part 0: Data Import

In [8]:
# Read the combined review table and report its size.
df = pd.read_csv(r'./all_company.csv')
n_rows, n_cols = df.shape
print('The dimension of the table is ' + str(n_rows) + ' rows and ' + str(n_cols) + ' columns.')

The dimension of the table is 122249 rows and 18 columns.


In [29]:
# Sorted list of the distinct company names. size() yields the same group index as
# count() without aggregating every other column just to throw the result away.
company_list = df.groupby('Company').size().index.tolist()

In [30]:
# Treat every company name as a stopword as well, so tokenization() filters it out.
# NOTE(review): this mutates the shared list in place, so re-running the cell appends duplicates.
stopwords.extend(company_list)

In [31]:
# Number of distinct companies found in the 'Company' column.
len(company_list)

107

# Part 1: Violin Plots, Keywords, Word Clouds

In [122]:
def get_Violin(target_company, df):
    '''
    Draw a violin plot of one company's six rating columns and save it as a JPEG.

    Input: target_company => 'google' (matched exactly against df['Company'])
           df => review table holding 'Company' and the rating_* columns renamed below
    Output: None; the plot is written to ./ViolinPlots/<target_company>_Violin.jpg
    '''
    # Raw column name -> label shown on the x axis (insertion order = plotting order).
    rating_labels = {'rating_overall': 'Overall', 'rating_balance': 'Work-Life Balance',
                     'rating_culture': 'Culture', 'rating_career': 'Career',
                     'rating_comp': 'Compensation', 'rating_mgmt': 'Management'}

    # filter dataframe
    df_t = df.loc[df['Company'] == target_company].rename(columns=rating_labels)

    # Keep applying the seaborn theme / figure size the plots were produced with.
    sns.set(rc={'figure.figsize': (18, 6)})

    # Violin plot on an explicit figure, so nothing depends on whatever the "current"
    # pyplot figure happens to be and the figure is always released afterwards.
    fig, ax = plt.subplots(figsize=(18, 6))
    try:
        sns.violinplot(data=df_t[list(rating_labels.values())], palette='Set3', ax=ax)
        ax.set_title(target_company.capitalize() + ' Ratings Plot', fontsize=30)
        ax.set_xlabel('Ratings', fontsize=20)
        ax.set_ylabel('Stars', fontsize=20)

        # savefig does not create missing directories, so make sure the target exists.
        out_dir = Path('./ViolinPlots')
        out_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(str(out_dir / '{}_Violin.jpg'.format(target_company)))
    finally:
        plt.close(fig)
    return



In [123]:
# Save one violin plot per company.
for company in company_list:
    get_Violin(company, df)

<Figure size 1296x432 with 0 Axes>

In [115]:
# Keyword extraction (feeds KW_dict)
def get_keyword(target_company, df):
    '''
    Return the TF-IDF vocabulary of one company's review comments as a single string.

    Input: target_company => 'google' (matched exactly against df['Company'])
           df => review table; the comment text is read from the column at position 9
    Output: text_pros, the selected unigram/bigram features joined by single spaces
    '''
    # filter dataframe
    df_t = df.loc[df['Company'] == target_company]

    # Pull the comment column in one slice instead of building a Series per row.
    # NOTE(review): position 9 is presumably the "pros" text column -- confirm against the CSV schema.
    # Missing comments are dropped because tokenization() expects strings.
    pros = df_t.iloc[:, 9].dropna().astype(str).tolist()

    tfidf_model = TfidfVectorizer(max_df=0.75, max_features=100,
                                 min_df=0.05, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenization, ngram_range=(1,2))
    # Only the fitted vocabulary is needed; the TF-IDF matrix itself was never used.
    tfidf_model.fit(pros)

    # get_feature_names() is gone from recent scikit-learn releases; use its
    # replacement when the installed version provides it.
    if hasattr(tfidf_model, 'get_feature_names_out'):
        feature_pros = tfidf_model.get_feature_names_out()
    else:
        feature_pros = tfidf_model.get_feature_names()
    text_pros = ' '.join(feature_pros)

    return text_pros

'\nKW_dict = {}\nfor i in range(len(company_list)):\n    KW_dict[company_list[i]]=get_keyword(company_list[i],df)\n'

In [None]:
# Keyword string for every company in the dataset.
KW_dict = {}
for company in company_list:
    KW_dict[company] = get_keyword(company, df)

In [127]:
# Keyword strings for a hand-picked subset of companies.
company_list_s = ['google','amazon','point72','microsoft','apple', 'accenture', 'airbnb','altria',
               'boeing','broadcom','capitalone','citi','comcast','deloitte','dupont','facebook','honeywell','intel',
                'lyft','netflix','nike','pfizer','target','uber','visa','verizon','linkedin','bayer','chevron']
KW_dict_s = {}
for company in company_list_s:
    KW_dict_s[company] = get_keyword(company, df)

In [None]:
# WordClouds
# Render one word cloud per company, shaped by ./Masks/mask_<company>.png when that
# file exists and by the generic ./Masks/mask.png otherwise. Only the mask path
# differs between the two cases, so the rendering code is written once.
# NOTE(review): `imread` comes from scipy.misc, which newer SciPy releases no longer
# ship -- presumably this import needs a replacement there; confirm the pinned version.
for company, keywords in KW_dict.items():
    company_mask = Path(r'./Masks/mask_{}.png'.format(company))
    mask_path = company_mask if company_mask.is_file() else Path(r'./Masks/mask.png')
    mask = imread(str(mask_path))
    wc = WordCloud(mask=mask, background_color="white", scale=2, repeat=True,
                   colormap='viridis').generate(keywords)
    wc.to_file(r'./WordClouds/{}_wordcloud.jpg'.format(company))

# Part 2: Topic Modelling

In [33]:
def get_topic(target_company, df):
    '''
    Fit a 4-topic LDA pipeline on one company's review comments.

    Input: target_company => 'google' (matched exactly against df['Company'])
           df => review table; the comment text is read from the column at position 9
    Output: (topics, model)
            topics => list of 4 strings like 'Topic 0: work culture' (top 2 terms per topic)
            model  => the fitted Pipeline (GetProsCmt -> CountVectorizer -> LatentDirichletAllocation)
    '''
    class ReviewTransformer(base.BaseEstimator, base.TransformerMixin):
        '''Select one company's rows and return its comments as a list of strings.'''

        def __init__(self, target_company):
            # Stored under the constructor argument's own name: BaseEstimator's
            # get_params()/clone()/repr look the value up by that name.
            self.target_company = target_company

        def fit(self, X, y=None):
            # This transformer doesn't need to learn anything about the data,
            # so it can just return self without any further processing
            return self

        def transform(self, X):
            # Returns a plain list of comment strings for the selected company
            # (one entry per non-missing review), not an array aligned with X.
            X_t = X.loc[X['Company'] == self.target_company]
            # NOTE(review): position 9 is presumably the "pros" text column -- confirm.
            # Missing comments are dropped because tokenization() expects strings.
            return X_t.iloc[:, 9].dropna().astype(str).tolist()

    CntVec = CountVectorizer(max_df=0.75, min_df=0.05, max_features=100,
                             stop_words=stopwords, tokenizer=tokenization, ngram_range=(1,2))

    LDA = LatentDirichletAllocation(n_components=4, max_iter=5, learning_method='online',
                                    learning_offset=50.,random_state=0)

    lda_est = Pipeline([('GetProsCmt', ReviewTransformer(target_company)),
                        ('CountVectorizer', CntVec),
                        ('LatentDirichletAllocation', LDA)])

    model = lda_est.fit(df)

    def display_topics(model, feature_names, no_top_words):
        # For each topic, list the `no_top_words` highest-weighted terms, best first.
        res = []
        for topic_idx, topic in enumerate(model.components_):
            top_term_ids = topic.argsort()[:-no_top_words - 1:-1]
            topic_words = " ".join([feature_names[i] for i in top_term_ids])
            res.append("Topic {}: {}".format(topic_idx, topic_words))
        return res

    # get_feature_names() is gone from recent scikit-learn releases; use its
    # replacement when the installed version provides it.
    vectorizer = lda_est.named_steps['CountVectorizer']
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    topics = display_topics(lda_est.named_steps['LatentDirichletAllocation'], feature_names, 2)
    return topics,model

In [37]:
# Imported here so the cell also works on a fresh kernel (the notebook only imports
# pyLDAvis further down).
import pyLDAvis.sklearn

target_company = 'google'
Topic, model = get_topic(target_company, df)
with open(r'./TopicModel/{}_TopicModel.joblib'.format(target_company), 'wb') as model_file:
    dill.dump(model, model_file)

# The pipeline returned by get_topic is already fitted on df, so reuse its stages
# instead of refitting each of them a second time on the same data.
lda_est = model
step1 = lda_est.named_steps['GetProsCmt'].transform(df)
step2 = lda_est.named_steps['CountVectorizer'].transform(step1)
step3 = lda_est.named_steps['LatentDirichletAllocation']
topic_plot = pyLDAvis.sklearn.prepare(step3, step2, lda_est.named_steps['CountVectorizer'])
pyLDAvis.save_html(topic_plot, r'./LDA_plot/{}.html'.format(target_company))

In [39]:
# Imported here so the cell also works on a fresh kernel (the notebook only imports
# pyLDAvis further down).
import pyLDAvis.sklearn

# Fit, save and visualise one topic model per company.
Topic_dict = {}
for target_company in company_list:
    Topic_dict[target_company], model = get_topic(target_company, df)
    with open(r'./TopicModel/{}_TopicModel.joblib'.format(target_company), 'wb') as model_file:
        dill.dump(model, model_file)

    # The pipeline is already fitted on df; reuse its stages rather than refitting them.
    lda_est = model
    step1 = lda_est.named_steps['GetProsCmt'].transform(df)
    step2 = lda_est.named_steps['CountVectorizer'].transform(step1)
    step3 = lda_est.named_steps['LatentDirichletAllocation']
    topic_plot = pyLDAvis.sklearn.prepare(step3, step2, lda_est.named_steps['CountVectorizer'])
    pyLDAvis.save_html(topic_plot, r'./LDA_plot/{}.html'.format(target_company))
    

In [None]:
Topic_dict['amazon']  # a list of 'Topic N: <terms>' strings

### Save Model (pipeline)

In [19]:
# Serialise the fitted pipeline; the context manager closes the file handle that
# the bare open() call left dangling.
with open(r'./TopicModel/{}_TopicModel.joblib'.format(target_company), 'wb') as model_file:
    dill.dump(model, model_file)

In [20]:
# NOTE: dill.load, like pickle.load, can execute arbitrary code -- only load model
# files produced by this notebook. The context manager closes the file handle.
with open(r'./TopicModel/{}_TopicModel.joblib'.format(target_company), 'rb') as model_file:
    lda_est = dill.load(model_file)

### Display Plot

In [21]:
# pyLDAvis builds the topic visualisation used in the following cells.
import pyLDAvis.sklearn
# NOTE(review): a __future__ import placed after another statement is rejected by plain
# Python modules; presumably this only works because notebook cells are compiled
# statement by statement -- confirm before moving this code into a .py file.
from __future__ import print_function
pyLDAvis.enable_notebook()

In [22]:
# Step 1: run the pipeline's 'GetProsCmt' stage on df to get the comments it selects.
step1 = lda_est.named_steps['GetProsCmt'].fit_transform(df)

In [23]:
# Step 2: refit the pipeline's CountVectorizer on those comments and vectorise them.
step2 = lda_est.named_steps['CountVectorizer'].fit_transform(step1)

In [24]:
# Step 3: refit the pipeline's LDA stage on the vectorised comments.
step3 = lda_est.named_steps['LatentDirichletAllocation'].fit(step2)

In [25]:
# Build the pyLDAvis visualisation from the LDA model, the vectorised comments and
# the vectorizer; the bare name on the last line displays it in the notebook.
topic_plot = pyLDAvis.sklearn.prepare(step3, step2, lda_est.named_steps['CountVectorizer'])
topic_plot

In [26]:
# Write the visualisation to ./LDA_plot/<target_company>.html.
pyLDAvis.save_html(topic_plot, r'./LDA_plot/{}.html'.format(target_company))

# Part 6: New Recommendation

In [119]:
def get_recommendation(user_prf, KW_dict, top_n=5):
    '''
    Rank companies by spaCy similarity between a free-text preference and each company's keywords.

    Input: user_prf => free-text preference, eg. 'free food'
           KW_dict  => {company name: keyword string}
           top_n    => maximum number of companies to return (default 5)
    Output: list of strings like '#1: Google', best match first; it is shorter than
            top_n when KW_dict holds fewer companies (previously this raised IndexError)
    '''
    # Drop stopwords from the preference text; a set makes each lookup O(1).
    blocked = set(stopwords)
    kept_words = [word for word in user_prf.split() if word.lower() not in blocked]
    doc0 = nlp(' '.join(kept_words))

    # Similarity of the preference to every company's keyword string.
    score_dict = {}
    for k, v in KW_dict.items():
        score_dict[k] = doc0.similarity(nlp(v))

    # sorted() is stable, so equally scored companies keep KW_dict's order.
    sorted_score = sorted(score_dict.items(), key=lambda kv: kv[1], reverse=True)

    rcm_company = []
    for rank, (company, _score) in enumerate(sorted_score[:top_n], start=1):
        rcm_company.append('#' + str(rank) + ': ' + str(company.capitalize()))

    return rcm_company

In [120]:
# Ask for a free-text preference and show the best-matching companies.
user_preference = input('Your Preference:')
rcm_company = get_recommendation(user_preference, KW_dict_s)
rcm_company

Your Preference:free food


['#1: Mcdonald', '#2: Google', '#3: Netflix', '#4: Facebook', '#5: Point72']