In [1]:
# import
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
import warnings
import streamlit as st
from keras.layers import Dense, Activation
from keras.models import Sequential
from tensorflow import keras
from sklearn.cluster import KMeans
# folium
import folium
import folium.plugins as plugins
from streamlit_folium import folium_static

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# NLP
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import string

# this line tells jupyter notebook to put the plots in the notebook rather than saving them to file
%matplotlib inline

# this line makes plots prettier on mac retina screens - if you don't have one it shouldn't do anything
%config InlineBackend.figure_format = 'retina'


# Style
sns.set_style(style = 'darkgrid')

In [2]:
df = pd.read_csv('reviews.csv')
df.drop(columns = ['Unnamed: 0'], axis = 1, inplace = True)

In [3]:
df.shape

(16920, 7)

In [4]:
df.drop_duplicates()

Unnamed: 0,Summary,Date,JobTitle,AuthorLocation,OverallRating,Pros,Cons
0,Amazing!,"Aug 12, 2021",Senior Sales Recruiter,"Aug 12, 2021 - Senior Sales Recruiter",5.0,"-Unparalleled team, benefits, and overall comp...","-Hyper growth always means a big work load, bu..."
1,Amazing Company,"Aug 6, 2022","LMTS, Software Engineering","Aug 6, 2022 - LMTS, Software Engineering",5.0,Work life balance\r\nRespect\r\nSalesforce car...,May not offer salary like FAANG companies.
2,What a disappointment...,"Nov 30, 2020",Account Executive- Core Team,"in San Francisco, CA",3.0,- Benefits are top notch\r\n- Perks in the tow...,"I came into Salesforce like every rep, excited..."
3,Great Company,"Aug 4, 2022",Operations Analyst,"Nov 30, 2020 - Account Executive- Core Team",5.0,Company is really great and the working cultur...,There are times promotions are political
4,Great Company Good Culture,"Aug 5, 2022",Lead Engineer/Technologist,"in San Francisco, CA",5.0,PTO\nSalary\nCulture\nCEO & Character,Project teams can be very silo'd which makes i...
...,...,...,...,...,...,...,...
13228,Differentiator = environment,"Aug 3, 2022",Regional Sales Manager,"in San Francisco, CA",5.0,There’s no room to be an jerk here. The enviro...,Experiencing growing pains transitioning from ...
13229,Amazing,"Aug 3, 2022",Account Executive,"Aug 5, 2022 - Lead Engineer/Technologist",5.0,"Love it here, great talent",Nothing bad about working here
13295,Solid,"Aug 5, 2022",Account Executive,"Aug 6, 2022 - LMTS, Software Engineering",5.0,"Good place to work, treat you well",Takes a long time to move up
13297,Great Company to Work For,"Aug 4, 2022",Account Director,"Aug 5, 2022 - Lead Engineer/Technologist",5.0,Sales Training \nCompensation \nWork-Life Bala...,Fast Pace Environment- Can be Challenging for ...


In [5]:
df.head()

Unnamed: 0,Summary,Date,JobTitle,AuthorLocation,OverallRating,Pros,Cons
0,Amazing!,"Aug 12, 2021",Senior Sales Recruiter,"Aug 12, 2021 - Senior Sales Recruiter",5.0,"-Unparalleled team, benefits, and overall comp...","-Hyper growth always means a big work load, bu..."
1,Amazing Company,"Aug 6, 2022","LMTS, Software Engineering","Aug 6, 2022 - LMTS, Software Engineering",5.0,Work life balance\r\nRespect\r\nSalesforce car...,May not offer salary like FAANG companies.
2,What a disappointment...,"Nov 30, 2020",Account Executive- Core Team,"in San Francisco, CA",3.0,- Benefits are top notch\r\n- Perks in the tow...,"I came into Salesforce like every rep, excited..."
3,Great Company,"Aug 4, 2022",Operations Analyst,"Nov 30, 2020 - Account Executive- Core Team",5.0,Company is really great and the working cultur...,There are times promotions are political
4,Great Company Good Culture,"Aug 5, 2022",Lead Engineer/Technologist,"in San Francisco, CA",5.0,PTO\nSalary\nCulture\nCEO & Character,Project teams can be very silo'd which makes i...


In [6]:
df.columns

Index(['Summary', 'Date', 'JobTitle', 'AuthorLocation', 'OverallRating',
       'Pros', 'Cons'],
      dtype='object')

In [7]:
df = df[['OverallRating','Pros']]
df

Unnamed: 0,OverallRating,Pros
0,5.0,"-Unparalleled team, benefits, and overall comp..."
1,5.0,Work life balance\r\nRespect\r\nSalesforce car...
2,3.0,- Benefits are top notch\r\n- Perks in the tow...
3,5.0,Company is really great and the working cultur...
4,5.0,PTO\nSalary\nCulture\nCEO & Character
...,...,...
16915,5.0,"Good place to work, treat you well"
16916,5.0,they were great to work with
16917,5.0,Sales Training \nCompensation \nWork-Life Bala...
16918,5.0,Company is really great and the working cultur...


# Pros

In [8]:
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 20}

plt.rc('font', **font)

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from textblob import TextBlob
from textblob import Word
import re,string,unicodedata

### Tokenization

In [10]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

### Removing special chars

In [11]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
df['Pros']=df['Pros'].apply(remove_special_characters)

### Text stemming 

In [12]:
# #Stemming the text
# def simple_stemmer(text):
#     ps=nltk.porter.PorterStemmer()
#     text= ' '.join([ps.stem(word) for word in text.split()])
#     return text
# #Apply function on review column
# df['Pros']=df['Pros'].apply(simple_stemmer)

### Text Lemmatizing

In [13]:
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]


#Apply function on review column
df['Pros']=df['Pros'].apply(lemmatize_text)


### Removing stopwords

In [14]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
df['Pros']=df['Pros'].apply(remove_stopwords)

{'doesn', 'myself', 'an', 'so', "isn't", 'hasn', 'here', 'i', 'not', 've', "that'll", 'them', 'was', 'under', "mightn't", 'no', "hadn't", 'been', "mustn't", 'of', 'had', 'mightn', 'same', 'wasn', 'how', 'why', 'aren', 'ma', 'him', 'he', 's', 'isn', 'or', 'from', 'once', 'that', 'only', 'his', 'further', 'other', 'her', "should've", 'did', "wouldn't", 'their', 'which', 'because', 'down', 'in', 'mustn', 'the', 'too', 'yours', "you'd", 'through', 'during', "aren't", 'our', 'are', 'there', 'just', 'when', 'shouldn', 'between', 'don', "shouldn't", "didn't", 'into', 'what', 'can', 'than', 'she', 'if', 'your', "weren't", 'some', 'any', "won't", 'this', 'needn', 'a', 'each', "shan't", 'itself', 'while', "it's", 'theirs', "hasn't", "wasn't", 'who', 'over', 'should', 'these', 'am', 'weren', 'most', 'both', 'they', 'haven', 'all', 'after', 'own', 'on', "doesn't", 'do', 'off', 'nor', 'yourself', "you're", 'll', 'at', 'himself', "you've", 'being', 'up', 'with', 'very', 'few', 'you', 'ours', 'about'

In [15]:
df.head()

Unnamed: 0,OverallRating,Pros
0,5.0,"[ ' Unparalleled ' , ' team ' , ' benefit ' , ..."
1,5.0,"[ ' Work ' , ' life ' , ' balance ' , ' Respec..."
2,3.0,"[ ' Benefits ' , ' ' , ' top ' , ' notch ' , '..."
3,5.0,"[ ' Company ' , ' ' , ' really ' , ' great ' ,..."
4,5.0,"[ ' PTO ' , ' Salary ' , ' Culture ' , ' CEO '..."


In [None]:
tf = TfidfVectorizer(stop_words='english', min_df=3)
tf.fit(df['Pros'])

pro_tf = tf.transform(df['Pros'])
pro_df = pd.DataFrame(pro_tf.todense(), columns=tf.get_feature_names())



fig, ax = plt.subplots(figsize = (30,30))
top_texts = pro_df.sum().sort_values(ascending=False)
top_texts.head(100).plot(kind='barh')
print(fig)

In [None]:
fig, ax = plt.subplots(1,1)
# Create and generate a word cloud image:
Cloud = WordCloud(width=1000, height=700,
                  background_color='black',
                  stopwords=stopwords,
                  min_font_size=3,
                  min_word_length=0).generate_from_frequencies(top_texts.head(100))

# background_color="white", max_words=50).generate_from_frequencies(top_texts)

# Display the generated image:
# plt.figure(figsize=(15,10))
plt.imshow(Cloud, interpolation='bilinear')
plt.axis("off")
print(fig)

### Bigram

In [None]:
tf = TfidfVectorizer(stop_words='english', min_df=3, ngram_range=(2,2))
tf.fit(df['Pros'])

pro_tf = tf.transform(df['Pros'])
pro_df = pd.DataFrame(pro_tf.todense(), columns=tf.get_feature_names())



fig, ax = plt.subplots(figsize = (30,30))
top_texts = pro_df.sum().sort_values(ascending=False)
top_texts.head(100).plot(kind='barh')
print(fig)

fig, ax = plt.subplots()
# Create and generate a word cloud image:
Cloud = WordCloud(width=500, height=400,
                  background_color='black',
                  stopwords=stopwords,
                  min_font_size=3,
                  min_word_length=0).generate_from_frequencies(top_texts)

# background_color="white", max_words=50).generate_from_frequencies(top_texts)

# Display the generated image:
plt.imshow(Cloud, interpolation='bilinear')
plt.axis("off")

print(fig)

In [None]:
tf = TfidfVectorizer(stop_words='english', min_df=3, ngram_range=(3,3))
tf.fit(df['Pros'])

pro_tf = tf.transform(df['Pros'])
pro_df = pd.DataFrame(pro_tf.todense(), columns=tf.get_feature_names())



fig, ax = plt.subplots(figsize = (30,30))
top_texts = pro_df.sum().sort_values(ascending=False)
top_texts.head(100).plot(kind='barh')
print(fig)

fig, ax = plt.subplots()
# Create and generate a word cloud image:
Cloud = WordCloud(width=500, height=400,
                  background_color='black',
                  stopwords=stopwords,
                  min_font_size=3,
                  min_word_length=0).generate_from_frequencies(top_texts)

# background_color="white", max_words=50).generate_from_frequencies(top_texts)

# Display the generated image:
plt.imshow(Cloud, interpolation='bilinear')
plt.axis("off")
print(fig)

In [None]:
df.rename(columns = {'Pros':'pros'}, inplace = True)

In [None]:
df.head()

In [None]:
df.to_csv('pros.csv')