# Analyzing Glassdoor Data for Visualizations

### This next section finds the mean and median of the numerical columns so we can see if there are significant differences between "Current" and "Former" employees, and whether that changes by year

In [1]:
# We're importing a little known library known as "Pandas" in this section
import pandas as pd

In [2]:
# Load the raw Glassdoor reviews from a CSV file.
# The path is relative to the notebook's working directory (two levels up).
glassdoor_data = "../../glassdoor_reviews.csv"
glassdoor_df = pd.read_csv(glassdoor_data)

# Preview the first five rows to check the load worked and see the columns
glassdoor_df.head()

Unnamed: 0,firm,date_review,job_title,current,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook,headline,pros,cons
0,AFH-Wealth-Management,2015-04-05,,Current Employee,,2,4.0,3.0,,2.0,3.0,3.0,x,o,r,"Young colleagues, poor micro management",Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication."
1,AFH-Wealth-Management,2015-12-11,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",2,3.0,1.0,,2.0,1.0,4.0,x,o,r,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...
2,AFH-Wealth-Management,2016-01-28,Office Administrator,"Current Employee, less than 1 year","Bromsgrove, England, England",1,1.0,1.0,,1.0,1.0,1.0,x,o,x,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very..."
3,AFH-Wealth-Management,2016-04-16,,Current Employee,,5,2.0,3.0,,2.0,2.0,3.0,x,o,r,Over promised under delivered,Nice staff to work with,No career progression and salary is poor
4,AFH-Wealth-Management,2016-04-23,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",1,2.0,1.0,,2.0,1.0,1.0,x,o,x,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr..."


In [3]:
# Report how many distinct firms appear in the data, then list each one
firm_column = glassdoor_df["firm"]

print("The number of companies in this dataset is:")
print(firm_column.nunique())
print("-----")
print("They are:")
for firm_name in firm_column.unique():
    print(f"   {firm_name}")

The number of companies in this dataset is:
428
-----
They are:
   AFH-Wealth-Management
   AJ-Bell
   ALDI
   AQA
   ASDA
   ASOS
   AXA-UK
   Abcam
   Abertawe-Bro-Morgannwg-University-Health-Board
   Accenture
   Accor
   Achieving-for-Children
   ActionCOACH
   Active-Care-Group
   Adecco
   Age-UK-The-National-Charity
   AlixPartners
   American-Express
   Amey
   Angard-Staffing
   Anglian-Water
   Anglo-American
   Animal-and-Plant-Health-Agency
   Aon
   Apple
   Arcadia
   Arnold-Clark
   AstraZeneca
   Aviva
   B-and-M-Retail
   B-and-Q
   BAT
   BBC
   BDO
   BHS
   BIS
   BNP-Paribas
   BNY-Mellon
   BP
   BPP-Holdings
   BT
   Babcock-International-Group
   Babylon-Health
   Bain-and-Company
   Balfour-Beatty
   Bannatyne-Group
   Barchester-Healthcare
   Barclays
   Barnardo-s
   Barnet-and-Chase-Farm-Hospitals-NHS-Trust
   Barnett-Waddingham
   Barratt-Developments
   Barts-Health-NHS-Trust
   BayWa-r-e-renewable-energy
   Bayer
   Best-Western
   Betsi-Cadwaladr-Univers

In [4]:
# List every distinct label found in the "current" column
# (employment status, optionally followed by a length of service)
employee_types = glassdoor_df["current"].unique()
for employee_type in employee_types:
    print(employee_type)

Current Employee
Current Employee, more than 1 year
Current Employee, less than 1 year
Former Employee
Current Employee, more than 5 years
Former Employee, more than 1 year
Former Employee, more than 3 years
Former Employee, more than 5 years
Current Employee, more than 3 years
Current Employee, more than 8 years
Former Employee, less than 1 year
Former Employee, more than 8 years
Current Employee, more than 10 years
Former Employee, more than 10 years
Former Contractor, less than 1 year
Former Intern, less than 1 year
Current Contractor, less than 1 year
Former Contractor
Former Intern, more than 1 year
Current Contractor
Former Intern
Current Intern, less than 1 year
Current Contractor, more than 1 year
Former Contractor, more than 1 year
Former Contractor, more than 8 years
Former Temporary Employee
KEY NOT FOUND: jobLine.per_diem-former
Current Freelancer, more than 3 years
KEY NOT FOUND: jobLine.temporary-former


In [5]:
# The "current" labels we want to compare: regular employees only
# (contractors, interns, freelancers, temporary staff and the
# "KEY NOT FOUND" labels are left out)
employees = [
    "Current Employee",
    "Current Employee, more than 1 year",
    "Current Employee, less than 1 year",
    "Former Employee",
    "Current Employee, more than 5 years",
    "Former Employee, more than 1 year",
    "Former Employee, more than 3 years",
    "Former Employee, more than 5 years",
    "Current Employee, more than 3 years",
    "Current Employee, more than 8 years",
    "Former Employee, less than 1 year",
    "Former Employee, more than 8 years",
    "Current Employee, more than 10 years",
    "Former Employee, more than 10 years",
]

In [6]:
# Keep only the reviews whose "current" label is one of the employee types
# chosen above, and renumber the rows from zero
is_selected_employee = glassdoor_df["current"].isin(employees)
glassdoor_employees_df = glassdoor_df.loc[is_selected_employee].reset_index(drop=True)
glassdoor_employees_df

Unnamed: 0,firm,date_review,job_title,current,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook,headline,pros,cons
0,AFH-Wealth-Management,2015-04-05,,Current Employee,,2,4.0,3.0,,2.0,3.0,3.0,x,o,r,"Young colleagues, poor micro management",Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication."
1,AFH-Wealth-Management,2015-12-11,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",2,3.0,1.0,,2.0,1.0,4.0,x,o,r,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...
2,AFH-Wealth-Management,2016-01-28,Office Administrator,"Current Employee, less than 1 year","Bromsgrove, England, England",1,1.0,1.0,,1.0,1.0,1.0,x,o,x,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very..."
3,AFH-Wealth-Management,2016-04-16,,Current Employee,,5,2.0,3.0,,2.0,2.0,3.0,x,o,r,Over promised under delivered,Nice staff to work with,No career progression and salary is poor
4,AFH-Wealth-Management,2016-04-23,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",1,2.0,1.0,,2.0,1.0,1.0,x,o,x,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
838529,the-LEGO-Group,2021-06-02,Marketing Manager,"Current Employee, more than 5 years","München, Bavaria, Bavaria",5,4.0,5.0,4.0,4.0,4.0,4.0,v,v,v,Just an awesome company to work for!!!,"Great company values, awesome product, smart c...",Not very easy to transfer to other locations
838530,the-LEGO-Group,2021-06-03,Sales Associate,"Current Employee, less than 1 year","London, England, England",3,,,,,,,o,o,o,working at lego,staff discount is really nice,micro managing is a hassle\r\ncan become menta...
838531,the-LEGO-Group,2021-06-03,Strategist,Current Employee,,4,5.0,5.0,5.0,3.0,5.0,3.0,v,o,o,not interested in growing their people,loved brand for a lot of people,you can spend 6-10 years without any promotion...
838532,the-LEGO-Group,2021-06-04,Customer Service Representative,"Current Employee, less than 1 year",,5,,,,,,,o,o,o,Great Place to Work,"Good wages, good hours, lots of resources","Working every other weekend, busy seasons can ..."


In [7]:
# List the column names of the filtered DataFrame before reshaping it
glassdoor_employees_df.columns

Index(['firm', 'date_review', 'job_title', 'current', 'location',
       'overall_rating', 'work_life_balance', 'culture_values',
       'diversity_inclusion', 'career_opp', 'comp_benefits', 'senior_mgmt',
       'recommend', 'ceo_approv', 'outlook', 'headline', 'pros', 'cons'],
      dtype='object')

In [216]:
# Split the "current" column at its first comma into two new columns:
#   "Current/Former" - the text before the comma (e.g. "Current Employee")
#   "Length"         - the text after the comma (e.g. " more than 1 year");
#                      this keeps the space that followed the comma, and is
#                      missing for labels that have no comma at all.
# n=1 is passed by keyword because recent pandas versions no longer accept it
# positionally.
# This cell needs the "current" column, so it has to run before the cell that
# reorders the columns (which leaves "current" out); re-running it afterwards
# raises KeyError: 'current'.
status_and_length = glassdoor_employees_df["current"].str.split(",", n=1, expand=True)
glassdoor_employees_df[["Current/Former", "Length"]] = status_and_length
glassdoor_employees_df

KeyError: 'current'

In [9]:
# Remove the recommendation / CEO approval / outlook flags and the review
# headline, which are not used in the rest of this analysis
columns_to_drop = ["recommend", "ceo_approv", "outlook", "headline"]
glassdoor_employees_df = glassdoor_employees_df.drop(columns=columns_to_drop)
glassdoor_employees_df

Unnamed: 0,firm,date_review,job_title,current,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,pros,cons,Current/Former,Length
0,AFH-Wealth-Management,2015-04-05,,Current Employee,,2,4.0,3.0,,2.0,3.0,3.0,Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication.",Current Employee,
1,AFH-Wealth-Management,2015-12-11,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",2,3.0,1.0,,2.0,1.0,4.0,"Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...,Current Employee,more than 1 year
2,AFH-Wealth-Management,2016-01-28,Office Administrator,"Current Employee, less than 1 year","Bromsgrove, England, England",1,1.0,1.0,,1.0,1.0,1.0,Easy to get the job even without experience in...,"Very low salary, poor working conditions, very...",Current Employee,less than 1 year
3,AFH-Wealth-Management,2016-04-16,,Current Employee,,5,2.0,3.0,,2.0,2.0,3.0,Nice staff to work with,No career progression and salary is poor,Current Employee,
4,AFH-Wealth-Management,2016-04-23,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",1,2.0,1.0,,2.0,1.0,1.0,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr...",Current Employee,more than 1 year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
838529,the-LEGO-Group,2021-06-02,Marketing Manager,"Current Employee, more than 5 years","München, Bavaria, Bavaria",5,4.0,5.0,4.0,4.0,4.0,4.0,"Great company values, awesome product, smart c...",Not very easy to transfer to other locations,Current Employee,more than 5 years
838530,the-LEGO-Group,2021-06-03,Sales Associate,"Current Employee, less than 1 year","London, England, England",3,,,,,,,staff discount is really nice,micro managing is a hassle\r\ncan become menta...,Current Employee,less than 1 year
838531,the-LEGO-Group,2021-06-03,Strategist,Current Employee,,4,5.0,5.0,5.0,3.0,5.0,3.0,loved brand for a lot of people,you can spend 6-10 years without any promotion...,Current Employee,
838532,the-LEGO-Group,2021-06-04,Customer Service Representative,"Current Employee, less than 1 year",,5,,,,,,,"Good wages, good hours, lots of resources","Working every other weekend, busy seasons can ...",Current Employee,less than 1 year


In [10]:
# Put the two new employee-type columns first and keep the remaining columns
# in their existing order. The original "current" column is not in the list,
# so it is dropped here.
column_order = [
    "Current/Former",
    "Length",
    "firm",
    "date_review",
    "job_title",
    "location",
    "overall_rating",
    "work_life_balance",
    "culture_values",
    "diversity_inclusion",
    "career_opp",
    "comp_benefits",
    "senior_mgmt",
    "pros",
    "cons",
]
glassdoor_employees_df = glassdoor_employees_df[column_order]
glassdoor_employees_df

Unnamed: 0,Current/Former,Length,firm,date_review,job_title,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,pros,cons
0,Current Employee,,AFH-Wealth-Management,2015-04-05,,,2,4.0,3.0,,2.0,3.0,3.0,Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication."
1,Current Employee,more than 1 year,AFH-Wealth-Management,2015-12-11,Office Administrator,"Bromsgrove, England, England",2,3.0,1.0,,2.0,1.0,4.0,"Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...
2,Current Employee,less than 1 year,AFH-Wealth-Management,2016-01-28,Office Administrator,"Bromsgrove, England, England",1,1.0,1.0,,1.0,1.0,1.0,Easy to get the job even without experience in...,"Very low salary, poor working conditions, very..."
3,Current Employee,,AFH-Wealth-Management,2016-04-16,,,5,2.0,3.0,,2.0,2.0,3.0,Nice staff to work with,No career progression and salary is poor
4,Current Employee,more than 1 year,AFH-Wealth-Management,2016-04-23,Office Administrator,"Bromsgrove, England, England",1,2.0,1.0,,2.0,1.0,1.0,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
838529,Current Employee,more than 5 years,the-LEGO-Group,2021-06-02,Marketing Manager,"München, Bavaria, Bavaria",5,4.0,5.0,4.0,4.0,4.0,4.0,"Great company values, awesome product, smart c...",Not very easy to transfer to other locations
838530,Current Employee,less than 1 year,the-LEGO-Group,2021-06-03,Sales Associate,"London, England, England",3,,,,,,,staff discount is really nice,micro managing is a hassle\r\ncan become menta...
838531,Current Employee,,the-LEGO-Group,2021-06-03,Strategist,,4,5.0,5.0,5.0,3.0,5.0,3.0,loved brand for a lot of people,you can spend 6-10 years without any promotion...
838532,Current Employee,less than 1 year,the-LEGO-Group,2021-06-04,Customer Service Representative,,5,,,,,,,"Good wages, good hours, lots of resources","Working every other weekend, busy seasons can ..."


In [11]:
# Confirm the new column order
glassdoor_employees_df.columns

Index(['Current/Former', 'Length', 'firm', 'date_review', 'job_title',
       'location', 'overall_rating', 'work_life_balance', 'culture_values',
       'diversity_inclusion', 'career_opp', 'comp_benefits', 'senior_mgmt',
       'pros', 'cons'],
      dtype='object')

In [12]:
# Find the mean of the numerical (rating) columns for each combination of
# current/former status and length of employment.
# numeric_only=True skips the text columns (firm, job_title, pros, cons, ...);
# without it, pandas 2.0 and later raise a TypeError when they try to average text.
# Reviews with no "Length" value are left out, because groupby drops missing keys by default.
grouped_glassdoor_employees_mean_df = (
    glassdoor_employees_df
    .groupby(["Current/Former", "Length"])
    .mean(numeric_only=True)
)

In [13]:
# Find the median of the numerical (rating) columns for each combination of
# current/former status and length of employment.
# numeric_only=True skips the text columns (firm, job_title, pros, cons, ...);
# without it, pandas 2.0 and later raise a TypeError when they try to take the median of text.
# Reviews with no "Length" value are left out, because groupby drops missing keys by default.
grouped_glassdoor_employees_median_df = (
    glassdoor_employees_df
    .groupby(["Current/Former", "Length"])
    .median(numeric_only=True)
)

In [14]:
# Write the grouped mean and median tables to csv so Tableau can read them
grouped_exports = (
    (grouped_glassdoor_employees_mean_df,
     '../Resources/Glassdoor_Grouped_Data/grouped_glassdoor_employees_mean.csv'),
    (grouped_glassdoor_employees_median_df,
     '../Resources/Glassdoor_Grouped_Data/grouped_glassdoor_employees_median.csv'),
)
for grouped_df, csv_path in grouped_exports:
    grouped_df.to_csv(csv_path)

## Sentiment Analysis

### In this section, we use VADER (the sentiment analyzer included in the NLTK library) to perform sentiment analysis on the "pros" and "cons" columns of the Glassdoor data

We got a lot of help from gpt-4 on this, but I checked with Mudit and he said that we could include this and its results in our presentation, if we could explain what this section was doing. I'll try to heavily comment with that in mind.

In [15]:
# This is the Natural Language Toolkit, which is a library that works with
# human language data, which helps with "tokenization" (splitting up long
# pieces of text) and can also perform sentiment analysis (determining the 
# positivity or negativity of a particular word, phrase, sentence, etc.)

!pip install nltk



In [27]:
#Import the library
import nltk

#These three downloads are needed by the sentiment and word-count cells below

# VADER is the Valence Aware Dictionary and sEntiment Reasoner, which is the
# sentiment analysis part of this library; this downloads its word lexicon
nltk.download('vader_lexicon')

# Punkt is the tokenizer data, which is what lets NLTK split content into
# sentences, and sentences into words, even with natural language
nltk.download('punkt')

# Finally, stopwords is a list of very common words like "the", "and", "of" etc.
# that we filter out before counting words; we also add our own extra words
# to that list later on
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to C:\Users\Jill
[nltk_data]     Brammah\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Jill
[nltk_data]     Brammah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jill
[nltk_data]     Brammah\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
# Take the first 50,000 rows of data just for speed.
# .copy() makes the trial set an independent DataFrame rather than a slice of
# glassdoor_employees_df, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
glassdoor_employees_trial = glassdoor_employees_df.head(50000).copy()

In [18]:
#Imports
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Initialize the VADER sentiment analyzer (created once and reused for every review)
sia = SentimentIntensityAnalyzer()

# Define a function to calculate sentiment scores using VADER
def get_sentiment_score(sentence):
    """Return VADER's overall ("compound") sentiment score for one piece of text.

    Parameters
    ----------
    sentence : str
        The review text to score.

    Returns
    -------
    float
        The 'compound' value from VADER's polarity scores. If ``sentence`` is
        not a string (for example a missing review, which pandas stores as
        NaN), NaN is returned so the row is ignored when averaging instead of
        crashing the analyzer.
    """
    if not isinstance(sentence, str):
        return float("nan")
    sentiment = sia.polarity_scores(sentence)
    return sentiment['compound']

# The way VADER works is it has a lexicon containing words and phrases
# and associated sentiment scores. These include "good" and "excellent" which
# would be positively scored, with excellent scoring higher.
# Negative words like "bad" and "terrible" are scored lower, again scaled.
# VADER also takes into account "very bad", "bad!!!" or "BAD" as worse than "bad"
# and then combines everything in a text into a single "compound" score.

# Score the 'pros' and 'cons' text of every review and store the results as
# two new columns. .assign() builds a new DataFrame instead of writing into
# the existing one, which avoids pandas' SettingWithCopyWarning.
glassdoor_employees_trial = glassdoor_employees_trial.assign(
    pros_sentiment=glassdoor_employees_trial['pros'].apply(get_sentiment_score),
    cons_sentiment=glassdoor_employees_trial['cons'].apply(get_sentiment_score),
)

# Calculate average sentiment scores for each category of employee
sentiment_by_employee_type = glassdoor_employees_trial.groupby(["Current/Former", "Length"])
average_pros_sentiment = sentiment_by_employee_type['pros_sentiment'].mean()
average_cons_sentiment = sentiment_by_employee_type['cons_sentiment'].mean()

# Print average sentiment scores for each category of employee
print("Average Pros Sentiment:")
print(average_pros_sentiment)
print("-----")
print("Average Cons Sentiment:")
print(average_cons_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Average Pros Sentiment:
Current/Former    Length             
Current Employee   less than 1 year      0.649604
                   more than 1 year      0.662763
                   more than 10 years    0.619920
                   more than 3 years     0.664745
                   more than 5 years     0.664894
                   more than 8 years     0.648868
Former Employee    less than 1 year      0.578217
                   more than 1 year      0.636131
                   more than 10 years    0.656512
                   more than 3 years     0.652349
                   more than 5 years     0.654939
                   more than 8 years     0.636694
Name: pros_sentiment, dtype: float64

Average Cons Sentiment:
Current/Former    Length             
Current Employee   less than 1 year     -0.033445
                   more than 1 year     -0.034794
                   more than 10 years    0.022438
                   more than 3 years    -0.014610
                   more than 5 years  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
# Stack the average pros scores on top of the average cons scores
# NOTE(review): stacked this way, nothing in the result says whether a row is
# a pros or a cons average other than its position (pros rows first, then
# cons rows). Confirm that is what the Tableau workbook expects; otherwise
# pd.concat(..., axis=1) would give one labelled column per measure.
sentiment_series_to_stack = [average_pros_sentiment, average_cons_sentiment]
glassdoor_pros_and_cons_sentiment_df = pd.concat(sentiment_series_to_stack)
glassdoor_pros_and_cons_sentiment_df

Current/Former    Length             
Current Employee   less than 1 year      0.649604
                   more than 1 year      0.662763
                   more than 10 years    0.619920
                   more than 3 years     0.664745
                   more than 5 years     0.664894
                   more than 8 years     0.648868
Former Employee    less than 1 year      0.578217
                   more than 1 year      0.636131
                   more than 10 years    0.656512
                   more than 3 years     0.652349
                   more than 5 years     0.654939
                   more than 8 years     0.636694
Current Employee   less than 1 year     -0.033445
                   more than 1 year     -0.034794
                   more than 10 years    0.022438
                   more than 3 years    -0.014610
                   more than 5 years    -0.010306
                   more than 8 years    -0.002733
Former Employee    less than 1 year     -0.092643
            

In [21]:
# Write the combined sentiment averages out as a csv file
sentiment_csv_path = '../Resources/Glassdoor_Grouped_Data/glassdoor_pros_and_cons_sentiment.csv'
glassdoor_pros_and_cons_sentiment_df.to_csv(sentiment_csv_path)

In [201]:
# Extra words to remove on top of NLTK's English stop words.
# (These were words that were showing up in the top fifty pros and cons
# that we felt had no bearing on the overall reasons people might leave
# or stay.)
# Every entry must be followed by a comma: two neighbouring strings without a
# comma between them are silently joined into one word by Python.
_CUSTOM_WORDS_TO_REMOVE = frozenset([
    "great", "good", "amazing", "decent", "positive", "better", "fantastic", "retail", "cool",
    "nice", "awesome",
    "bad", "pros", "cons", "poor", "hard", "difficult",
    "work", "working", "worked", "job", "jobs", "company", "apple", "bit", "business", "industry",
    "very", "extremely", "anything", "nothing", "really", "little", "lack", "always", "higher",
    "long", "lot", "lots", "best", "low", "sometimes", "many", "well", "never", "life",
    "none", "less", "big", "around", "much", "high",
    "get", "even", "would", "one", "want", "new", "like", "made", "done", "make", "could",
    "part", "take", "issues", "us", "give", "ever", "given", "time", "times", "store", "place", "day",
])

# Build the full stop-word set once, instead of on every call: the function
# below is applied to every review, so rebuilding it each time is wasted work.
_STOP_WORDS = frozenset(stopwords.words('english')) | _CUSTOM_WORDS_TO_REMOVE


# Define a function for tokenization and text cleaning
def clean_and_tokenize(text):
    """Split a piece of review text into a list of meaningful lower-case words.

    Parameters
    ----------
    text : str
        The review text to clean.

    Returns
    -------
    list of str
        The words of ``text`` in their original order, lower-cased, keeping
        only purely alphabetic tokens (no punctuation or numbers) that are not
        stop words. An empty list is returned when ``text`` is not a string
        (for example a missing review, which pandas stores as NaN).
    """
    if not isinstance(text, str):
        return []

    #set everything to lower case before splitting into words
    tokens = word_tokenize(text.lower())

    # isalpha means this only includes words, not random punctuation or numbers, etc.
    # and we leave out anything that is in the stop-word set
    clean_tokens = [token for token in tokens if token.isalpha() and token not in _STOP_WORDS]
    return clean_tokens

In [202]:
# Clean and tokenize every 'pros' entry, then store the word lists in a new column
pros_tokens = glassdoor_employees_trial['pros'].apply(clean_and_tokenize)
glassdoor_employees_trial['pros_cleaned'] = pros_tokens

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [203]:
#Look at the results (one list of cleaned words per review)
glassdoor_employees_trial['pros_cleaned']

0        [friendly, welcoming, staff, easy, going, ethic]
1                         [friendly, helpful, colleagues]
2                    [easy, without, experience, finance]
3                                                 [staff]
4                                      [easy, colleagues]
                               ...                       
49995                [management, staff, super, friendly]
49996                           [pay, benefits, training]
49997           [commission, structure, earning, ability]
49998                            [pay, think, complement]
49999                            [pay, think, complement]
Name: pros_cleaned, Length: 50000, dtype: object

In [204]:
# Tally how often each cleaned word appears across all of the 'pros' lists
pros_word_count = Counter()
for review_words in glassdoor_employees_trial['pros_cleaned']:
    pros_word_count.update(review_words)

In [205]:
# Show the heading followed by the full pros word tally
print("Word Count in Pros:", pros_word_count, sep="\n")

Word Count in Pros:


In [206]:
# Clean and tokenize every 'cons' entry, then store the word lists in a new column
cons_tokens = glassdoor_employees_trial['cons'].apply(clean_and_tokenize)
glassdoor_employees_trial['cons_cleaned'] = cons_tokens

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [207]:
# Tally how often each cleaned word appears across all of the 'cons' lists
cons_word_count = Counter()
for review_words in glassdoor_employees_trial['cons_cleaned']:
    cons_word_count.update(review_words)

In [208]:
# Show the heading followed by the full cons word tally
print("Word Count in Cons:", cons_word_count, sep="\n")


Word Count in Cons:


In [209]:
# Turn each word tally into a two-column DataFrame (one row per word)
word_count_columns = ['word', 'count']
pros_word_count_df = pd.DataFrame(list(pros_word_count.items()), columns=word_count_columns)
cons_word_count_df = pd.DataFrame(list(cons_word_count.items()), columns=word_count_columns)

In [210]:
# Order both word-count tables from the most to the least frequent word
pros_word_count_df = pros_word_count_df.sort_values('count', ascending=False)
cons_word_count_df = cons_word_count_df.sort_values('count', ascending=False)

In [211]:
#Take a look at the five most frequent words in the pros
pros_word_count_df.head()

Unnamed: 0,word,count
65,benefits,10794
11,people,10385
391,pay,6045
97,environment,5488
26,culture,4372


In [212]:
#Save only the top fifteen for eventual plotting, renumbering the rows from zero
top_fifteen_pros_word_count_df = pros_word_count_df.iloc[:15].reset_index(drop=True)
top_fifteen_cons_word_count_df = cons_word_count_df.iloc[:15].reset_index(drop=True)

In [213]:
# Display the most frequent words in the pros
top_fifteen_pros_word_count_df

Unnamed: 0,word,count
0,benefits,10794
1,people,10385
2,pay,6045
3,environment,5488
4,culture,4372
5,opportunities,3727
6,balance,3674
7,team,3539
8,friendly,2881
9,flexible,2855


In [214]:
# Display the most frequent words in the cons
top_fifteen_cons_word_count_df

Unnamed: 0,word,count
0,management,7716
1,hours,5440
2,people,5136
3,pay,4390
4,employees,3436
5,managers,2962
6,balance,2745
7,customers,2591
8,staff,2100
9,team,2093


In [215]:
# Write the top pros and cons word counts to csv files (without the row index)
word_count_exports = (
    (top_fifteen_pros_word_count_df,
     "../Resources/Glassdoor_Grouped_Data/glassdoor_pros_word_count.csv"),
    (top_fifteen_cons_word_count_df,
     "../Resources/Glassdoor_Grouped_Data/glassdoor_cons_word_count.csv"),
)
for word_count_df, csv_path in word_count_exports:
    word_count_df.to_csv(csv_path, index=False)