In [1]:
# import necessary modules
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt


from IPython.display import display

### Load the data & preview the data

In [2]:
data = pd.read_csv('../data/raw/Copy of Ask A Manager Salary Survey 2021 (Responses) - Form Responses 1.csv')

In [3]:
print(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28061 entries, 0 to 28060
Data columns (total 18 columns):
 #   Column                                                                                                                                                                                                                                Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                                --------------  -----  
 0   Timestamp                                                                                                                                                                                                                             28061 non-null  object 
 1   How old are you?                                                                                             

Unnamed: 0,Timestamp,How old are you?,What industry do you work in?,Job title,"If your job title needs additional context, please clarify here:","What is your annual salary? (You'll indicate the currency in a later question. If you are part-time or hourly, please enter an annualized equivalent -- what you would earn if you worked the job 40 hours a week, 52 weeks a year.)","How much additional monetary compensation do you get, if any (for example, bonuses or overtime in an average year)? Please only include monetary compensation here, not the value of benefits.",Please indicate the currency,"If ""Other,"" please indicate the currency here:","If your income needs additional context, please provide it here:",What country do you work in?,"If you're in the U.S., what state do you work in?",What city do you work in?,How many years of professional work experience do you have overall?,How many years of professional work experience do you have in your field?,What is your highest level of education completed?,What is your gender?,What is your race? (Choose all that apply.)
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,,55000,0.0,USD,,,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,,54600,4000.0,GBP,,,United Kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,,34000,,USD,,,US,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,,62000,3000.0,USD,,,USA,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,,60000,7000.0,USD,,,US,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White


### Cleaning data:
- standardize column names
- Remove unnecessary columns
- convert columns to the appropriate data type
- handle missing values

In [4]:
# Make a copy of the original data
df = data.copy()

In [5]:
df.columns

Index(['Timestamp', 'How old are you?', 'What industry do you work in?',
       'Job title',
       'If your job title needs additional context, please clarify here:',
       'What is your annual salary? (You'll indicate the currency in a later question. If you are part-time or hourly, please enter an annualized equivalent -- what you would earn if you worked the job 40 hours a week, 52 weeks a year.)',
       'How much additional monetary compensation do you get, if any (for example, bonuses or overtime in an average year)? Please only include monetary compensation here, not the value of benefits.',
       'Please indicate the currency',
       'If "Other," please indicate the currency here: ',
       'If your income needs additional context, please provide it here:',
       'What country do you work in?',
       'If you're in the U.S., what state do you work in?',
       'What city do you work in?',
       'How many years of professional work experience do you have overall?',
       

**Standardize column names**

In [6]:
column_names = {
    'Timestamp': 'timestamp',
    'How old are you?': 'age_range',
    'What industry do you work in?': 'industry',
    'Job title': 'job_title',
    'If your job title needs additional context, please clarify here:': 'job_title_context',
    "What is your annual salary? (You'll indicate the currency in a later question. If you are part-time or hourly, please enter an annualized equivalent -- what you would earn if you worked the job 40 hours a week, 52 weeks a year.)":'salary(annualized)',
    'How much additional monetary compensation do you get, if any (for example, bonuses or overtime in an average year)? Please only include monetary compensation here, not the value of benefits.': 'additional_compensation',
    'Please indicate the currency':'currency',
    'If "Other," please indicate the currency here: ': 'other_currency',
    'If your income needs additional context, please provide it here:': 'income_context',
    'What country do you work in?': 'country',
    "If you're in the U.S., what state do you work in?":'state(US)',
    'What city do you work in?': 'city(US)',
    'How many years of professional work experience do you have overall?': 'years_of_experience',
    'How many years of professional work experience do you have in your field?': 'years_of_experience_in_field',
    'What is your highest level of education completed?': 'level_of_education',
    'What is your gender?': 'gender',
    'What is your race? (Choose all that apply.)': 'race'
}
df.columns = df.columns.map(column_names)

In [7]:
df.head()

Unnamed: 0,timestamp,age_range,industry,job_title,job_title_context,salary(annualized),additional_compensation,currency,other_currency,income_context,country,state(US),city(US),years_of_experience,years_of_experience_in_field,level_of_education,gender,race
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,,55000,0.0,USD,,,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,,54600,4000.0,GBP,,,United Kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,,34000,,USD,,,US,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,,62000,3000.0,USD,,,USA,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,,60000,7000.0,USD,,,US,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White


**Dropping Unnecessary Columns Or Columns with High Missing Values**

In [8]:
# drop columns with high missing values
(df.isnull().mean() *100).sort_values(ascending=False)

other_currency                  99.269449
income_context                  89.159331
job_title_context               74.120666
additional_compensation         25.996935
state(US)                       17.896725
level_of_education               0.791134
race                             0.630769
gender                           0.609387
city(US)                         0.292221
industry                         0.263711
job_title                        0.003564
timestamp                        0.000000
age_range                        0.000000
salary(annualized)               0.000000
currency                         0.000000
country                          0.000000
years_of_experience              0.000000
years_of_experience_in_field     0.000000
dtype: float64

'income_context', 'other_currency', 'job_title_context' all have more than 70% missing values, so we can drop them. But before doing so, let's check if there are any meaningful information in these columns.

In [9]:
print('Number of Non-null values in the Other_currency Column: ',len(df[~df['other_currency'].isnull()]))
print('Number of unique values in the Other_currency Column: ',df[~df['other_currency'].isnull()]['other_currency'].nunique())
df[~df['other_currency'].isnull()].iloc[:,:10].sample(20)

Number of Non-null values in the Other_currency Column:  205
Number of unique values in the Other_currency Column:  121


Unnamed: 0,timestamp,age_range,industry,job_title,job_title_context,salary(annualized),additional_compensation,currency,other_currency,income_context
27642,1/27/2022 14:13:05,18-24,Media & Digital,homie,,56,5.0,USD,5,5
27602,1/7/2022 4:55:01,25-34,Engineering or Manufacturing,Senior process engineer,,424823,71845.0,Other,THB,
11719,4/28/2021 4:09:53,25-34,Computing or Tech,Technical Writer,,187200,,Other,ILS (Shekel),"Standard work weeks in Israel are 45h, so I ca..."
24884,5/5/2021 21:27:45,45-54,Scientist,Staff Scientist,,1000000,,Other,NTD,
24198,5/4/2021 14:14:19,25-34,Computing or Tech,Software engineer,,59100,5500.0,Other,Euro,
3124,4/27/2021 11:59:26,25-34,Sales,Territory Manager,Software sales for automotive industry,33000,0.0,USD,Base plus Commission,Commision checks every month that vary based o...
9847,4/27/2021 19:05:54,25-34,Education (Higher Education),Teaching assistant,,210000,0.0,Other,NIS (new Israeli shekel),"The definition of ""full time"" is 22 hr/week, n..."
7545,4/27/2021 14:55:37,25-34,Engineering or Manufacturing,Administrative Assistant,Travel Coordinator,264,167.0,Other,Php,
27191,10/26/2021 13:08:45,25-34,Retail,waiter,,50000,3000.0,Other,croatian kuna,It is a regular job in a tourist Area so salar...
13218,4/28/2021 12:14:42,35-44,Health care,Operational Services,Kitchen Hand,83000,,Other,Australian Dollars,AU $ & NZ $ are very different. (That would be...


There are 121 out of 205 unique values in the 'other_currency' column, with makes it a lot of unique values to deal with and the benefit of dealing with each one is minimal. So the current approach at the moment would be to first drop all the rows in the dataframe that have values in the other_currency column because they can skew the analysis of the data then drop the column itself after.

The other columns with high missing values (income_context, job_title_context) can be dropped without losing any relevant information.

In [10]:
# Remove rows where 'other_currency' column has non-null values.
df = df[df['other_currency'].isnull()]
df.shape

(27856, 18)

In [11]:
# Drop unnecessary columns.
df = df.drop(columns=['income_context', 'other_currency', 'job_title_context'],axis=1)

In [12]:
df.head()

Unnamed: 0,timestamp,age_range,industry,job_title,salary(annualized),additional_compensation,currency,country,state(US),city(US),years_of_experience,years_of_experience_in_field,level_of_education,gender,race
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,55000,0.0,USD,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,54600,4000.0,GBP,United Kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,34000,,USD,US,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,62000,3000.0,USD,USA,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,60000,7000.0,USD,US,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White


**Cleaning other columns with missing values**

In [13]:
(df.isnull().mean() * 100).sort_values(ascending=False)

additional_compensation         25.969271
state(US)                       17.428920
level_of_education               0.771827
race                             0.621051
gender                           0.610281
city(US)                         0.294371
industry                         0.258472
job_title                        0.003590
salary(annualized)               0.000000
timestamp                        0.000000
age_range                        0.000000
currency                         0.000000
country                          0.000000
years_of_experience              0.000000
years_of_experience_in_field     0.000000
dtype: float64

- Additional Compensation Column

In [14]:
# fill missing values in 'additional_compensation' column with 0
df['additional_compensation'] = df['additional_compensation'].fillna(0)

- State(US) Column

In [15]:
# Viewing the pattern of the state(US) column to see if United States has have corresponding states and 
# non United States entries have null values.
df.iloc[:,6:9]

Unnamed: 0,currency,country,state(US)
0,USD,United States,Massachusetts
1,GBP,United Kingdom,
2,USD,US,Tennessee
3,USD,USA,Wisconsin
4,USD,US,South Carolina
...,...,...,...
28056,CAD,Canada,
28057,USD,United States,Missouri
28058,USD,USA,Georgia
28059,USD,Myanmar,Colorado


In [16]:
# View rows with missing values in 'State(US)' column to confirm the pattern.
df[df['state(US)'].isnull()].iloc[:,6:10]

Unnamed: 0,currency,country,state(US),city(US)
1,GBP,United Kingdom,,Cambridge
10,USD,United States,,"Boston, MA"
14,CAD,Canada,,Remote
15,GBP,United Kingdom,,Lincoln
21,USD,United States,,Atlanta
...,...,...,...,...
28050,EUR,Spain,,Barcelona
28053,CAD,Taiwan,,Taipei
28055,CAD,Canada,,Toronto
28056,CAD,Canada,,Kitchener


It appears that there are some rows where the 'country' column is 'United States' but the 'State(US)' column is missing.

Since there are multiple entries where the in the country column United States is written in many different ways, we will convert all entries to 'United States' to make it consistent first before dealing with the missing state entries.

In [17]:
df['country'].unique()

array(['United States', 'United Kingdom', 'US', 'USA', 'Canada',
       'United Kingdom ', 'usa', 'UK', 'Scotland ', 'U.S.',
       'United States ', 'The Netherlands', 'Australia ', 'Spain', 'us',
       'Usa', 'England', 'finland', 'United States of America', 'France',
       'United states', 'Scotland', 'USA ', 'United states ', 'Germany',
       'UK ', 'united states', 'Ireland', 'Australia', 'Uk',
       'United States of America ', 'U.S. ', 'canada', 'Canada ', 'U.S>',
       'ISA', 'Great Britain ', 'US ', 'United State', 'U.S.A', 'Denmark',
       'U.S.A.', 'America', 'Netherlands', 'netherlands', 'England ',
       'united states of america', 'Ireland ', 'Switzerland',
       'Netherlands ', 'Bermuda', 'Us', 'The United States',
       'United State of America', 'Germany ', 'Mexico ', 'United Stated',
       'South Africa ', 'Belgium', 'Northern Ireland', 'u.s.',
       'South Africa', 'UNITED STATES', 'united States', 'Sweden',
       'Hong Kong', 'Sri lanka', 'Contracts', 'U

Starting with USA, we can see it was entered in many ways. We will convert them all to 'United States'.

- Convert all USA entries in 'country' column to 'United States'.

In [18]:
# First make all the entries in 'country' column lowercase.
df['country'] = df['country'].str.lower().str.strip()

In [19]:
# show all unique country entries that start with the letter 'u'
df[df['country'].str.startswith('u')]['country'].unique()

array(['united states', 'united kingdom', 'us', 'usa', 'uk', 'u.s.',
       'united states of america', 'u.s>', 'united state', 'u.s.a',
       'u.s.a.', 'united state of america', 'united stated',
       'usa-- virgin islands', 'united statws', 'u.s', 'unites states',
       'u. s.', 'united sates', 'united states of american',
       'uniited states', 'united kingdom (england)',
       'united sates of america',
       'united states (i work from home and my clients are all over the us/canada/pr',
       'unted states', 'united statesp', 'united stattes',
       'united statea', 'united kingdom.', 'united statees',
       'uniyed states', 'uniyes states', 'united states of americas',
       'u.a.', 'us of a', 'united arab emirates', 'u.k.', 'u.sa',
       'united kindom', 'united status', 'uxz', 'uss', 'uniteed states',
       'united stares', 'uk (northern ireland)', 'uk for u.s. company',
       'unite states', 'united kingdomk', 'unitedstates',
       'u.k. (northern england)', 'u

In [20]:
# Get all the usa varitions into list
usa_list = ['united states', 'united kingdom', 'us', 'usa', 'u.s.',
       'united states of america', 'u.s>', 'united state', 'u.s.a',
       'u.s.a.', 'united state of america', 'united stated',
       'usa-- virgin islands', 'united statws', 'u.s', 'unites states',
       'u. s.', 'united sates', 'united states of american',
       'uniited states', 'united sates of america',
       'united states (i work from home and my clients are all over the us/canada/pr',
       'unted states', 'united statesp', 'united stattes',
       'united statea', 'united statees',
       'uniyed states', 'uniyes states', 'united states of americas',
       'u.a.', 'us of a', 'u.sa',
       'united status', 'uniteed states',
       'united stares', 'unite states', 'unitedstates',
       'united statew', 'united statues', 'untied states',
       'usa (company is based in a us territory, i work remote)',
       'usab', 'unitied states',
       'united sttes', 'uniter statez', 'u. s',
       'usa tomorrow', 'united stateds',
       'usat', 'unitef stated', 'ua', 'usaa',
       'united y', 'united statss', 'united  states',
       'united states is america','america','isa','the united states',
       'san francisco', '🇺🇸','california','the us']

# replace all USA entries in 'country' column with 'United States' using the list usa_list
df['country'] = df['country'].apply(lambda x: 'united states' if x in usa_list else x)

In [21]:
df['country'].unique()

array(['united states', 'canada', 'uk', 'scotland', 'the netherlands',
       'australia', 'spain', 'england', 'finland', 'france', 'germany',
       'ireland', 'great britain', 'denmark', 'netherlands',
       'switzerland', 'bermuda', 'mexico', 'south africa', 'belgium',
       'northern ireland', 'sweden', 'hong kong', 'sri lanka',
       'contracts', 'england/uk',
       "we don't get raises, we get quarterly bonuses, but they periodically asses income in the area you work, so i got a raise because a 3rd party assessment showed i was paid too little for the area we were located",
       'england, uk.', 'greece', 'japan', 'britain', 'austria',
       'canada, ottawa, ontario', 'global', 'united kingdom (england)',
       'worldwide (based in us but short term trips aroudn the world)',
       'canadw', 'luxembourg', 'united kingdom.', 'new zealand',
       'cayman islands', 'can',
       'i am located in canada but i work for a company in the us',
       'latvia', 'puerto rico', 'rwa

In [22]:
# Get all the uk varitions into list
uk_list = ['uk', 'united kingdom (england)',
       'united kingdom.', 'u.k.', 'united kindom',
       'uk (northern ireland)', 'uk for u.s. company',
       'united kingdomk', 'u.k. (northern england)', 'u.k',
       'uk (england)', 'uk, remote', 'unites kingdom',
       'uk, but for globally fully remote company','england','england/uk',
       'england, uk.','britain','england, uk','wales (united kingdom)', 'england, gb',
       'england, united kingdom','englang','scotland, uk','wales, uk', 'northern ireland',
       'wales (uk)','northern ireland, united kingdom','london','great britain']

# replace all USA entries in 'country' column with 'United States' using the list usa_list
df['country'] = df['country'].apply(lambda x: 'united kingdom' if x in uk_list else x)

In [25]:
# Get all the canada varitions into list
canada_list = ['canada', 'canada, ottawa, ontario', 'canadw',
       'can', 'canda', 'canada and usa', 'canad',
       'canadá','csnada']

# replace all USA entries in 'country' column with 'United States' using the list usa_list
df['country'] = df['country'].apply(lambda x: 'canada' if x in canada_list else x)

Since some entries in the 'country' column have sentences rather than country names, we will drop these rows

In [32]:
# Drop rows where 'country' column has more than 3 words
df = df[df['country'].str.split().apply(len) <= 3]

Getting the count of each country in the dataframe

In [65]:
value_counts = df['country'].value_counts()
value_counts

country
united states     23774
canada             1683
united kingdom      912
australia           377
germany             195
                  ...  
saudi arabia          1
loutreland            1
taiwan                1
myanmar               1
burma                 1
Name: count, Length: 125, dtype: int64

Also based on the frequency of the top countries, we will keep only the countries with more than 100 entries, as countries with less than that may not provide meaningful insights in this dataset.

In [66]:
df = df[df['country'].isin(value_counts[value_counts > 100].index)]

In [61]:
# check if value_counts is a dataframe or series


Index(['united states', 'canada', 'united kingdom', 'australia', 'germany',
       'ireland', 'new zealand'],
      dtype='object', name='country')

In [52]:
value_counts[value_counts > 5]

country
united states      23774
canada              1683
united kingdom       912
australia            377
germany              195
ireland              122
new zealand          119
france                68
netherlands           57
spain                 49
scotland              45
sweden                40
switzerland           37
belgium               35
the netherlands       31
japan                 29
austria               17
south africa          17
finland               16
italy                 14
denmark               10
israel                 7
india                  6
nz                     6
singapore              6
Name: count, dtype: int64