In [1]:
# import necessary modules
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt


from IPython.display import display

### Load the data & preview the data

In [2]:
# Read the raw survey export; later cells work on a copy of this frame.
data = pd.read_csv(
    filepath_or_buffer='../data/raw/Copy of Ask A Manager Salary Survey 2021 (Responses) - Form Responses 1.csv'
)

In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly: wrapping it in print() adds a stray "None" line.
data.info()
# The last expression of the cell is rendered as a rich table.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28061 entries, 0 to 28060
Data columns (total 18 columns):
 #   Column                                                                                                                                                                                                                                Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                                --------------  -----  
 0   Timestamp                                                                                                                                                                                                                             28061 non-null  object 
 1   How old are you?                                                                                             

Unnamed: 0,Timestamp,How old are you?,What industry do you work in?,Job title,"If your job title needs additional context, please clarify here:","What is your annual salary? (You'll indicate the currency in a later question. If you are part-time or hourly, please enter an annualized equivalent -- what you would earn if you worked the job 40 hours a week, 52 weeks a year.)","How much additional monetary compensation do you get, if any (for example, bonuses or overtime in an average year)? Please only include monetary compensation here, not the value of benefits.",Please indicate the currency,"If ""Other,"" please indicate the currency here:","If your income needs additional context, please provide it here:",What country do you work in?,"If you're in the U.S., what state do you work in?",What city do you work in?,How many years of professional work experience do you have overall?,How many years of professional work experience do you have in your field?,What is your highest level of education completed?,What is your gender?,What is your race? (Choose all that apply.)
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,,55000,0.0,USD,,,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,,54600,4000.0,GBP,,,United Kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,,34000,,USD,,,US,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,,62000,3000.0,USD,,,USA,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,,60000,7000.0,USD,,,US,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White


### Cleaning data:
- Standardize column names
- Remove unnecessary columns
- Convert columns to the appropriate data types
- Handle missing values

In [4]:
# Work on an independent deep copy so the raw `data` frame stays unmodified.
df = data.copy(deep=True)

In [5]:
# Show the raw column headers (the full survey question texts) before renaming.
df.columns

Index(['Timestamp', 'How old are you?', 'What industry do you work in?',
       'Job title',
       'If your job title needs additional context, please clarify here:',
       'What is your annual salary? (You'll indicate the currency in a later question. If you are part-time or hourly, please enter an annualized equivalent -- what you would earn if you worked the job 40 hours a week, 52 weeks a year.)',
       'How much additional monetary compensation do you get, if any (for example, bonuses or overtime in an average year)? Please only include monetary compensation here, not the value of benefits.',
       'Please indicate the currency',
       'If "Other," please indicate the currency here: ',
       'If your income needs additional context, please provide it here:',
       'What country do you work in?',
       'If you're in the U.S., what state do you work in?',
       'What city do you work in?',
       'How many years of professional work experience do you have overall?',
       

**Standardize column names**

In [6]:
# Mapping from the verbose survey question text (the raw CSV headers) to short
# analysis-friendly names. Keys must match the raw headers exactly, including
# trailing whitespace (see the "Other" currency question).
column_names = {
    'Timestamp': 'timestamp',
    'How old are you?': 'age_range',
    'What industry do you work in?': 'industry',
    'Job title': 'job_title',
    'If your job title needs additional context, please clarify here:': 'job_title_context',
    "What is your annual salary? (You'll indicate the currency in a later question. If you are part-time or hourly, please enter an annualized equivalent -- what you would earn if you worked the job 40 hours a week, 52 weeks a year.)":'salary(annualized)',
    'How much additional monetary compensation do you get, if any (for example, bonuses or overtime in an average year)? Please only include monetary compensation here, not the value of benefits.': 'additional_compensation',
    'Please indicate the currency':'currency',
    'If "Other," please indicate the currency here: ': 'other_currency',
    'If your income needs additional context, please provide it here:': 'income_context',
    'What country do you work in?': 'country',
    "If you're in the U.S., what state do you work in?":'state(US)',
    # NOTE(review): the city question is not restricted to the U.S., so the
    # '(US)' suffix looks misleading; kept as-is because later cells use this name.
    'What city do you work in?': 'city(US)',
    'How many years of professional work experience do you have overall?': 'years_of_experience',
    'How many years of professional work experience do you have in your field?': 'years_of_experience_in_field',
    'What is your highest level of education completed?': 'level_of_education',
    'What is your gender?': 'gender',
    'What is your race? (Choose all that apply.)': 'race'
}
# rename() leaves any header that is missing from the mapping untouched, whereas
# Index.map(dict) silently replaces it with NaN. It also makes this cell safe to
# re-run: already-renamed columns are simply left alone.
df = df.rename(columns=column_names)

# Fail loudly if a column did not receive a standardized name (for example
# because a header in the CSV changed), instead of carrying it along unnoticed.
unexpected_columns = set(df.columns) - set(column_names.values())
assert not unexpected_columns, f"Columns without a standardized name: {sorted(unexpected_columns)}"

In [7]:
# Preview the first rows to check the column headers after renaming.
df.head()

Unnamed: 0,timestamp,age_range,industry,job_title,job_title_context,salary(annualized),additional_compensation,currency,other_currency,income_context,country,state(US),city(US),years_of_experience,years_of_experience_in_field,level_of_education,gender,race
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,,55000,0.0,USD,,,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,,54600,4000.0,GBP,,,United Kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,,34000,,USD,,,US,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,,62000,3000.0,USD,,,USA,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,,60000,7000.0,USD,,,US,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White


**Dropping Unnecessary Columns Or Columns with High Missing Values**

In [8]:
# Percentage of missing values per column, highest first. Nothing is dropped
# here; the result is used to decide which columns are candidates for removal.
df.isna().mean().mul(100).sort_values(ascending=False)

other_currency                  99.269449
income_context                  89.159331
job_title_context               74.120666
additional_compensation         25.996935
state(US)                       17.896725
level_of_education               0.791134
race                             0.630769
gender                           0.609387
city(US)                         0.292221
industry                         0.263711
job_title                        0.003564
timestamp                        0.000000
age_range                        0.000000
salary(annualized)               0.000000
currency                         0.000000
country                          0.000000
years_of_experience              0.000000
years_of_experience_in_field     0.000000
dtype: float64

'income_context', 'other_currency', and 'job_title_context' each have more than 70% missing values, so they are candidates for dropping. Before doing so, let's check whether these columns contain any meaningful information.

In [10]:
# Inspect the free-text 'other_currency' answers before deciding how to treat them.
# The non-null mask is computed once and reused instead of being rebuilt per line.
has_other_currency = df['other_currency'].notna()
n_other_currency = int(has_other_currency.sum())
print('Number of Non-null values in the Other_currency Column: ', n_other_currency)
# nunique() ignores NaN by default, so no pre-filtering is needed.
print('Number of unique values in the Other_currency Column: ', df['other_currency'].nunique())
# A fixed random_state gives the same preview on every run; min() keeps the cell
# from raising if fewer than 20 rows have an 'other_currency' entry.
df.loc[has_other_currency].iloc[:, :10].sample(n=min(20, n_other_currency), random_state=42)

Number of Non-null values in the Other_currency Column:  205
Number of unique values in the Other_currency Column:  121


Unnamed: 0,timestamp,age_range,industry,job_title,job_title_context,salary(annualized),additional_compensation,currency,other_currency,income_context
11212,4/27/2021 23:45:09,25-34,Education (Higher Education),Research Fellow,Postdoctorales research Fellow at a medical sc...,80000,8000.0,Other,SGD,
7915,4/27/2021 15:23:14,25-34,Computing or Tech,Executive Assiatant II,Grade 6,86000,20000.0,USD,Overtime (about 5 hours a week) and bonus,
17547,4/28/2021 20:46:27,18-24,Education (Higher Education),Quality Assurance Auditor,,2400,1000.0,USD,Converted mine into USD for your easyness,
27504,11/16/2021 12:01:59,25-34,Business or Consulting,Executive Assistant,,37200,2000.0,Other,PLN,
11457,4/28/2021 1:32:51,35-44,Government and Public Administration,Senior manager,Innovation and research management for governm...,135000,0.0,Other,Sgd,
18495,4/29/2021 0:31:33,35-44,Computing or Tech,Senior Product Designer,,92580,9000.0,Other,SGD,
16627,4/28/2021 18:17:19,35-44,Computing or Tech,Director of Engineering,,580000,100000.0,Other,Israeli Shekels,
16139,4/28/2021 17:33:28,18-24,Interior Design (commercial),Interior Designer,,40000,1200.0,USD,additional compensation is for overtime (i am ...,
11620,4/28/2021 2:49:46,25-34,Biotech (R&D),Development scientist,,556200,3000.0,Other,DKK,
27846,7/14/2022 7:42:59,35-44,Law,Legal Practitioner,Senior Associate,20000,,Other,NGN,


The 'other_currency' column holds 121 unique values across its 205 non-null entries, which is a lot of distinct values to deal with, and the benefit of cleaning each one is minimal. The current approach is therefore to first drop every row that has a value in the other_currency column, because those rows could skew the analysis of the data, and then drop the column itself.

The other columns with high missing values (income_context, job_title_context) can be dropped without losing any relevant information.

In [15]:
# Keep only the rows with no entry in 'other_currency', i.e. discard every row
# that has one, then report the remaining (rows, columns).
df = df.loc[df['other_currency'].isna()]
df.shape


(27856, 18)

In [16]:
# Remove the three mostly-empty columns (each more than 70% missing).
# `columns=` already names the axis, so no separate `axis` argument is passed.
df = df.drop(columns=['income_context', 'other_currency', 'job_title_context'])