In [67]:
import pandas as pd
import numpy as np
from IPython.display import display


In [68]:
# Load the interim dataset (presumably the output of the first cleaning round — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle('../data/interim/interim_cleaned_data.pickle')

In [69]:
# df.info() prints its summary itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
# Last expression of the cell: rendered as a rich table.
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 26613 entries, 0 to 28058
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   timestamp                     26613 non-null  object 
 1   age_range                     26613 non-null  object 
 2   industry                      26613 non-null  object 
 3   job_title                     26613 non-null  object 
 4   salary(annualized)            26613 non-null  object 
 5   additional_compensation       26613 non-null  float64
 6   currency                      26613 non-null  object 
 7   country                       26613 non-null  object 
 8   us_state                      26613 non-null  object 
 9   us_city                       26613 non-null  object 
 10  years_of_experience           26613 non-null  object 
 11  years_of_experience_in_field  26613 non-null  object 
 12  level_of_education            26613 non-null  object 
 13  gender

Unnamed: 0,timestamp,age_range,industry,job_title,salary(annualized),additional_compensation,currency,country,us_state,us_city,years_of_experience,years_of_experience_in_field,level_of_education,gender,race
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,55000,0.0,USD,united states,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,54600,4000.0,GBP,united kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,34000,0.0,USD,united states,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,62000,3000.0,USD,united states,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,60000,7000.0,USD,united states,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White


### Second Round of Cleaning

- Check for duplicates
- Convert date columns to datetime type
- Convert salary and additional compensation columns to numeric type
- Convert all currencies to USD using conversion rates as of 2021
- Convert all categorical columns to category type
- Drop columns that are not needed for analysis


**Check for duplicates**

In [70]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
df.duplicated().sum()

np.int64(0)

No duplicates found

**Convert date columns to datetime type**

In [71]:
# Parse the timestamps with an explicit month/day/year format
# (e.g. '4/27/2021 11:02:10') instead of relying on format inference,
# so the day/month order is never guessed.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%m/%d/%Y %H:%M:%S')

**Convert salary to numeric type**

In [72]:
# Render floats with exactly two decimal places in displayed output.
pd.set_option('display.float_format', '{:.2f}'.format)

In [73]:
# Count salary values that would not parse as a number once the thousands
# separators are removed. Looking only for letters would miss other stray
# characters such as '$'.
unparseable_salaries = pd.to_numeric(
    df['salary(annualized)'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',  # unparseable values become NaN so they can be counted
).isna()
print(unparseable_salaries.sum())

0


In [74]:
# Strip the thousands separators and convert the salary column to a numeric dtype.
# astype(str) keeps this cell safe to re-run after the column is already numeric
# (the .str accessor only works on string data); regex=False treats ',' literally.
df['salary(annualized)'] = pd.to_numeric(
    df['salary(annualized)'].astype(str).str.replace(',', '', regex=False)
)

**Convert all currencies to USD using conversion rates as of 2021**

In [75]:
# List the distinct currency codes so that each one can be given a conversion rate
df['currency'].unique()

array(['USD', 'GBP', 'CAD', 'AUD/NZD', 'EUR', 'CHF', 'SEK', 'Other',
       'JPY'], dtype=object)

In [76]:
# Approximate 2021 exchange rates: USD value of one unit of each survey currency code.
# NOTE(review): the rates are rounded approximations — confirm them against an
# authoritative 2021 source before publishing results.
currency_conversion = {
    "USD": 1,
    "EUR": 1.18,
    "GBP": 1.36,
    "CAD": 0.78,
    # AUD and NZD share one survey code, so a single approximate rate covers both.
    "AUD/NZD": 0.72,
    "CHF": 1.08,
    # SEK appears in the currency column, so it needs a rate too (approximate).
    "SEK": 0.12,
    "JPY": 0.0091,
    # "Other" has no defined rate: NaN marks the converted salary as missing
    # instead of silently turning it into 0.
    "Other": np.nan,
}

In [77]:
def convert_to_usd(row, rates=None):
    """Convert one row's annualized salary to USD.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'currency' and 'salary(annualized)'.
    rates : dict, optional
        Maps a currency code to the USD value of one unit of that currency.
        Defaults to the notebook-level ``currency_conversion`` table.

    Returns
    -------
    number
        The salary multiplied by the currency's rate, or NaN when the currency
        has no rate. Returning the raw amount for an unknown currency would
        silently treat it as if it were already in USD.
    """
    if rates is None:
        # Looked up at call time so the function follows later edits to the table.
        rates = currency_conversion
    rate = rates.get(row['currency'], np.nan)
    return row['salary(annualized)'] * rate

In [79]:
# Apply convert_to_usd to every row (axis=1) and store the result in a new 'salary_usd' column.
df['salary_usd'] = df.apply(convert_to_usd, axis=1)

**Convert all categorical columns to category type**

In [89]:
# Cast the listed columns to pandas' category dtype; the column list is
# written once and used for both selection and assignment.
categorical_columns = [
    "age_range",
    "gender",
    "years_of_experience",
    "years_of_experience_in_field",
    "level_of_education",
]
df[categorical_columns] = df[categorical_columns].astype("category")

In [90]:
# Confirm the dtype changes made above (datetime, numeric and category columns).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26613 entries, 0 to 28058
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   timestamp                     26613 non-null  datetime64[ns]
 1   age_range                     26613 non-null  category      
 2   industry                      26613 non-null  object        
 3   job_title                     26613 non-null  object        
 4   salary(annualized)            26613 non-null  int64         
 5   additional_compensation       26613 non-null  float64       
 6   currency                      26613 non-null  object        
 7   country                       26613 non-null  object        
 8   us_state                      26613 non-null  object        
 9   us_city                       26613 non-null  object        
 10  years_of_experience           26613 non-null  category      
 11  years_of_experience_in_field  266

**Drop columns that are not needed for analysis**

In [93]:
# Drop the original-currency salary and the currency code.
# NOTE(review): in the cells shown here additional_compensation is never converted to USD;
# once 'currency' is dropped it can no longer be converted — confirm this is intended.
df = df.drop(columns=['salary(annualized)', 'currency'])