In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [2]:
def launch_project(file_path):
    """Load the survey CSV into the DataFrame used by the rest of the pipeline.

    Parameters
    ----------
    file_path
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A copy of the parsed CSV contents.
    """
    # Parse and copy in one expression; the copy keeps the working frame
    # independent from the object produced by the parser.
    return pd.read_csv(file_path).copy()

In [3]:
def columns_fix(data):
    """Rename the survey columns, drop the unused ones and fill missing values.

    The 18 original columns (the survey questions) are renamed positionally to
    short snake_case names. Three columns are then dropped
    ('job_title_details', 'income_addtl_detail', 'other_currency') and the
    missing values of several remaining columns are replaced by a default.

    The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with exactly 18 columns, in the order of ``new_columns`` below.

    Returns
    -------
    pandas.DataFrame
        The same frame, with renamed columns and filled gaps.

    Raises
    ------
    ValueError
        If ``data`` does not have exactly 18 columns (raised by pandas when
        assigning the new column labels).
    """
    new_columns = [
        'timestamp', 'age_range', 'industry', 'job_title', 'job_title_details',
        'annual_salary', 'addtl_compensation', 'currency', 'other_currency',
        'income_addtl_detail', 'country', 'us_state', 'city',
        'years_experience_total', 'years_experience_field', 'education_level',
        'gender', 'race',
    ]
    data.columns = new_columns

    cols_to_remove_nulls = ['job_title_details', 'income_addtl_detail', 'other_currency']
    data.drop(columns=cols_to_remove_nulls, inplace=True)

    # Default used for each column when the respondent left it blank.
    fill_values = {
        'industry': 'Not specified',
        'education_level': 'Not specified',
        'gender': 'Other or prefer not to answer',
        'addtl_compensation': 0,
        'race': 'Another option not listed here or prefer not to answer',
        'city': 'Not specified',
        'us_state': 'Not specified',
    }
    # Reassign the column instead of calling ``fillna(inplace=True)`` on the
    # accessed Series: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    for column, value in fill_values.items():
        data[column] = data[column].fillna(value)

    # Manual correction of one specific survey row. Guarded so that a frame
    # without label 6477 is not silently enlarged with a mostly-empty row.
    if 6477 in data.index:
        data.loc[6477, 'gender'] = 'Other or prefer not to answer'

    return data



In [4]:
def time_and_salary_fix(data):
    """Break down the timestamp and build a numeric annual salary column.

    The 'timestamp' column is converted to datetime and its day, month and
    year are stored in the new columns 'day', 'month' and 'year' (the time of
    day is not extracted).

    The 'annual_salary' column may contain dollar signs and thousands
    separators; these are stripped and the result is stored as float in the
    new column 'annual_salary_clean'. Values that are already numeric are
    accepted as well, and missing values become NaN.

    The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns 'timestamp' and 'annual_salary'.

    Returns
    -------
    pandas.DataFrame
        The same frame with the four added columns.

    Raises
    ------
    ValueError
        If a salary cannot be interpreted as a number once '$' and ','
        are removed.
    """
    timestamps = pd.to_datetime(data['timestamp'])
    data['timestamp'] = timestamps
    data['day'] = timestamps.dt.day
    data['month'] = timestamps.dt.month
    data['year'] = timestamps.dt.year

    # Cast to str first so numeric or missing cells do not break the
    # clean-up; the regex only removes '$' and ','.
    data['annual_salary_clean'] = (
        data['annual_salary']
        .astype(str)
        .str.replace(r'[$,]', '', regex=True)
        .astype(float)
    )

    return data

In [5]:
def encode_age_range(element: str) -> int:
    """
    Map an age-range label to its ordinal code.

    The labels are ranked from youngest ('under 18' -> 0) to oldest
    ('65 or over' -> 6). Any value that is not one of the known labels
    gets the sentinel code 1000.

    Return
        Int
    """
    # Position in this tuple is the code assigned to the label.
    ordered_ranges = (
        'under 18',
        '18-24',
        '25-34',
        '35-44',
        '45-54',
        '55-64',
        '65 or over',
    )
    for code, label in enumerate(ordered_ranges):
        if element == label:
            return code
    return 1000

In [6]:
def encode_years_experience_total(element: str) -> int:
    """
    Map a total-years-of-experience label to its ordinal code.

    Labels are ranked from '1 year or less' (0) up to '41 years or more' (7).
    Any value that is not one of the known labels gets the sentinel code 1000.

    Return
        Int
    """
    # Index in the tuple == code; spacing of each label matches the survey.
    brackets = (
        '1 year or less',
        '2 - 4 years',
        '5-7 years',
        '8 - 10 years',
        '11 - 20 years',
        '21 - 30 years',
        '31 - 40 years',
        '41 years or more',
    )
    matching_codes = (
        rank for rank, bracket in enumerate(brackets) if element == bracket
    )
    return next(matching_codes, 1000)

In [7]:
def encode_years_experience_field(element: str) -> int:
    """
    Map a years-of-experience-in-field label to its ordinal code.

    Labels are ranked from '1 year or less' (0) up to '41 years or more' (7).
    Any value that is not one of the known labels gets the sentinel code 1000.

    Return
        Int
    """
    # Labels listed in rank order: the list position is the returned code.
    labels_by_rank = [
        '1 year or less',
        '2 - 4 years',
        '5-7 years',
        '8 - 10 years',
        '11 - 20 years',
        '21 - 30 years',
        '31 - 40 years',
        '41 years or more',
    ]
    hits = [rank for rank, label in enumerate(labels_by_rank) if element == label]
    if hits:
        return hits[0]
    return 1000

In [8]:
def encoding_fields(data):
    '''
    Add an integer-encoded companion column for each of the three ordinal
    survey fields, so they can be used in numeric analysis:
        - age_range               -> age_range_encoded
        - years_experience_field  -> years_experience_field_encoded
        - years_experience_total  -> years_experience_total_encoded

    The frame is modified in place.

    return
        data with added columns with encoded fields
    '''
    # (source column, new column, element-wise encoder), in insertion order.
    encoding_plan = (
        ('age_range', 'age_range_encoded', encode_age_range),
        ('years_experience_field', 'years_experience_field_encoded', encode_years_experience_field),
        ('years_experience_total', 'years_experience_total_encoded', encode_years_experience_total),
    )
    for source_column, encoded_column, encoder in encoding_plan:
        data[encoded_column] = data[source_column].apply(encoder)
    return data

In [9]:
def fix_race_and_industry_columns(data):
    '''
    Collapse the long tail of values in the 'race' and 'industry' columns.

    Both columns contain a very large number of distinct values, while a
    handful of them account for most of the answers. Every value that is not
    in the corresponding keep-list below is replaced by 'Other'.

    The frame is modified in place and also returned.

    return
        data with summarized race and industry columns
    '''
    industry_to_keep = [
        'Computing or Tech', 'Education (Higher Education)', 'Nonprofits',
        'Government and Public Administration', 'Health care',
        'Accounting, Banking & Finance', 'Engineering or Manufacturing',
        'Marketing, Advertising & PR', 'Law', 'Business or Consulting',
        'Education (Primary/Secondary)', 'Media & Digital', 'Insurance',
        'Retail', 'Recruitment or HR', 'Property or Construction',
        'Art & Design', 'Utilities & Telecommunications',
        'Transport or Logistics', 'Sales',
    ]
    race_to_keep = [
        'White', 'Asian or Asian American', 'Black or African American',
        'Hispanic, Latino, or Spanish origin',
        'Another option not listed here or prefer not to answer',
    ]

    # Assign the result back to the column rather than using
    # ``Series.where(..., inplace=True)`` on ``data[col]``: the chained
    # in-place form does not update the frame under pandas Copy-on-Write.
    for column, keep in (('industry', industry_to_keep), ('race', race_to_keep)):
        data[column] = data[column].where(data[column].isin(keep), other='Other')

    return data


In [10]:
def data_cleaning_spider(file_path='Ask A Manager Salary Survey 2021 (Responses) - Form Responses 1.csv'):
    """Run the whole cleaning pipeline and return the cleaned survey data.

    Steps, in order: load the CSV, rename/drop/fill columns, break down the
    timestamp and clean the salary, encode the ordinal fields, summarize the
    'race' and 'industry' columns, and finally reorder the columns. A progress
    message is printed after each step.

    Parameters
    ----------
    file_path : str, optional
        Path of the survey CSV. Defaults to the original export file name,
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        Cleaned data with the columns in the order listed in
        ``ordered_columns``. 'job_title', 'country' and 'us_state' are not
        cleaned by this pipeline.
    """
    print('Project data cleaning started')
    data = launch_project(file_path)
    print('Data correctly extracted')
    data = columns_fix(data)
    print('Columns fixed correctly')
    data = time_and_salary_fix(data)
    print('Time broken down')
    data = encoding_fields(data)
    print('Encoded fields done')
    data = fix_race_and_industry_columns(data)
    print('"race" and "industry" columns fixed')
    print()

    # Final layout: each encoded/clean column sits next to its source column.
    ordered_columns = [
        'timestamp', 'age_range', 'age_range_encoded', 'industry', 'job_title',
        'annual_salary', 'annual_salary_clean', 'addtl_compensation',
        'currency', 'country', 'us_state', 'city', 'years_experience_total',
        'years_experience_total_encoded', 'years_experience_field',
        'years_experience_field_encoded', 'education_level', 'gender', 'race',
        'day', 'month', 'year',
    ]
    data = data[ordered_columns]
    print('Columns order adjusted and data cleaning complete up to this point')
    print()
    print('Items pending to clean = "job_title", "country", and "us_state"')
    return data

In [11]:
# Run the full cleaning pipeline: loads the survey CSV from the working
# directory and prints a progress message after each step.
data_cleaning_spider()

Project data cleaning started
Data correctly extracted
Columns fixed correctly
Time broken down
Encoded fields done
"race" and "industry" columns fixed

Columns order adjusted and data cleaning complete up to this point

Items pending to clean = "job_title", "country", and "us_state"
Project data cleaning started
Data correctly extracted
Columns fixed correctly
Time broken down
Encoded fields done
"race" and "industry" columns fixed

Columns order adjusted and data cleaning complete up to this point

Items pending to clean = "job_title", "country", and "us_state"
