#### This notebook creates additional features that may be useful for the analysis.

In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Render every float in DataFrame/Series output with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)


In [140]:
# Load the processed, cleaned dataset from its relative path.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
PROCESSED_DATA_PATH = '../data/processed/processed_cleaned_data.pickle'
df = pd.read_pickle(PROCESSED_DATA_PATH)

In [141]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
# Last expression of the cell: rich display of the first rows.
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 26563 entries, 0 to 28058
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   timestamp                     26563 non-null  datetime64[ns]
 1   age_range                     26563 non-null  category      
 2   industry                      26563 non-null  object        
 3   job_title                     26563 non-null  object        
 4   country                       26563 non-null  object        
 5   us_state                      26563 non-null  object        
 6   us_city                       26563 non-null  object        
 7   years_of_experience           26563 non-null  category      
 8   years_of_experience_in_field  26563 non-null  category      
 9   level_of_education            26563 non-null  category      
 10  gender                        26563 non-null  category      
 11  race                          265

Unnamed: 0,timestamp,age_range,industry,job_title,country,us_state,us_city,years_of_experience,years_of_experience_in_field,level_of_education,gender,race,salary_usd,additional_compensation_usd
0,2021-04-27 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,united states,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White,55000.0,0.0
1,2021-04-27 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,united kingdom,,Cambridge,8 - 10 years,5-7 years,College degree,Non-binary,White,74256.0,5440.0
2,2021-04-27 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,united states,Tennessee,Chattanooga,2 - 4 years,2 - 4 years,College degree,Woman,White,34000.0,0.0
3,2021-04-27 11:02:41,25-34,Nonprofits,Program Manager,united states,Wisconsin,Milwaukee,8 - 10 years,5-7 years,College degree,Woman,White,62000.0,3000.0
4,2021-04-27 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,united states,South Carolina,Greenville,8 - 10 years,5-7 years,College degree,Woman,White,60000.0,7000.0


#### Features To Be Created:
- Total Compensation: Combine salary_usd and additional_compensation_usd to create a total_compensation feature.
- Salary Bands: Create salary bands or categories (e.g., low, medium, high)
- Log Salary: Use a logarithmic transformation of salary to handle skewness in salary distribution.
- Convert the years_of_experience and years_of_experience_in_field columns into numerical values using the midpoint of each range.
- Calculate the difference between years_of_experience and years_of_experience_in_field to see if people are transitioning into new fields.
- Convert multi-race entries into binary columns (e.g., is_white, is_asian) for easier analysis.
- Create interaction features between gender and level_of_education to study their combined effect on salary.
- Classify cities or states as urban or rural to analyze salary and job title distributions.


**Total Compensation Column** 

In [142]:
# Base salary plus additional compensation, both in USD.
# A missing value in either input propagates to the total as NaN.
base_salary = df['salary_usd']
extra_compensation = df['additional_compensation_usd']
df['total_compensation'] = base_salary.add(extra_compensation)

**Create Salary Bands**

In [143]:
# Bucket total compensation into labelled USD bands.
# Intervals are closed on the right, e.g. (25000, 50000].
bins = [0, 25000, 50000, 75000, 100000, 250000, 600000, float('inf')]
labels = ['$0-25k', '$25k-50k', '$50k-75k', '$75k-100k', '$100k-250k', '$250k-600k', 'Top 1%']
df['earning_band'] = pd.cut(
    df['total_compensation'],
    bins=bins,
    labels=labels,
    right=True,
    # Without this, a total of exactly 0 falls outside (0, 25000] and becomes NaN.
    include_lowest=True,
)

In [144]:
# Coarser, named earning classes derived from the same total compensation.
# Intervals are closed on the right, e.g. (25000, 65000].
bins = [0, 25000, 65000, 150_000, 250000, 600000, 1_000_000, float('inf')]
labels = ['Very Low', 'Low', 'Middle', 'Upper Middle', 'Upper', 'High Upper', 'High Networth']
df['earning_class'] = pd.cut(
    df['total_compensation'],
    bins=bins,
    labels=labels,
    right=True,
    # Without this, a total of exactly 0 falls outside (0, 25000] and becomes NaN.
    include_lowest=True,
)

**Transform Earning into Logarithmic Scale**

In [145]:
# log1p computes log(1 + x), so a total compensation of 0 maps to 0
# instead of hitting log(0).
df['log_total_earnings'] = df['total_compensation'].pipe(np.log1p)

**Convert Years of Experience and Years of Experience In Field into numerical values using midpoint**

In [146]:
# Function to convert experience ranges to midpoints
import re

# Matches integer or decimal numbers such as "5", "10" or "1.5".
_NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?')


def convert_to_midpoint(experience_range):
    """Convert a textual experience bracket into a single number of years.

    Closed ranges such as ``"5-7 years"`` or ``"8 - 10 years"`` map to the
    midpoint of their two bounds. Open-ended labels such as
    ``"1 year or less"`` and ``"41 years or more"`` map to the one bound
    they state.

    Parameters
    ----------
    experience_range : str or missing value
        The bracket label. ``None``/``NaN`` is passed through as ``NaN``.

    Returns
    -------
    float
        The representative number of years, or ``NaN`` for missing input.

    Raises
    ------
    ValueError
        If the label contains no number at all.
    """
    # Missing answers stay missing instead of raising a TypeError.
    if pd.isna(experience_range):
        return float('nan')

    text = str(experience_range)
    numbers = [float(token) for token in _NUMBER_PATTERN.findall(text)]
    if not numbers:
        raise ValueError(f"No numeric bound found in experience range: {experience_range!r}")

    # Open-ended labels ("... or less" / "... or more"), or a label with a
    # single number, have only one bound to report.
    if 'or' in text or len(numbers) == 1:
        return numbers[0]

    # Extracting the numbers directly makes the parse independent of spacing
    # around the dash and of "year" vs "years".
    lower_bound, upper_bound = numbers[0], numbers[1]
    return (lower_bound + upper_bound) / 2


In [147]:
# Derive a numeric column from each experience-bracket column.
# Maps source bracket column -> new numeric column.
midpoint_columns = {
    'years_of_experience': 'experience_yrs',
    'years_of_experience_in_field': 'experience_in_field_yrs',
}
for bracket_col, numeric_col in midpoint_columns.items():
    df[numeric_col] = pd.to_numeric(df[bracket_col].apply(convert_to_midpoint))

**Calculate the difference between years_of_experience and years_of_experience_in_field to see people who transitioned into new fields.**

In [148]:
# Overall experience minus in-field experience (both as midpoint years).
overall_years = df['experience_yrs']
in_field_years = df['experience_in_field_yrs']
df['transitioned'] = overall_years.sub(in_field_years)

In [149]:
# Shape of the subset whose in-field experience exceeds overall experience.
df.loc[df['transitioned'].lt(0)].shape

(241, 21)

It seems that some people have negative values in the transitioned column, which shouldn't be possible: experience in a field cannot exceed overall experience.
We can investigate further.

In [150]:
# Fixed seed so the same rows are shown on every run of the notebook.
df[df['transitioned'] < 0].sample(5, random_state=42)

Unnamed: 0,timestamp,age_range,industry,job_title,country,us_state,us_city,years_of_experience,years_of_experience_in_field,level_of_education,...,race,salary_usd,additional_compensation_usd,total_compensation,earning_band,earning_class,log_total_earnings,experience_yrs,experience_in_field_yrs,transitioned
25233,2021-05-06 10:29:09,35-44,Health care,Phlebotomy Supervisor,united states,Colorado,Fort Collins,8 - 10 years,11 - 20 years,Master's degree,...,White,58000.0,0.0,58000.0,$50k-75k,Low,10.97,9.0,15.5,-6.5
15001,2021-04-28 15:57:08,25-34,Engineering or Manufacturing,Project surveyor,united states,Maine,Portland,5-7 years,11 - 20 years,College degree,...,White,62400.0,0.0,62400.0,$50k-75k,Low,11.04,6.0,15.5,-9.5
7088,2021-04-27 14:25:27,25-34,Libraries,Emerging technologies librarian,united states,New Hampshire,Pelham,5-7 years,11 - 20 years,Master's degree,...,Native American or Alaska Native,55000.0,0.0,55000.0,$50k-75k,Low,10.92,6.0,15.5,-9.5
20811,2021-04-29 17:21:33,35-44,Government and Public Administration,Planner,canada,,Vancouver Island,5-7 years,8 - 10 years,Master's degree,...,White,70980.0,0.0,70980.0,$50k-75k,Middle,11.17,6.0,9.0,-3.0
450,2021-04-27 11:10:09,35-44,"Marketing, Advertising & PR",Brand Manager,united states,California,San Francisco,11 - 20 years,21 - 30 years,Some college,...,Another option not listed here or prefer not t...,140000.0,20000.0,160000.0,$100k-250k,Upper Middle,11.98,15.5,25.5,-10.0


We can assume some people mistook the two fields for each other and entered years_of_experience_in_field as years_of_experience and vice versa. To fix this, we take the absolute value of transitioned so that every negative value becomes positive.

In [151]:
def fix_transitioned(row):
    """Return a single ``transitioned`` value with a negative sign flipped.

    Values that are not below zero are returned unchanged.
    """
    if not row < 0:
        return row
    return row * -1

In [152]:
# Vectorized absolute value: negative differences become positive.
# Only the difference is changed; experience_yrs and experience_in_field_yrs
# keep the values as recorded.
df['transitioned'] = df['transitioned'].abs()

In [153]:
# Re-check: shape of the subset that still has a negative difference.
df.loc[df['transitioned'].lt(0)].shape

(0, 21)