In [27]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import re
import csv

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau

In [28]:
# Load the job-postings table from disk.
# NOTE(review): the frame displayed below contains an 'Unnamed: 0' column, which
# suggests the CSV was written together with its index; consider index_col=0
# once it is confirmed that no later cell relies on that column.
df = pd.read_csv(filepath_or_buffer='final_df.csv')

In [29]:
# Split 'per_hour_or_year' into two columns:
#   rate_type -> 'yr' or 'hr'
#   estimator -> 'Employer' or 'Glassdoor'
# The pattern expects text shaped like "/yr (Employer est.)". The dot after
# "est" is escaped so it matches a literal period instead of any character.
# Rows that do not match the pattern get NaN in both new columns.
df[['rate_type', 'estimator']] = df['per_hour_or_year'].str.extract(
    r'/(yr|hr) \((Employer|Glassdoor) est\.\)'
)
# Drop the original column now that it has been split
df = df.drop(columns=['per_hour_or_year'])
df

Unnamed: 0,employer,employer_rating,sector,industry,employer_founded,employer_revenue,job_title,location,min_salary,max_salary,average_base_salary,rate_type,estimator
0,Parsons,4.0,Information Technology,Information Technology Support Services,1944.0,$1 to $5 billion (USD),Data Scientist,Remote,100K,176K,"$138,050",yr,Employer
1,TriNet,3.7,Management & Consulting,Business Consulting,1988.0,$1 to $5 billion (USD),Data Scientist,Remote,76K,182K,"$129,200",yr,Employer
2,Cotiviti,3.7,Information Technology,Information Technology Support Services,1979.0,$500 million to $1 billion (USD),Data Scientist I,Remote,93K,109K,"$101,000",yr,Employer
3,Alliant Credit Union,3.3,Financial Services,Banking & Lending,1935.0,$5 to $25 million (USD),Data Scientist - Hybrid,"Chicago, IL",115K,142K,"$127,549",yr,Glassdoor
4,"NextDeavor, Inc.",4.5,,,,Unknown / Non-Applicable,"Data Scientist, Product Insights","San Francisco, CA",90.00,105.63,$97.81,hr,Employer
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1866,Plaid,4.1,Information Technology,Enterprise Software & Network Solutions,2012.0,Unknown / Non-Applicable,Experienced Machine Learning Engineer - Credit,"San Francisco, CA",226K,319K,"$272,520",yr,Employer
1867,Apple,4.2,Information Technology,Computer Hardware Development,1976.0,$10+ billion (USD),AIML - Senior Data Infrastructure Software Eng...,"Cupertino, CA",200K,364K,"$281,950",yr,Employer
1868,Madhive,3.7,Information Technology,Enterprise Software & Network Solutions,2017.0,Unknown / Non-Applicable,"Engineering Manager, Machine Learning",Remote,240K,300K,"$270,000",yr,Employer
1869,TRICORPS SECURITY,3.4,Management & Consulting,Security & Protective,,Unknown / Non-Applicable,Artificial Intelligence Implementation Specialist,"Oklahoma City, OK",50K,50K,"$50,000",yr,Employer


In [30]:
# Check how many rows are quoted with an hourly rate
is_hourly = df['rate_type'] == 'hr'
df[is_hourly]

Unnamed: 0,employer,employer_rating,sector,industry,employer_founded,employer_revenue,job_title,location,min_salary,max_salary,average_base_salary,rate_type,estimator
4,"NextDeavor, Inc.",4.5,,,,Unknown / Non-Applicable,"Data Scientist, Product Insights","San Francisco, CA",90.00,105.63,$97.81,hr,Employer
9,Agile Tech Labs,5.0,,,,Unknown / Non-Applicable,Sr. Data Analyst,Remote,55.00,60.00,$57.50,hr,Employer
26,Meridiansoft INC,4.1,,,,,DATA Science/ Data Scientist,"Phoenix, AZ",30.00,45.22,$37.61,hr,Employer
39,Meritor ITconsulting,4.8,,,,,AI/ML Engineer,Remote,60.00,60.00,$60.00,hr,Employer
54,Data Inc,3.9,Information Technology,Information Technology Support Services,1983.0,$25 to $100 million (USD),Senior Data Analyst,"New York, NY",60.00,70.00,$65.00,hr,Employer
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1557,Pinnacle Technical Resources,3.7,,,,$1 to $5 million (USD),AI Business Systems Analyst,Alaska,40.00,45.00,$42.50,hr,Employer
1558,Capitaltechsolutions Inc,3.2,,,,Unknown / Non-Applicable,5+ Years in Research/Data Analyst Senior- Atla...,"Atlanta, GA",45.00,50.00,$47.50,hr,Employer
1574,Ehub Global solution,3.5,,,,Unknown / Non-Applicable,Data Scientist,"Austin, TX",28.96,55.28,$42.12,hr,Employer
1589,Zonestra Technologies llc,3.0,,,,,"AI Ops Engineer/Software Engineer (AI, Python,...","Moline, IL",40.00,45.80,$42.90,hr,Employer


In [31]:
# Function to convert min_salary and max_salary into a single format with an annual rate
def clean_and_convert(value, rate_type, hours_per_month=160, months_per_year=12):
    """Convert a raw salary bound to a yearly amount as a float.

    Parameters
    ----------
    value : str
        Raw salary text. For ``rate_type == 'yr'`` it is a number of thousands
        with a ``K`` suffix (e.g. ``'100K'``); for ``rate_type == 'hr'`` it is
        a plain hourly amount (e.g. ``'90.00'``).
    rate_type : str
        ``'yr'`` for yearly figures, ``'hr'`` for hourly figures.
    hours_per_month : int or float, default 160
        Working hours per month used to annualize hourly figures.
    months_per_year : int or float, default 12
        Months per year used to annualize hourly figures.

    Returns
    -------
    float
        The yearly amount, or NaN when ``value`` is missing or ``rate_type``
        is neither ``'yr'`` nor ``'hr'``.

    Notes
    -----
    The defaults give 160 * 12 = 1,920 hours per year.
    NOTE(review): a common full-time convention is 2,080 hours (40 h x 52
    weeks) - confirm which figure is intended.
    """
    # A missing salary cannot be parsed; propagate it as NaN instead of failing
    # on `.replace` / `float()`.
    if pd.isna(value):
        return float('nan')

    if rate_type == 'yr':
        # '100K' -> 100.0 -> 100000.0
        return float(value.replace('K', '')) * 1000

    if rate_type == 'hr':
        # Same left-to-right multiplication order as the original `* 160 * 12`.
        return float(value) * hours_per_month * months_per_year

    # Unknown or missing rate type: return an explicit NaN rather than an
    # implicit None so the resulting column stays numeric.
    return float('nan')

In [32]:
# Annualize both salary bounds row by row, using each row's rate_type.
# Not idempotent: clean_and_convert calls `.replace` on yearly values, so this
# cell expects the raw string columns and should run once per loaded frame.
for salary_col in ('min_salary', 'max_salary'):
    annualized = df.apply(
        lambda record: clean_and_convert(record[salary_col], record['rate_type']),
        axis=1,
    )
    df[salary_col] = annualized

In [34]:
# Function to convert average_base_salary into a single format with an annual rate
def clean_and_convert_average_base(value, rate_type, hours_per_month=160, months_per_year=12):
    """Convert a raw average base salary to a yearly amount as a float.

    Parameters
    ----------
    value : str
        Raw salary text with a leading ``$`` and optional thousands separators,
        e.g. ``'$138,050'`` (yearly) or ``'$97.81'`` (hourly).
    rate_type : str
        ``'yr'`` for yearly figures, ``'hr'`` for hourly figures.
    hours_per_month : int or float, default 160
        Working hours per month used to annualize hourly figures.
    months_per_year : int or float, default 12
        Months per year used to annualize hourly figures.

    Returns
    -------
    float
        The yearly amount, or NaN when ``value`` is missing or ``rate_type``
        is neither ``'yr'`` nor ``'hr'``.

    Notes
    -----
    The defaults give 160 * 12 = 1,920 hours per year.
    NOTE(review): a common full-time convention is 2,080 hours (40 h x 52
    weeks) - confirm which figure is intended.
    """
    # A missing salary cannot be parsed; propagate it as NaN instead of failing
    # on `.replace`.
    if pd.isna(value):
        return float('nan')

    if rate_type not in ('yr', 'hr'):
        # Unknown or missing rate type: explicit NaN rather than implicit None.
        return float('nan')

    # Strip the currency symbol and thousands separators for both rate types,
    # so an hourly value such as '$1,250.50' parses as well.
    amount = float(value.replace('$', '').replace(',', ''))

    if rate_type == 'yr':
        return amount
    # Same left-to-right multiplication order as the original `* 160 * 12`.
    return amount * hours_per_month * months_per_year

In [35]:
# Annualize average_base_salary row by row, using each row's rate_type.
# Not idempotent: the converter calls `.replace` on the value, so this cell
# expects the raw string column and should run once per loaded frame.
df['average_base_salary'] = df.apply(
    lambda record: clean_and_convert_average_base(
        record['average_base_salary'], record['rate_type']
    ),
    axis=1,
)

In [37]:
# rate_type is no longer needed: the salary columns were converted to yearly amounts above
df = df.drop('rate_type', axis=1)

In [40]:
# Persist the transformed frame to CSV without writing the row index
df.to_csv(path_or_buf='transformed_df.csv', index=False)

In [41]:
# Reload the transformed data from the CSV written above and display it
df = pd.read_csv(filepath_or_buffer='transformed_df.csv')
df

Unnamed: 0,employer,employer_rating,sector,industry,employer_founded,employer_revenue,job_title,location,min_salary,max_salary,average_base_salary,estimator
0,Parsons,4.0,Information Technology,Information Technology Support Services,1944.0,$1 to $5 billion (USD),Data Scientist,Remote,100000.0,176000.0,138050.0,Employer
1,TriNet,3.7,Management & Consulting,Business Consulting,1988.0,$1 to $5 billion (USD),Data Scientist,Remote,76000.0,182000.0,129200.0,Employer
2,Cotiviti,3.7,Information Technology,Information Technology Support Services,1979.0,$500 million to $1 billion (USD),Data Scientist I,Remote,93000.0,109000.0,101000.0,Employer
3,Alliant Credit Union,3.3,Financial Services,Banking & Lending,1935.0,$5 to $25 million (USD),Data Scientist - Hybrid,"Chicago, IL",115000.0,142000.0,127549.0,Glassdoor
4,"NextDeavor, Inc.",4.5,,,,Unknown / Non-Applicable,"Data Scientist, Product Insights","San Francisco, CA",172800.0,202809.6,187795.2,Employer
...,...,...,...,...,...,...,...,...,...,...,...,...
1866,Plaid,4.1,Information Technology,Enterprise Software & Network Solutions,2012.0,Unknown / Non-Applicable,Experienced Machine Learning Engineer - Credit,"San Francisco, CA",226000.0,319000.0,272520.0,Employer
1867,Apple,4.2,Information Technology,Computer Hardware Development,1976.0,$10+ billion (USD),AIML - Senior Data Infrastructure Software Eng...,"Cupertino, CA",200000.0,364000.0,281950.0,Employer
1868,Madhive,3.7,Information Technology,Enterprise Software & Network Solutions,2017.0,Unknown / Non-Applicable,"Engineering Manager, Machine Learning",Remote,240000.0,300000.0,270000.0,Employer
1869,TRICORPS SECURITY,3.4,Management & Consulting,Security & Protective,,Unknown / Non-Applicable,Artificial Intelligence Implementation Specialist,"Oklahoma City, OK",50000.0,50000.0,50000.0,Employer
