In [27]:
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import pickle

In [28]:
df = pd.read_csv("Resources/Salary_Data_Based_country_and_race.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race
0,0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,UK,White
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,USA,Hispanic
2,2,45.0,Male,PhD,Senior Manager,15.0,150000.0,Canada,White
3,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,USA,Hispanic
4,4,52.0,Male,Master's,Director,20.0,200000.0,USA,Asian


In [29]:
# Drop Unnamed: 0 column
df2 = df.drop(columns=['Unnamed: 0'])

# Rename columns
df2 = df2.rename(columns={'Education Level': 'Education', 
                   'Job Title': 'Job',})

df2.dropna(inplace=True)

# Rename phD to PhD
df2['Education'] = df2['Education'].replace('phD', 'PhD')                   
df2['Education'].unique()

df2

Unnamed: 0,Age,Gender,Education,Job,Years of Experience,Salary,Country,Race
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,UK,White
1,28.0,Female,Master's,Data Analyst,3.0,65000.0,USA,Hispanic
2,45.0,Male,PhD,Senior Manager,15.0,150000.0,Canada,White
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,USA,Hispanic
4,52.0,Male,Master's,Director,20.0,200000.0,USA,Asian
...,...,...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0,UK,Mixed
6700,32.0,Male,High School,Sales Associate,3.0,50000.0,Australia,Australian
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0,China,Chinese
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0,China,Korean


In [30]:
df2['Job'] = df2['Job'].replace('Senior', '', regex=True)
df2['Job'] = df2['Job'].replace('Junior', '', regex=True)
# df2['Job'].value_counts()

# Use for loop to replace all the values in job column with an integer value and append to a dictionary
job_dict = {}
value_mapping = []
for i, unique_value in enumerate(df2['Job'].unique()):
    job_dict[unique_value] = i + 1
    mapping = {'label': unique_value, 'value': i + 1}
    value_mapping.append(mapping)



        
print(value_mapping)

[{'label': 'Software Engineer', 'value': 1}, {'label': 'Data Analyst', 'value': 2}, {'label': ' Manager', 'value': 3}, {'label': 'Sales Associate', 'value': 4}, {'label': 'Director', 'value': 5}, {'label': 'Marketing Analyst', 'value': 6}, {'label': 'Product Manager', 'value': 7}, {'label': 'Sales Manager', 'value': 8}, {'label': 'Marketing Coordinator', 'value': 9}, {'label': ' Scientist', 'value': 10}, {'label': 'Software Developer', 'value': 11}, {'label': 'HR Manager', 'value': 12}, {'label': 'Financial Analyst', 'value': 13}, {'label': 'Project Manager', 'value': 14}, {'label': 'Customer Service Rep', 'value': 15}, {'label': 'Operations Manager', 'value': 16}, {'label': 'Marketing Manager', 'value': 17}, {'label': ' Engineer', 'value': 18}, {'label': 'Data Entry Clerk', 'value': 19}, {'label': 'Sales Director', 'value': 20}, {'label': 'Business Analyst', 'value': 21}, {'label': 'VP of Operations', 'value': 22}, {'label': 'IT Support', 'value': 23}, {'label': 'Recruiter', 'value': 

In [31]:
df2['Country'].unique()

array(['UK', 'USA', 'Canada', 'China', 'Australia'], dtype=object)

In [32]:
df2['Race'] = df2['Race'].replace('Black', 'African American')
df2['Race'] = df2['Race'].replace('Korean', 'Asian')
df2['Race'] = df2['Race'].replace('Chinese', 'Asian')
df2['Race'] = df2['Race'].replace('Welsh', 'White')
df2['Race'] = df2['Race'].replace('Australian', 'White')
df2['Race'].unique()   

array(['White', 'Hispanic', 'Asian', 'African American', 'Mixed'],
      dtype=object)

In [33]:

# make a list of the income categories converted to integers
education = {"Bachelor's": 1, "Master's": 2, 'PhD': 3, "Bachelor's Degree": 4, "Master's Degree": 5, 'High School':6}
country = {'UK': 1, 'USA': 2, 'Canada': 3, 'China': 4, 'Australia': 5}
race = {'White': 1, 'Hispanic': 2, 'Asian': 3, 'African American': 4, 'Mixed': 5}
gender = {'Female': 0, 'Male': 1}




df2['Education'] = df2['Education'].map(education)
df2['Country'] = df2['Country'].map(country)
df2['Gender'] = df2['Gender'].map(gender)
df2['Race'] = df2['Race'].map(race)
df2['Job'] = df2['Job'].map(job_dict)


df2.dropna(inplace=True)

df2.head()


Unnamed: 0,Age,Gender,Education,Job,Years of Experience,Salary,Country,Race
0,32.0,1.0,1,1,5.0,90000.0,1,1
1,28.0,0.0,2,2,3.0,65000.0,2,2
2,45.0,1.0,3,3,15.0,150000.0,3,1
3,36.0,0.0,1,4,7.0,60000.0,2,2
4,52.0,1.0,2,5,20.0,200000.0,2,3


In [34]:
df2.dtypes

Age                    float64
Gender                 float64
Education                int64
Job                      int64
Years of Experience    float64
Salary                 float64
Country                  int64
Race                     int64
dtype: object

In [35]:
# Split our preprocessed data into our features and target arrays
y = df2["Salary"].values
X = df2.drop(["Salary"],axis=1)

print(X.shape, y.shape)

(6684, 7) (6684,)


In [38]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler for ridge hyperparameter tuning
X_scaled = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)


print(X_train_scaled.shape)
print(y_train.shape)

(5347, 7)
(5347,)


In [18]:
# Find best alpha hyperparameter for Ridge Regression model
model = Ridge()
alphas = [0.1, 1.0, 10.0, 100.0, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
param_grid = {'alpha': alphas}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)
best_alpha = grid_search.best_params_['alpha']
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)

print("Best Alpha:", best_alpha)
print("R2 Score:", round(r2,2))

Best Alpha: 1.0
R2 Score: 0.72


In [19]:
# Model: Ridge Regression
model = Ridge(alpha=100)
model.fit(X_train_scaled, y_train)

# Predictions
y_pred_ridge = model.predict(X_test_scaled)

# Metrics
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression")
print("Mean Squared Error:", round(mse_ridge,2))
print("Mean Absolute Error:", round(mae_ridge,2))
print("R-squared:", round(r2_ridge,2))

Ridge Regression
Mean Squared Error: 776049003.5
Mean Absolute Error: 22224.69
R-squared: 0.72


In [20]:
# Model: Lasso Regression
lasso_model = Lasso(alpha=100) 
lasso_model.fit(X_train_scaled, y_train)

# Predict
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Metrics
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print("Lasso Regression")
print("Mean Squared Error:", round(mse_lasso,2))
print("Mean Absolute Error:", round(mae_lasso,2))
print("R-squared:", round(r2_lasso,2))

Lasso Regression
Mean Squared Error: 769339121.37
Mean Absolute Error: 21933.43
R-squared: 0.72


In [39]:
# Model: Decision Tree Regressor
param_grid = {'max_depth': [50, 100, 150, 200, 400, 500, 1000, None], 'min_samples_split': [2, 5, 10]}
decision_tree_model = DecisionTreeRegressor(random_state=78)
grid_search = GridSearchCV(decision_tree_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)


# Best parameters
best_max_depth = grid_search.best_params_['max_depth']
best_min_samples_split = grid_search.best_params_['min_samples_split']
best_model = grid_search.best_estimator_


# Predict
y_pred = best_model.predict(X_test_scaled)

# Metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Hyperparameters:")
print("Best max_depth:", best_max_depth)
print("Best min_samples_split:", best_min_samples_split)
print("\nDecision Tree Regressor")
print("Mean Squared Error:", round(mse,2))
print("Mean Absolute Error:", round(mae,2))
print("R-squared:", round(r2,2))

Best Hyperparameters:
Best max_depth: 50
Best min_samples_split: 5

Decision Tree Regressor
Mean Squared Error: 115721906.91
Mean Absolute Error: 3864.79
R-squared: 0.96


In [22]:
from sklearn.ensemble import RandomForestRegressor

# Define a smaller parameter grid to search over
param_grid = {
    'n_estimators': [400, 500, 700, 800, 900],  # Reduce the number of estimators
    'max_depth': [10,20,30, 40, 50],  # Reduce the number of max_depth values
    'min_samples_split': [45, 55, 65, 75, 85],  # Reduce the number of min_samples_split values
    'min_samples_leaf': [6, 7]  # Reduce the number of min_samples_leaf values
}

# Create a smaller dataset for faster training
# For example, use the first 1000 samples for training and testing
X_train_small = X_train_scaled[:1000]
y_train_small = y_train[:1000]
X_test_small = X_test_scaled[:1000]
y_test_small = y_test[:1000]

# Create a Random Forest Regressor
random_forest = RandomForestRegressor()

# Create GridSearchCV with the Random Forest Regressor and smaller parameter grid
grid_search = GridSearchCV(random_forest, param_grid, cv=3, scoring='neg_mean_squared_error')

# Fit the GridSearchCV to the smaller training data
grid_search.fit(X_train_small, y_train_small)

# Get the best hyperparameters and the best model from the grid search
forest_best_params = grid_search.best_params_
forest_best_model = grid_search.best_estimator_

# Predict using the best model
forest_y_pred = forest_best_model.predict(X_test_small)

# Metrics
forest_mse = mean_squared_error(y_test_small, forest_y_pred)
forest_mae = mean_absolute_error(y_test_small, forest_y_pred)
forest_r2 = r2_score(y_test_small, forest_y_pred)

# Print the best hyperparameters
print("Best Hyperparameters:")
print(forest_best_params)
print("Mean Squared Error:", round(forest_mse, 2))
print("Mean Absolute Error:", round(forest_mae, 2))
print("R-squared:", round(forest_r2, 2))

Best Hyperparameters:
{'max_depth': 10, 'min_samples_leaf': 6, 'min_samples_split': 45, 'n_estimators': 400}
Mean Squared Error: 272769285.09
Mean Absolute Error: 12394.62
R-squared: 0.9


In [23]:
forest_model = RandomForestRegressor(n_estimators=400,max_depth=50, min_samples_leaf=7, min_samples_split=45)
forest_model.fit(X_train_scaled, y_train)


y_forest = forest_model.predict(X_test_scaled)

mse_forest = mean_squared_error(y_test, y_forest)
mae_forest = mean_absolute_error(y_test, y_forest)
r2_forest = r2_score(y_test, y_forest)


print("Mean Squared Error:", round(mse_forest,2))
print("Mean Absolute Error:", round(mae_forest,2))
print("R-squared:", round(r2_forest,2))

Mean Squared Error: 120178362.47
Mean Absolute Error: 6905.99
R-squared: 0.96


In [40]:
# Save model and scaler with pickle
with open('scaler.pkl', 'wb') as f:
    pickle.dump(X_scaled, f)

with open("decision_tree_model.pkl", 'wb') as f:
    pickle.dump(best_model, f)