# Base Model
## 1.0. Random Forest Regressor

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import collections
from scipy import stats
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from sklearn.model_selection import train_test_split
from math import sqrt
from datetime import datetime, timedelta
# Import Decision Tree Classifier model from Scikit-Learn
from sklearn.tree import DecisionTreeClassifier
# Plot the trained Decision Tree
from sklearn.tree import plot_tree
# for plotting confusion matrix
from sklearn.metrics import confusion_matrix
from collections import Counter


from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn. ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC, SVR
from sklearn.model_selection import cross_val_score, cross_validate

In [2]:
dsdata = pd.read_csv('new_ds_salaries.csv')
ds_new = dsdata.copy()

# Exploring the numeric data in hr_data
ds_num = ds_new.select_dtypes(['int64', 'float64']).copy()
ds_num.head()


Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio,compare_avr_salary,experience_levelN,company_sizeN,employment_typeN,employee_continentN,company_continentN
0,2020,70000,79833,0,1,1,2,1,0,0
1,2020,260000,260000,0,1,2,0,1,1,1
2,2020,85000,109024,1,1,2,1,1,0,0
3,2020,20000,20000,0,0,1,0,1,2,2
4,2020,150000,150000,1,1,2,2,1,2,2


### 1.1. Define Model Inputs (X) & Output (y)

In [3]:
X = ds_num.drop(['salary_in_usd', 'work_year', 'salary'], axis=1)
y = ds_num['salary_in_usd']

### 1.2. Split data by 70% to 30% Training & Testing respectively

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

### 1.3. Create Random Forest Model

In [5]:
rf = RandomForestRegressor(n_estimators = 10, max_depth=40)
rf.fit(X_train, y_train)

### 1.4. Evaluating the Random Forest Model

In [6]:
# Training MSE
train_mse = sqrt(mean_squared_error(rf.predict(X_train), y_train))
print('Training Root Mean Squared Error is: ', train_mse)

# testing MSE
test_mse = sqrt(mean_squared_error(rf.predict(X_test), y_test))
print('Testing Root Mean Squared Error is: ',test_mse)

Training Root Mean Squared Error is:  45190.911490144936
Testing Root Mean Squared Error is:  44443.89265218362


In [7]:
train_R2 = rf.score(X_train, y_train)
print('Training R^2 value is: ', train_R2)
test_R2 = rf.score(X_test, y_test)
print('Testing R^2 value is: ', test_R2)

Training R^2 value is:  0.6212813974704114
Testing R^2 value is:  0.5272515139533904


### 1.5. Using K-Fold Cross Validation to see if it will improve the accuracy on the Random Forest Regressor model

In [8]:
# Evaluate the Model using K-folds Cross validation
results = cross_validate(rf, X, y, scoring='neg_mean_squared_error', cv=5, return_train_score = True)

In [9]:
print('Cross Validation Training Root mean_squared_error is:', sqrt(sum(-results['train_score'])/len(results['train_score'])))
print('Cross Validation Testing Root mean_squared_error is:', sqrt(sum(-results['test_score'])/len(results['test_score'])))

Cross Validation Training Root mean_squared_error is: 42638.84387526745
Cross Validation Testing Root mean_squared_error is: 59220.645767839094


### 1.6. Performing Gridsearch to select best Hyperparameters of Random Forest Model

In [10]:
param_grid = { "criterion" : ["squared_error", "absolute_error", "friedman_mse", "poisson"],
                 "min_samples_leaf" : [15, 20, 25],
                 "min_samples_split" : [16, 22, 26],
                 'max_depth' : [40,45,50]}
gs = GridSearchCV(rf, param_grid=param_grid, scoring='neg_mean_squared_error', cv= 10, n_jobs=-1)
gs = gs.fit(X, y)
print(gs.best_score_)
print(gs.best_params_)

-2544821731.788275
{'criterion': 'friedman_mse', 'max_depth': 50, 'min_samples_leaf': 15, 'min_samples_split': 26}


### 1.7. Create Random Forest Regressor Model with Best Hyperparameters

In [11]:
rf_new = RandomForestRegressor(criterion='squared_error', max_depth = 50, min_samples_leaf = 15, min_samples_split= 16)
rf_new.fit(X_train, y_train)

### 1.8. Evaluating improved Random Forest Model

In [12]:
# Training MSE
new_train_mse = sqrt(mean_squared_error(rf_new.predict(X_train), y_train))
print('Training Root Mean Squared Error is: ', train_mse)

# testing MSE
new_test_mse = sqrt(mean_squared_error(rf_new.predict(X_test), y_test))
print('Testing Root Mean Squared Error is: ',test_mse)

Training Root Mean Squared Error is:  45190.911490144936
Testing Root Mean Squared Error is:  44443.89265218362


In [13]:
train_R2_new = rf_new.score(X_train, y_train)
print('New Training R^2 value is: ', train_R2_new)
test_R2_new = rf_new.score(X_test, y_test)
print('New Testing R^2 value is: ', test_R2_new)

New Training R^2 value is:  0.4861162345237209
New Testing R^2 value is:  0.58160077492952


### 1.9. Using K-Fold Cross Validation to see if it will improve accuracy on improved Random Forest Model

In [14]:
# Evaluate the Model using K-folds Cross validation
new_results = cross_validate(rf_new, X, y, scoring='neg_mean_squared_error', cv=5, return_train_score = True)
print('New Train_score: ', new_results['train_score'])
print('New Test_score: ', new_results['test_score'])

New Train_score:  [-2.13614285e+09 -2.45104538e+09 -2.41874710e+09 -2.63213773e+09
 -2.41818320e+09]
New Test_score:  [-3.72659663e+09 -2.47121518e+09 -2.65110925e+09 -1.57682413e+09
 -2.35917302e+09]


In [15]:
print('Cross Validation Training Root mean_squared_error is:', sqrt(sum(-new_results['train_score'])/len(new_results['train_score'])))
print('Cross Validation Testing Root mean_squared_error is:', sqrt(sum(-new_results['test_score'])/len(new_results['test_score'])))

Cross Validation Training Root mean_squared_error is: 49104.49320777993
Cross Validation Testing Root mean_squared_error is: 50566.62575501459


### 1.10. Summary

In [16]:
print("**Current Model of Random Forest**")
print("RMSE Values")
print("---Before---")
# training and test mse
print('Training Root Mean Squared Error is: ', train_mse)
print('Testing Root Mean Squared Error is: ',test_mse)
# cross validation of train, test mse
print('Cross Validation Training Root mean_squared_error is:', sqrt(sum(-results['train_score'])/len(results['train_score'])))
print('Cross Validation Testing Root mean_squared_error is:', sqrt(sum(-results['test_score'])/len(results['test_score'])))
print()
print("---After---")
print('Training Root Mean Squared Error is: ', new_train_mse)
print('Testing Root Mean Squared Error is: ', new_test_mse)
print('Cross Validation Training Root mean_squared_error is:', sqrt(sum(-new_results['train_score'])/len(new_results['train_score'])))
print('Cross Validation Testing Root mean_squared_error is:', sqrt(sum(-new_results['test_score'])/len(new_results['test_score'])))


print()
print("Accuracy")
print("---Before---")
print('Training R^2 value is: ', train_R2)
print('Testing R^2 value is: ', test_R2)
print()
print("---After---")
print('New Training R^2 value is: ', rf_new.score(X_train, y_train))
print('New Testing R^2 value is: ', rf_new.score(X_test, y_test))

**Current Model of Random Forest**
RMSE Values
---Before---
Training Root Mean Squared Error is:  45190.911490144936
Testing Root Mean Squared Error is:  44443.89265218362
Cross Validation Training Root mean_squared_error is: 42638.84387526745
Cross Validation Testing Root mean_squared_error is: 59220.645767839094

---After---
Training Root Mean Squared Error is:  52641.133342280285
Testing Root Mean Squared Error is:  41811.18268637178
Cross Validation Training Root mean_squared_error is: 49104.49320777993
Cross Validation Testing Root mean_squared_error is: 50566.62575501459

Accuracy
---Before---
Training R^2 value is:  0.6212813974704114
Testing R^2 value is:  0.5272515139533904

---After---
New Training R^2 value is:  0.4861162345237209
New Testing R^2 value is:  0.58160077492952


### 1.11 Overview

- Earlier it can be seen that the cross validation of the RMSE is much higher and is not as accurate as it was before 
- After applying the best hyperparameters and refitting the Random Forest Regressor Model
- It can be seen that the RMSE has decreased and Training RMSE is much closer to Testing RMSE as compared to before

---
- Now Comparing the R^2 values before selecting the best hyperparameters, it seems that the R^2 value is not correctly reflecting as the `Training R^2 value` is much higher than the `Testing R^2 value`.

- After Selecting `Best Hyperparameters` and Applying `K-Fold Cross Validation`, we can see that the R^2 is more realistic now as the `Training R^2 value` is much lower than the `Testing R^2 value`. And the overall performance on the `Test R^2 value` has increased from selecting the `best hyperparameters` and using `K-Fold Cross Validation`