In [1]:
# Import required packages 

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [2]:
# Import database
df = pd.read_csv('salary_data_cleaned.csv')

In [3]:
# Columns selection
df_model = df[['avg_salary','Rating','Size','Type of ownership','Industry','Sector','Revenue','num_comp','hour','employer_provided',
             'job_state','same_state','age','python','spark','aws','excel','job_simp','seniority','desc_len']]

In [4]:
# Get a dummy of each column and catenate it
df_dummies = pd.get_dummies(df_model,drop_first=True)
df_dummies.shape

(742, 170)

In [5]:
# Select the target and the rest of features
features= df_dummies.drop('avg_salary', axis =1)
target = df_dummies.avg_salary.values

In [6]:
# Import split tool 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [7]:
# Import Decision Tree Regressor model
from sklearn.tree import DecisionTreeRegressor

# Import mean absolute error metric
from sklearn.metrics import mean_absolute_error

for i in range(1,31):
    regressor = DecisionTreeRegressor(max_depth=i, random_state=42)
    regressor.fit(X_train, y_train)
    train_predict = regressor.predict(X_train)
    print('The mean absoluteerror for train max_depth {} is:'.format(i), mean_absolute_error(y_train,train_predict))
    test_predict = regressor.predict(X_test)
    print('The mean absoluteerror for test max_depth {} is:'.format(i), mean_absolute_error(y_test,test_predict))
    print('')

The mean absoluteerror for train max_depth 1 is: 27.66967265449559
The mean absoluteerror for test max_depth 1 is: 30.196354540314275

The mean absoluteerror for train max_depth 2 is: 25.4275132230183
The mean absoluteerror for test max_depth 2 is: 27.60886868838114

The mean absoluteerror for train max_depth 3 is: 23.851000024470647
The mean absoluteerror for test max_depth 3 is: 25.600744509597735

The mean absoluteerror for train max_depth 4 is: 21.383386913842795
The mean absoluteerror for test max_depth 4 is: 23.705910023214475

The mean absoluteerror for train max_depth 5 is: 19.399132541833417
The mean absoluteerror for test max_depth 5 is: 21.871782573549343

The mean absoluteerror for train max_depth 6 is: 16.97918721534741
The mean absoluteerror for test max_depth 6 is: 18.932433160887577

The mean absoluteerror for train max_depth 7 is: 14.34653858247407
The mean absoluteerror for test max_depth 7 is: 17.915728750302023

The mean absoluteerror for train max_depth 8 is: 11.83

In [8]:
for i in range(1,100,2):
    regressor = DecisionTreeRegressor(max_depth=25, min_samples_leaf=i, random_state=42)
    regressor.fit(X_train, y_train)
    train_predict = regressor.predict(X_train)
    print('The mean absoluteerror for train min_samples_leaf {} is:'.format(i), mean_absolute_error(y_train,train_predict))
    test_predict = regressor.predict(X_test)
    print('The mean absoluteerror for test min_samples_leaf {} is:'.format(i), mean_absolute_error(y_test,test_predict))
    print('')

The mean absoluteerror for train min_samples_leaf 1 is: 0.04384485666104553
The mean absoluteerror for test min_samples_leaf 1 is: 8.333333333333334

The mean absoluteerror for train min_samples_leaf 3 is: 7.3267847105115225
The mean absoluteerror for test min_samples_leaf 3 is: 14.463199105145412

The mean absoluteerror for train min_samples_leaf 5 is: 12.23715235953853
The mean absoluteerror for test min_samples_leaf 5 is: 17.53412432086929

The mean absoluteerror for train min_samples_leaf 7 is: 14.55768427281918
The mean absoluteerror for test min_samples_leaf 7 is: 19.477916006439496

The mean absoluteerror for train min_samples_leaf 9 is: 16.11203885074905
The mean absoluteerror for test min_samples_leaf 9 is: 19.850721476619626

The mean absoluteerror for train min_samples_leaf 11 is: 17.063480984278385
The mean absoluteerror for test min_samples_leaf 11 is: 21.42770989543929

The mean absoluteerror for train min_samples_leaf 13 is: 17.675419184800823
The mean absoluteerror for 

In [9]:
from sklearn.model_selection import GridSearchCV
DTR = DecisionTreeRegressor(random_state=42)

params = {
    'max_depth': [2, 3, 5, 10, 20, 25],
    'min_samples_leaf': [5, 10, 20, 50, 100]
}

grid_search = GridSearchCV(estimator=DTR, 
                           param_grid=params, 
                           cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_absolute_error')

grid_search.fit(X_train, y_train)
# print best parameter after tuning
print(grid_search.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid_search.best_estimator_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


{'max_depth': 10, 'min_samples_leaf': 5}
DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    3.9s finished


In [10]:
test_grid_predictions = grid_search.predict(X_test)
test_grid_mae = mean_absolute_error(y_test, test_grid_predictions)
print(test_grid_mae)

18.088465281921657


### Random Forest Regressor

In [11]:
from sklearn.ensemble import RandomForestRegressor

# random forest for making predictions for regression
from sklearn.ensemble import RandomForestRegressor

# define the model
RFR = RandomForestRegressor()
# fit the model on the whole dataset
RFR.fit(X_train, y_train)

# make prediction

test_predictions = RFR.predict(X_test)
test_mae = mean_absolute_error(y_test, test_predictions)
print(test_mae)

11.728187919463087


### Tuning the Random Forest

In [28]:
# Number of trees in random forest
n_estimators = np.linspace(100, 500, 50, dtype=int) 

# Maximum number of levels in tree
max_depth = [1, 5, 10, 20]

# Minimum number of samples required to split a node
min_samples_split = [0.2, 0.4, 0.6]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [29]:
from sklearn.model_selection import GridSearchCV

rfr_base = RandomForestRegressor()

grid = GridSearchCV(rfr_base, param_grid, refit = True, verbose = 3, 
                    scoring='neg_mean_absolute_error', n_jobs=-1)
 
# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4800 candidates, totalling 14400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 2040 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 3864 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 4600 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 5400 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 6264 tasks      | elapsed: 23.8min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | e

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_stat...
                         'min_samples_split': [0.2, 0.4, 0.6],
                         'n_estimators': array([100, 108, 116, 124, 132, 140, 148, 157, 165, 173, 181, 18

In [30]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 0.2, 'n_estimators': 483}
RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=0.2,
                      min_weight_fraction_leaf=0.0, n_estimators=483,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)


In [31]:
predictions = grid.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(test_grid_mae)

18.088465281921657
