***Random Forest Regression***  
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

We will be applying hyperparameter tuning in this code.

# **Data Preprocessing**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Datasets For Machine Learning/Salary Data (1).csv')
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


In [None]:
df.isnull().sum() #finding null values

Unnamed: 0,0
Age,2
Gender,2
Education Level,2
Job Title,2
Years of Experience,2
Salary,2


In [None]:
df['Age'] = df['Age'].fillna(df['Age'].mean())  #Handling null values
df['Gender'] = df['Gender'].fillna(df['Gender'].mode().iloc[0])
df['Education Level'] = df['Education Level'].fillna(df['Education Level'].mode().iloc[0])
df['Job Title'] = df['Job Title'].fillna(df['Job Title'].mode().iloc[0])
df['Years of Experience'] = df['Years of Experience'].fillna(df['Years of Experience'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

In [None]:
df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,0
Education Level,0
Job Title,0
Years of Experience,0
Salary,0


In [None]:
#Encoding categorical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
df['Gender'] = le.fit_transform(df['Gender']) # converting the categorical data into numerical
df['Education Level'] = le.fit_transform(df['Education Level'])
df['Job Title'] = le.fit_transform(df['Job Title'])

In [None]:
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,1,0,159,5.0,90000.0
1,28.0,0,1,17,3.0,65000.0
2,45.0,1,2,130,15.0,150000.0
3,36.0,0,0,101,7.0,60000.0
4,52.0,1,1,22,20.0,200000.0
...,...,...,...,...,...,...
370,35.0,0,0,131,8.0,85000.0
371,43.0,1,1,30,19.0,170000.0
372,29.0,0,0,70,2.0,40000.0
373,34.0,1,0,137,7.0,90000.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  375 non-null    float64
 1   Gender               375 non-null    int64  
 2   Education Level      375 non-null    int64  
 3   Job Title            375 non-null    int64  
 4   Years of Experience  375 non-null    float64
 5   Salary               375 non-null    float64
dtypes: float64(3), int64(3)
memory usage: 17.7 KB


In [None]:
import matplotlib.pyplot as plt #for plotting
import seaborn as sns

In [None]:
#Finding exact correlation values by descending order
correlations =df.corr(method='pearson')['Salary'].sort_values(ascending=False)
correlations

Unnamed: 0,Salary
Salary,1.0
Years of Experience,0.930338
Age,0.922335
Education Level,0.669389
Job Title,0.135585
Gender,0.070931


In [None]:
#Separating feature and Target
x = df.drop(columns=['Salary']) #feature
y = df['Salary']   #target

*Scaling is not needed*

In [None]:
#Scaling data
#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
#x = sc.fit_transform(x)
#x

In [None]:
#Splitting data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2) #20% data is for testing

In [None]:
print("train data size (features):",len(x_train))
print("train data size (target):",len(y_train))

print("test data size (features):",len(x_test))
print("test data size (target):",len(y_test))

train data size (features): 300
train data size (target): 300
test data size (features): 75
test data size (target): 75


# **Hyperparameter Tuning With CV**

In [None]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()

***Random Search CV***

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
n_estimators = [int(x) for x in np.linspace(start=100,stop=2000,num=10)]        # Number of trees in random forest

max_features = ['auto','sqrt','log2']                                           # Number of features to consider at every split

max_depth = [int(x) for x in np.linspace(10,1000,10)]                           # Maximum number of levels in tree

min_samples_split = [2,5,10,14]                                                 # Minimum number of samples requried to split a node

min_samples_leaf = [1,2,4,6,8]                                                  # Minimum number of samples required at each leaf node

random_grid = {'n_estimators':n_estimators,
               'max_features':max_features,
               'max_depth':max_depth,
               'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf,
               'criterion':['entropy','squared_error','poisson','absolute_error','friedman_mse']}

In [None]:
random_grid # It's a dictionary

{'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000],
 'min_samples_split': [2, 5, 10, 14],
 'min_samples_leaf': [1, 2, 4, 6, 8],
 'criterion': ['entropy',
  'squared_error',
  'poisson',
  'absolute_error',
  'friedman_mse']}

Random search doesn't check all the possible combinations. But it is much faster than grid search.  
We should first use random search to get a basic idea about the approximation of desired values.

In [None]:
rf_randomcv = RandomizedSearchCV(estimator=rfr,param_distributions=random_grid,
                           n_iter=100,cv=3,verbose=2,
                           random_state=100,n_jobs=-1)  #n_iter = 100 means the code will run for 100 iteration with 100 different combination of parameters
rf_randomcv.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


138 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

In [None]:
rf_randomcv.best_params_

{'n_estimators': 944,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 230,
 'criterion': 'friedman_mse'}

***Grid Search CV***

Checks all the possible combinations of hyperparameters.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
n_estimators = [int(x) for x in np.linspace(start=500,stop=1200,num=10)]        # Number of trees in random forest

max_features = ['log2']                                           # Number of features to consider at every split

max_depth = [int(x) for x in np.linspace(100,420,10)]                           # Maximum number of levels in tree

min_samples_split = [1,2,3,4,5]                                                 # Minimum number of samples requried to split a node

min_samples_leaf = [1,2,3,4]                                                  # Minimum number of samples required at each leaf node

random_grid = {'n_estimators':n_estimators,
               'max_features':max_features,
               'max_depth':max_depth,
               'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf,
               'criterion':['squared_error']}

In [None]:
grid_search = GridSearchCV(estimator=rfr,param_grid=random_grid,cv=10,n_jobs=-1,verbose=2) # n_jobs = -1 -> means all the cpu cores will be used for parallel processing
                                                                                           # cv=10: This specifies the number of folds to use for cross-validation
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 2000 candidates, totalling 20000 fits


4000 fits failed out of a total of 20000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4000 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError

In [None]:
grid_search.best_params_

{'criterion': 'squared_error',
 'max_depth': 100,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 655}

In [None]:
grid_search.best_estimator_

In [None]:
best_grid = grid_search.best_estimator_

In [None]:
best_grid # model with selected hyperparameter

# **Model Testing**

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

y_pred = best_grid.predict(x_test)  #best_grid is the best model now

In [None]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import math # for using sqrt function

In [None]:
R2 = r2_score(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
RMSE = math.sqrt(MSE)
MAE = mean_absolute_error(y_test,y_pred)

In [None]:
R2

0.9380136608725604

In [None]:
MSE

139245010.2365179

In [None]:
RMSE

11800.212296247806

In [None]:
MAE

7106.356012141493

The MAE is justified compared to the scale of the salary data.