# `Random Forest`
- `Random Forest` is an ensemble model that trains many decision trees and combines their predictions: the trees' outputs are averaged for regression and voted on for classification to get the final prediction. The advantage of using `Random Forest` over a single `Decision Tree` is that it is more robust and less prone to overfitting.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.ensemble import RandomForestClassifier , RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , classification_report , accuracy_score
from sklearn.metrics import mean_squared_error , r2_score , mean_absolute_error 

In [2]:
# Load seaborn's "tips" example dataset into a DataFrame.
# NOTE(review): per the seaborn docs, load_dataset fetches the file online
# (with a local cache), so a fresh environment presumably needs network access.
df = sns.load_dataset(name='tips')

# Preview the first five rows to check the columns and their raw values.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Convert every object/category column of `df` to integer codes, in place,
# so that all columns are numeric before they are handed to the models.
categorical_cols = [
    name for name in df.columns
    if df[name].dtype in ('object', 'category')
]

for name in categorical_cols:
    # A fresh LabelEncoder per column: each column gets its own mapping and
    # the fitted encoder is discarded, so the original labels cannot be
    # recovered from the codes later on.
    df[name] = LabelEncoder().fit_transform(df[name])

# Preview the encoded frame.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [4]:
# --- Classification: predict the encoded `sex` column from all other columns ---

# Target is `sex`; the features are every remaining column.
y = df['sex']
X = df.drop(columns='sex')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(
    X ,
    y ,
    test_size = 0.2 ,
    random_state = 42 ,
)

# Build a forest of 200 shallow trees (depth <= 3) that split on entropy,
# and train it on the training portion (`fit` returns the fitted estimator).
rf = RandomForestClassifier(
    n_estimators=200 ,
    criterion='entropy' ,
    max_depth=3 ,
    random_state=42 ,
).fit(X_train , y_train)

# Predicted class labels for the held-out rows.
y_pred = rf.predict(X_test)

# Report the confusion matrix first, then the per-class precision/recall/F1
# table (which also includes overall accuracy).
for summarize in (confusion_matrix , classification_report):
    print(summarize(y_test , y_pred))

[[ 6 13]
 [ 3 27]]
              precision    recall  f1-score   support

           0       0.67      0.32      0.43        19
           1       0.68      0.90      0.77        30

    accuracy                           0.67        49
   macro avg       0.67      0.61      0.60        49
weighted avg       0.67      0.67      0.64        49



In [None]:
# --- Regression: predict the `tip` amount from all other columns ---

# Target is `tip`; the features are every remaining column.
y = df['tip']
X = df.drop(columns='tip')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(
    X ,
    y ,
    test_size = 0.2 ,
    random_state = 42 ,
)

# Train a 200-tree random forest regressor (`fit` returns the fitted estimator).
rf = RandomForestRegressor(n_estimators=200 , random_state=42).fit(X_train , y_train)

# Predicted tips for the held-out rows.
y_pred = rf.predict(X_test)

# Compute each error metric once; RMSE is derived from the stored MSE.
mse = mean_squared_error(y_test , y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test , y_pred)
r2 = r2_score(y_test , y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Error: {mae}')
print(f'R2 Score: {r2}')


Mean Squared Error: 0.9382071243994643
Root Mean Squared Error: 0.9686109251910513
Mean Absolute Error: 0.7700433673469339
R2 Score: 0.249417094281707
