In [22]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt,style

In [23]:
# Read the petrol consumption CSV from the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer="petrol_consumption.csv")

In [24]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
dataset.info()

# Keep the preview as the cell's last expression: a notebook only renders the
# value of the final expression, so head() on an earlier line is discarded.
dataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Petrol_tax                    48 non-null     float64
 1   Average_income                48 non-null     int64  
 2   Paved_Highways                48 non-null     int64  
 3   Population_Driver_licence(%)  48 non-null     float64
 4   Petrol_Consumption            48 non-null     int64  
dtypes: float64(2), int64(3)
memory usage: 2.0 KB
None


In [25]:
# Feature matrix: every column except the last one.
X = dataset.iloc[:, :-1]

# For demonstration, let's treat "Petrol_Consumption" as a categorical target:
# 1 when a row is strictly above the column median, 0 otherwise.
y = np.where(
    dataset['Petrol_Consumption'].gt(dataset['Petrol_Consumption'].median()),
    1,
    0,
)


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)



In [7]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 100 trees, fixed seed, out-of-bag scoring enabled.
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    oob_score=True,
)
classifier.fit(X_train, y_train)

In [8]:
from sklearn import metrics

# Score the fitted model on the same rows it was trained on.
y_pred_train = classifier.predict(X_train)

print('Train Accuracy:', metrics.accuracy_score(y_true=y_train, y_pred=y_pred_train))
print('Train Precision:', metrics.precision_score(y_true=y_train, y_pred=y_pred_train))
print('Train Recall:', metrics.recall_score(y_true=y_train, y_pred=y_pred_train))

Train Accuracy: 1.0
Train Precision: 1.0
Train Recall: 1.0


In [9]:
# Preview only the first rows of the training features instead of asking the
# notebook to render the whole frame.
X_train.head(10)

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%)
11,7.5,5126,14186,0.525
31,7.0,3333,6594,0.513
33,7.5,3357,4121,0.547
27,7.5,3846,9061,0.579
47,7.0,5002,9794,0.593
2,9.0,3865,1586,0.58
46,7.0,4296,4083,0.623
18,7.0,4716,5915,0.724
15,7.0,4318,10340,0.586
28,8.0,4188,5975,0.563


In [15]:
# Raw importance scores of the fitted forest, one value per feature column of
# the training matrix, in the same column order.
classifier.feature_importances_

array([0.22527314, 0.26766953, 0.16359547, 0.34346185])

In [26]:
# Pair every importance score with its feature name. The names are taken from
# X_train, the frame the classifier was fitted on, so names and scores are
# guaranteed to refer to the same columns in the same order.
pd.DataFrame({
    'Feature Name': X_train.columns,
    'Importance Score': classifier.feature_importances_,
})

Unnamed: 0,Feature Name,Importance Score
0,Petrol_tax,0.26898
1,Average_income,0.273738
2,Paved_Highways,0.112864
3,Population_Driver_licence(%),0.344418


In [28]:
# NOTE: `oob_score` (no trailing underscore) is the constructor argument, so
# this echoes the flag passed to RandomForestClassifier rather than the
# out-of-bag score; the score itself is the fitted attribute `oob_score_`.
classifier.oob_score # Out of Bag

True

In [29]:
# Fitted attribute (trailing underscore): the score of the training data
# obtained from the out-of-bag estimate.
classifier.oob_score_

0.7631578947368421

In [18]:
# The model is overfitted: training accuracy is 1.0, while the out-of-bag score is only about 0.76.

In [30]:
# Refit with min_samples_split=12 so that a node needs at least 12 samples
# before it may be split, which limits how closely the trees can fit the
# training rows.
classifier = RandomForestClassifier(n_estimators=100,random_state=0,oob_score=True,min_samples_split=12)
classifier.fit(X_train,y_train)

# Metrics on the rows the model was trained on.
y_pred_train = classifier.predict(X_train)
print('Train Accuracy:', metrics.accuracy_score(y_train, y_pred_train))
print('Train Precision:', metrics.precision_score(y_train, y_pred_train))
print('Train Recall:', metrics.recall_score(y_train, y_pred_train))

# Metrics on the held-out rows. These are labelled "Test" so they cannot be
# confused with the training metrics printed above.
y_pred = classifier.predict(X_test)
print('Test Accuracy:', metrics.accuracy_score(y_test, y_pred))
print('Test Precision:', metrics.precision_score(y_test, y_pred))
print('Test Recall:', metrics.recall_score(y_test, y_pred))

Train Accuracy: 0.9736842105263158
Train Precision: 0.9473684210526315
Train Recall: 1.0
Train Accuracy: 0.8
Train Precision: 1.0
Train Recall: 0.6666666666666666


In [31]:
# n_jobs=-1 uses all available processors to build the trees in parallel;
# verbose makes the estimator print progress messages while it fits.
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    oob_score=True,
    min_samples_split=12,
    n_jobs=-1,
    verbose=True,
)
classifier.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [None]:
# Conclusion
# These steps outline a complete workflow for a machine learning classification task, from data preparation to model evaluation.
# The process involves critical stages such as understanding the dataset, preparing the data for modeling, choosing an appropriate model, training the model,
# and evaluating its performance to ensure it can make accurate predictions on new, unseen data.