<font size=6><b>Importing required packages:<b><font>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


<font size=5><b>Reading data from dataset:<b><font>

In [2]:
df=pd.read_csv("balance-scale.csv")

In [3]:
#dataset given:
df.head()


Unnamed: 0,Class,L-Weight,L-Distance,R-Weight,R-Distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [4]:
#basic statistical-summary of the entire data-set
df.describe()

Unnamed: 0,L-Weight,L-Distance,R-Weight,R-Distance
count,625.0,625.0,625.0,625.0
mean,3.0,3.0,3.0,3.0
std,1.415346,1.415346,1.415346,1.415346
min,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0
50%,3.0,3.0,3.0,3.0
75%,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625 entries, 0 to 624
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Class       625 non-null    object
 1   L-Weight    625 non-null    int64 
 2   L-Distance  625 non-null    int64 
 3   R-Weight    625 non-null    int64 
 4   R-Distance  625 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 24.5+ KB


In [6]:
df.shape

(625, 5)

In [7]:
#to check if there is any value missing in any of the column or row
df.isnull().sum()

Class         0
L-Weight      0
L-Distance    0
R-Weight      0
R-Distance    0
dtype: int64

In [8]:
cols = ['L-Weight','L-Distance','R-Weight','R-Distance']
X = df[cols] 
y = df.Class 

In [9]:
#dataset is splits into  train and test set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(437, 4) (188, 4) (437,) (188,)


<font size=6><b>DecisionTreeClassifier Algorithm:<b><font>


In [10]:
from sklearn.tree import DecisionTreeClassifier 

dtc=DecisionTreeClassifier()


In [11]:
 dtc.fit(X_train,y_train)

DecisionTreeClassifier()

In [12]:
y_prediction = dtc.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_prediction)

array([[ 0, 10,  4],
       [ 9, 77,  4],
       [10,  6, 68]], dtype=int64)

In [14]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_prediction))

Accuracy: 0.7712765957446809


<font size=6><b>HyperParameter Tuning:<b><font>
<br><font size=3><b>using GridSearchCV<b><font>

In [15]:
param_i={ "criterion":["gini","entropy"],
         "max_depth":[1,2,3,4,5,6,7,None]
}

In [16]:
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(dtc,param_grid=param_i,cv=10,n_jobs=-1)

In [17]:
grid.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, None]})

In [18]:
grid.best_estimator_

DecisionTreeClassifier(criterion='entropy')

In [19]:
grid.best_score_

0.7780655391120508

In [20]:
grid.best_params_

{'criterion': 'entropy', 'max_depth': None}

<h2><strong>RandomForestCL</strong></h2>

In [21]:
from sklearn.ensemble import RandomForestClassifier
rfc= RandomForestClassifier(n_estimators=300)
rfc.fit(X_train,y_train)

RandomForestClassifier(n_estimators=300)

In [22]:
rfc_pred = rfc.predict(X_test)

In [23]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [24]:
classification_report(y_test,rfc_pred)

'              precision    recall  f1-score   support\n\n           B       0.00      0.00      0.00        14\n           L       0.87      0.92      0.90        90\n           R       0.91      0.89      0.90        84\n\n    accuracy                           0.84       188\n   macro avg       0.60      0.61      0.60       188\nweighted avg       0.83      0.84      0.83       188\n'

In [25]:
confusion_matrix(y_test,rfc_pred)


array([[ 0,  9,  5],
       [ 5, 83,  2],
       [ 6,  3, 75]], dtype=int64)

In [26]:

print("Accuracy:",accuracy_score(y_test, y_prediction))

Accuracy: 0.7712765957446809


<font size=6><b>Random Search CV<b><font>


In [27]:
from sklearn.model_selection import RandomizedSearchCV

In [28]:
P_trials={'n_estimators':[100,200,300,500,1000],
                  'criterion':['gini','entropy'],
                  'max_depth': [2,3]}

search = RandomizedSearchCV(rfc, P_trials, n_iter=500, scoring='accuracy', n_jobs=-1, cv=10, random_state=1)
search_results=search.fit(X,y)

In [29]:
search_results.best_params_

{'n_estimators': 100, 'max_depth': 3, 'criterion': 'gini'}

In [30]:
search_results.cv_results_['params']

[{'n_estimators': 100, 'max_depth': 2, 'criterion': 'gini'},
 {'n_estimators': 200, 'max_depth': 2, 'criterion': 'gini'},
 {'n_estimators': 300, 'max_depth': 2, 'criterion': 'gini'},
 {'n_estimators': 500, 'max_depth': 2, 'criterion': 'gini'},
 {'n_estimators': 1000, 'max_depth': 2, 'criterion': 'gini'},
 {'n_estimators': 100, 'max_depth': 3, 'criterion': 'gini'},
 {'n_estimators': 200, 'max_depth': 3, 'criterion': 'gini'},
 {'n_estimators': 300, 'max_depth': 3, 'criterion': 'gini'},
 {'n_estimators': 500, 'max_depth': 3, 'criterion': 'gini'},
 {'n_estimators': 1000, 'max_depth': 3, 'criterion': 'gini'},
 {'n_estimators': 100, 'max_depth': 2, 'criterion': 'entropy'},
 {'n_estimators': 200, 'max_depth': 2, 'criterion': 'entropy'},
 {'n_estimators': 300, 'max_depth': 2, 'criterion': 'entropy'},
 {'n_estimators': 500, 'max_depth': 2, 'criterion': 'entropy'},
 {'n_estimators': 1000, 'max_depth': 2, 'criterion': 'entropy'},
 {'n_estimators': 100, 'max_depth': 3, 'criterion': 'entropy'},
 {'

In [31]:
search_results.best_score_

0.65284178187404

<font size=3><b>Observations and Problem faced: <br>
1.loading and displaying Balance Scale dataset.<br>
2. A function is used to perform the count, mean, min and max values and percentile functions which results to show us that all numerical values are approximately within the same range.<br>
3. After summarisinmg the dataset, we estimate the accuracy of models using statistical method that is splitting this dataset into two, 70% of which is used for training and evaluating and rest 30% is hold back as validation dataset.<br>
4.Predictions are made  by comparing the expected results in the validation set, then calculating classification accuracy and confusion matrix.<br>
5.After completion of the above procedure,we get an accuracy of 77.6%<br>
6.On applying hyperparametertuning for better accuracy, accuracy score obtained then becomes77.8%<br>
7. Applying Random Forest Classifier and then comparing it with descision tree model 
  same accuracy is obtained this is due to smaller dataset provided.<b><font>