In [95]:
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix

# Load Data and exploration

In [2]:
data=pd.read_excel('./DryBeanDataset/Dry_Bean_Dataset.xlsx')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 17 columns):
Area               13611 non-null int64
Perimeter          13611 non-null float64
MajorAxisLength    13611 non-null float64
MinorAxisLength    13611 non-null float64
AspectRation       13611 non-null float64
Eccentricity       13611 non-null float64
ConvexArea         13611 non-null int64
EquivDiameter      13611 non-null float64
Extent             13611 non-null float64
Solidity           13611 non-null float64
roundness          13611 non-null float64
Compactness        13611 non-null float64
ShapeFactor1       13611 non-null float64
ShapeFactor2       13611 non-null float64
ShapeFactor3       13611 non-null float64
ShapeFactor4       13611 non-null float64
Class              13611 non-null object
dtypes: float64(14), int64(2), object(1)
memory usage: 1.8+ MB


In [51]:
data.isna().sum()

Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRation       0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Solidity           0
roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
Class              0
dtype: int64

In [62]:
type(data.iloc[:,0].iloc[0]).__name__

'int64'

In [63]:
for col in data:
    print(col, type(data[col].iloc[0]).__name__)

Area int64
Perimeter float64
MajorAxisLength float64
MinorAxisLength float64
AspectRation float64
Eccentricity float64
ConvexArea int64
EquivDiameter float64
Extent float64
Solidity float64
roundness float64
Compactness float64
ShapeFactor1 float64
ShapeFactor2 float64
ShapeFactor3 float64
ShapeFactor4 float64
Class str


In [52]:
Y=data.Class
X=data.drop('Class',1)

In [54]:
Y.value_counts()/Y.shape[0]*100

DERMASON    26.052458
SIRA        19.366689
SEKER       14.892366
HOROZ       14.165014
CALI        11.975608
BARBUNYA     9.712732
BOMBAY       3.835133
Name: Class, dtype: float64

# Data preprocessing

In [64]:
ss=StandardScaler()
X_=ss.fit_transform(X)

In [65]:
pca=PCA()
X_=pca.fit_transform(X_)

In [66]:
xtrain,xtest,ytrain,ytest=train_test_split(X_,Y, test_size=0.2)

In [67]:
xtest.shape

(2723, 16)

In [68]:
xtrain.shape

(10888, 16)

# Algorithms

In [82]:
from sklearn.model_selection import GridSearchCV

In [78]:
ada=AdaBoostClassifier(n_estimators=300)
lgr=LogisticRegression()

In [84]:
param_grid={'n_estimators':[200,300],'max_depth':[5,10,20]}
rf=RandomForestClassifier(n_estimators=2,max_depth=15,n_jobs=-1)

In [85]:
gridcv=GridSearchCV(estimator=rf,param_grid=param_grid,n_jobs=-1)

In [86]:
gridcv.fit(xtrain,ytrain)

GridSearchCV(estimator=RandomForestClassifier(max_depth=15, n_estimators=2,
                                              n_jobs=-1),
             n_jobs=-1,
             param_grid={'max_depth': [5, 10, 20], 'n_estimators': [200, 300]})

In [87]:
gridcv.score(xtrain,ytrain)

0.9985304922850845

In [88]:
gridcv.score(xtest,ytest)

0.9291222915901579

In [93]:
lgr.fit(xtrain,ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [94]:
lgr.score(xtest,ytest)

0.9287550495776716

In [99]:
lgr.classes_

array(['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA'],
      dtype=object)

In [96]:
ypred=lgr.predict(xtest)

In [100]:
confusion_matrix(y_true=ytest,y_pred=ypred,normalize='true').diagonal()

array([0.90974729, 1.        , 0.95950156, 0.93304843, 0.956743  ,
       0.95577396, 0.85877863])