In [1]:
# Install/upgrade the third-party packages this notebook needs, then wipe the
# verbose pip output from the cell.
from IPython.display import clear_output
# NOTE(review): versions are not pinned, so a re-run may pick up newer releases
# with different behaviour.
!pip3 install -U lazypredict
!pip3 install -U pandas
clear_output()

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import OneHotEncoder
import time
import warnings

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and fit-failure warnings from the libraries used below.
warnings.filterwarnings('ignore')

## Load data

In [3]:
# Load the Dry Bean dataset from the Kaggle input directory and preview
# its first ten rows.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/dry-bean-dataset/Dry_Bean_Dataset.csv')
data.head(n=10)

In [4]:
# Report the size and completeness of the loaded frame: row count, column
# count, number of non-null cells and number of missing cells.
print('Number of rows in train data: {}'.format(data.shape[0]))
# Fixed a stray leading "m" in this label.
print('Number of columns in train data: {}'.format(data.shape[1]))
print('Number of values in train data: {}'.format(data.count().sum()))
# .sum().sum() totals the per-column missing counts without a Python-level loop.
print('Number missing values in train data: {}'.format(data.isna().sum().sum()))

In [5]:
# Summary statistics for every column except the last one (presumably the
# 'Class' label), one row per feature, ordered by decreasing standard
# deviation. A colour gradient plus in-cell bars on "max" and "mean" make the
# very different feature scales easy to spot.
(
    data.iloc[:, :-1]
    .describe()
    .T
    .sort_values(by='std', ascending=False)
    .style
    .background_gradient(cmap='GnBu')
    .bar(subset=["max"], color='red')
    .bar(subset=["mean"], color='green')
)

## Exploratory data analysis

### Continuous and Categorical Data Distribution

The data set consists predominantly of continuous features.

In [6]:
# List the dtype of every column to see which are numeric and which are not.
print(data.dtypes)

### Target distribution

Look at the unique output classes. There are 7 unique output classes, and they are not equally represented.

In [7]:
# Count the samples per bean class and plot the counts as an outlined bar chart.
target_df = data["Class"].value_counts().to_frame().reset_index()
target_df.columns = ["Class", 'Count']

fig = (
    px.bar(data_frame=target_df, x="Class", y='Count')
    .update_traces(marker_line_color='rgb(0,0,0)', marker_line_width=2)
    .update_layout(title="Target Class Distribution",
                   template="plotly_white",
                   title_x=0.5)
)

fig.show()

### Correlation Matrix

In [8]:
# Correlation heatmap of the numeric features.
# The string-valued 'Class' column is excluded explicitly: pandas >= 2.0 no
# longer drops non-numeric columns silently in DataFrame.corr() and raises on
# them instead, and this notebook installs the latest pandas.
fig = px.imshow(data.select_dtypes(include='number').corr(),
                text_auto=True, aspect="auto", color_continuous_scale="viridis")
fig.show()

### Encoding Categorical Features using Ordinal Encoding

In [9]:
# Map each bean class name to an integer code (1..7) and store the codes in a
# new 'Class_y' column next to the original string label.
bean_class_dict = dict(zip(
    ['SEKER', 'BARBUNYA', 'BOMBAY', 'CALI', 'HOROZ', 'SIRA', 'DERMASON'],
    range(1, 8),
))
data['Class_y'] = data['Class'].map(bean_class_dict)

data.head()

Observe if the attributes exhibit Gaussian Distribution and the correlations across the attributes.

In [10]:
# One kernel-density subplot per column on a 5x4 grid; axes are not shared
# because the columns live on very different scales.
data.plot.density(subplots=True, layout=(5, 4), sharex=False, legend=False, fontsize=1)
plt.show()

## Train-Test Data Split

Identify the best performing classifier model using the data set without pre-processing. Split the data into train and test sets in a ratio of 80:20.

In [11]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the integer-encoded target.
train_X = data.drop(columns=['Class', 'Class_y'])
train_Y = data['Class_y']

# Hold out 20% of the rows for testing. This was 0.25, which contradicted the
# 80:20 ratio stated in the text and the 0.20 used when the model is retrained
# on the outlier-trimmed data, making the before/after comparison uneven.
train_x, test_x, train_y, test_y = train_test_split(train_X, train_Y, test_size=0.20, random_state=21)

# Shapes of the train and test splits.
train_x.shape, test_x.shape, train_y.shape, test_y.shape

## Baseline model training

In [12]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [13]:
# Fit every classifier LazyPredict knows about on the raw (unscaled) features
# to get a quick baseline ranking. predictions=True makes fit() also return
# the per-model predictions alongside the score table.
clf = LazyClassifier(verbose=0,
                     ignore_warnings=True,
                     custom_metric=None,
                     predictions=True,
                     random_state=12,
                     classifiers='all')

# `models` holds the score table; `predictions` holds each model's predictions.
models, predictions = clf.fit(train_x, test_x, train_y, test_y)

In [14]:
# Display the LazyPredict score table returned by clf.fit().
models

### Training LightGBM (LGBMClassifier) with stratified k-fold cross-validation

In [18]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

In [None]:
# Exhaustive grid search over LightGBM hyper-parameters, scored by accuracy
# with stratified 10-fold cross-validation on the training split.
k = 10
skfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=21)
# NOTE(review): LightGBM documents that 'rf' boosting needs bagging settings
# (bagging_freq / bagging_fraction); confirm these candidates really fit
# instead of silently scoring NaN while warnings are suppressed.
param_grid = {'boosting_type': ['gbdt', 'dart', 'goss', 'rf'],
              'learning_rate': [0.2, 0.15, 0.1, 0.05],
              'n_estimators': [50, 100, 200, 300, 400]}

model = LGBMClassifier(random_state=21)
# 80 candidates x 10 folds = 800 fits. n_jobs=-1 runs them on all cores, the
# same setting the pipeline grid search later in this notebook uses.
grid = GridSearchCV(model, param_grid=param_grid, cv=skfold, scoring='accuracy', n_jobs=-1)
start = time.time()
grid_result = grid.fit(train_x, train_y)
end = time.time()

# Print the mean and standard deviation of the CV accuracy for every candidate.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

print("Best: %f using %s (run time :%f)" % (grid_result.best_score_, grid_result.best_params_, end-start))

In [None]:
# Result recorded from an earlier run of the grid search, kept for reference:
# Best: 0.929860 using {'boosting_type': 'gbdt', 'learning_rate': 0.05, 'n_estimators': 100} (run time :2559.013288)
# Apply the winning hyper-parameters and refit on the whole training split.
model.set_params(**grid.best_params_).fit(train_x, train_y)
print("Model training completed")

In [None]:
# Score the tuned model on the held-out test split: overall accuracy followed
# by the per-class precision / recall / F1 report.
predict_test = model.predict(test_x)
print("Accuracy score {:.4f}".format(accuracy_score(y_true=test_y, y_pred=predict_test)))
print(classification_report(y_true=test_y, y_pred=predict_test))

In [None]:
# Confusion matrix of true class codes (first argument) against predicted
# class codes for the test split.
print(confusion_matrix(test_y, predict_test))

## Handling Outliers

We can see a few cases of outliers from the dataset using box plots. One example is the outlier values of AREA, grouped by the classes.

In [None]:
# Box plot of 'Area' for each bean class, drawn from the untrimmed data.
data.boxplot(column=['Area'], by=['Class'], figsize=(12, 8))

Group the data according to the output classes and identify the outliers for the 'Area', 'AspectRation' and 'Eccentricity' attributes by calculating their z-scores within each class. We drop the rows where the z_score > 3 or z_score < -3, applying the three filters one after another. Then we retrain the model and see if the output accuracy improves.

In [None]:
from scipy import stats


def _drop_zscore_outliers(frame, column, threshold=3):
    """Return `frame` without the rows whose |z-score| in `column` exceeds `threshold`.

    The z-scores are computed over the rows of `frame` only, so calling this
    on a single-class subset gives class-relative outlier detection.
    """
    z_scores = np.abs(np.asarray(stats.zscore(frame[column])))
    # `~(z > t)` rather than `z <= t`, so rows with a NaN z-score are kept,
    # exactly as the original comparison did.
    return frame[~(z_scores > threshold)]


# Columns screened for outliers. They are filtered sequentially: each column's
# z-scores are computed on the rows that survived the previous filter.
outlier_columns = ['Area', 'AspectRation', 'Eccentricity']

classes = data['Class'].unique()
trimmed_parts = []

for cat in classes:
    data_x = data[data["Class"] == cat]
    for column in outlier_columns:
        data_x = _drop_zscore_outliers(data_x, column)
    trimmed_parts.append(data_x)

# Concatenate once at the end instead of growing a DataFrame inside the loop:
# this avoids repeated copying and concatenating onto an empty frame.
update_data = pd.concat(trimmed_parts, axis=0)

update_data

Observe the boxplot if there are any more outliers.

In [None]:
# Same per-class 'Area' box plot, now drawn from the outlier-trimmed data.
update_data.boxplot(column=['Area'], by=['Class'], figsize=(12, 8))

## Model Retraining using data with outliers removed

The data set is trimmed of outliers based on the z-scores of the attributes Area, AspectRation and Eccentricity, computed within each output class.

In [None]:
# separate the independent and target variable 
train_X = update_data.drop(columns=['Class','Class_y'])
train_Y = update_data['Class_y']

# randomly split the data
train_x, test_x, train_y, test_y = train_test_split(train_X, train_Y,test_size=0.20, random_state=21)

# shape of train and test splits
train_x.shape, test_x.shape, train_y.shape, test_y.shape

In [None]:
# Retrain LightGBM on the outlier-trimmed training split, using the
# hyper-parameters found by the earlier grid search.
# random_state=21 matches the seed of the baseline model so that the
# before/after comparison differs only in the training data.
model = LGBMClassifier(random_state=21)
model.set_params(**grid.best_params_)
model.fit(train_x, train_y)
print("Model training completed")

The accuracy of the model trained with the trimmed data set has improved to 93.38%.

In [None]:
# Evaluate the retrained model on the test split of the trimmed data:
# accuracy first, then the per-class classification report.
predict_test = model.predict(test_x)
print("Accuracy score {:.4f}".format(accuracy_score(y_true=test_y, y_pred=predict_test)))
print(classification_report(y_true=test_y, y_pred=predict_test))

In [None]:
# Confusion matrix of true against predicted class codes for the model
# retrained on the trimmed data.
print(confusion_matrix(test_y, predict_test))

## Model Training using Sklearn Pipeline

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, MaxAbsScaler

# Two-step pipeline: a feature scaler feeding a LightGBM classifier.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LGBMClassifier()),
])

# The keys below are the parameter names a grid search can address.
pipe.get_params().keys()

Grid search gives us the ability to search over specified values for each of the parameters listed above. We do this by passing `GridSearchCV` a dictionary with parameter names as keys and lists of values to try as the values for those parameters. In this example I call this dictionary `params` and pass it to `GridSearchCV`. Once fitted, the `GridSearchCV` instance `gs` acts just like any other estimator.

The parameter **n_jobs** tells Scikit-learn how many jobs to run in parallel. Setting it to -1 is equivalent to instructing Scikit-learn to use all available processors. Nowadays, most CPUs have more than one core. If you have a quad-core processor, using all 4 cores instead of 1 can make the process significantly faster.

In [21]:
# Compare four scalers in front of a LightGBM classifier whose
# hyper-parameters are fixed to a single value each, so the only thing the
# grid search varies is the 'scaler' step.
params = {
    'scaler': [StandardScaler(), MinMaxScaler(), Normalizer(), MaxAbsScaler()],
    'classifier__n_estimators': [100],
    'classifier__boosting_type': ['gbdt'],
    'classifier__learning_rate': [0.05]}

# Set up the grid search with stratified 10-fold cross-validation.
k = 10
skfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=21)
gs = GridSearchCV(pipe, param_grid=params, n_jobs=-1, scoring='accuracy', cv=skfold, verbose=1)
start = time.time()
gs.fit(train_x, train_y)
end = time.time()

# Report every candidate so the scalers can be compared directly. The result
# list gets its own name instead of overwriting the `params` grid above.
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
cv_params = gs.cv_results_['params']
for mean, stdev, candidate in zip(means, stds, cv_params):
    print("{:.4f} ({:.4f}) with: {!r}".format(mean, stdev, candidate))

# The original format string ended with an unmatched ")"; the measured run
# time is now reported as well.
print("Best: {:.4f} using {} (run time: {:.1f}s)".format(gs.best_score_, gs.best_params_, end - start))

Reference: https://towardsdatascience.com/an-introduction-to-building-pipelines-and-using-grid-searches-in-scikit-learn-92ea72f9b5b7

In [26]:
# Build a fresh pipeline and apply the grid search's winning settings. The
# 'scaler' entry in best_params_ replaces the placeholder MinMaxScaler step.
pipe = Pipeline(steps=[('scaler', MinMaxScaler()),
                       ('classifier', LGBMClassifier())]).set_params(**gs.best_params_)

# Fit on the training split; the fitted pipeline is this cell's output.
pipe.fit(train_x, train_y)

In [27]:
# Evaluate the fitted pipeline on the test split: accuracy followed by the
# per-class classification report.
predict_test = pipe.predict(test_x)
print("Accuracy score {:.4f}".format(accuracy_score(y_true=test_y, y_pred=predict_test)))
print(classification_report(y_true=test_y, y_pred=predict_test))

## Save the pipeline

In [None]:
import joblib

# Serialise the fitted pipeline to the working directory, then announce it.
joblib.dump(value=pipe, filename="model.joblib")
print("Saving model pipeline to model.joblib")