## Finding best model for coconut copra type using supervised learning algorithm

In [1]:
# Lib untuk manipulasi data
import numpy as np
import pandas as pd

# lib untuk analisa statistik
import scipy.stats as sc
import statsmodels.api as sm

# lib untuk visualisasi data
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# lib untuk praproses dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# lib untuk klasifikasi data
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, NuSVC, LinearSVC

# library untuk evaluasi model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# lib disble warning
import warnings
warnings.filterwarnings('ignore')

- config models

In [2]:
# config models
experiments = 7   # fill number 1 until 7
splitting = 90  # fill number 90 or 80

### 1. Data acquisition

In [3]:
# load dataset
dataset = pd.read_csv("dataset/ekstrasi-fitur-kopra-nonTelungkup_v3.csv")

In [4]:
# show metadata
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1839 entries, 0 to 1838
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Mean_R           1839 non-null   float64
 1   Mean_G           1839 non-null   float64
 2   Mean_B           1839 non-null   float64
 3   Mean_H           1839 non-null   float64
 4   Mean_V           1839 non-null   float64
 5   Mean_S           1839 non-null   float64
 6   Mean_Gray        1839 non-null   float64
 7   Standar_Deviasi  1839 non-null   float64
 8   Luas             1839 non-null   float64
 9   Perimeter        1839 non-null   float64
 10  Contrast         1839 non-null   float64
 11  Dissimilarity    1839 non-null   float64
 12  Homogeneity      1839 non-null   float64
 13  Energy           1839 non-null   float64
 14  Correlation      1839 non-null   float64
 15  Class            1839 non-null   object 
dtypes: float64(15), object(1)
memory usage: 230.0+ KB


In [5]:
# show dataset
np.round(dataset, 4)

Unnamed: 0,Mean_R,Mean_G,Mean_B,Mean_H,Mean_V,Mean_S,Mean_Gray,Standar_Deviasi,Luas,Perimeter,Contrast,Dissimilarity,Homogeneity,Energy,Correlation,Class
0,99.7735,85.2993,72.5793,11.2720,99.7906,51.0244,88.1944,54.5893,7933.5,2053.3048,43.4466,3.3410,0.4820,0.2282,0.9927,Edible
1,114.1627,110.4249,91.1458,19.0044,114.8282,36.8718,109.3581,79.9393,33489.0,1334.8469,68.7073,4.1415,0.4737,0.2763,0.9946,Edible
2,86.7587,77.0958,58.2202,14.3739,86.7604,58.7899,77.8335,52.0851,1958.5,573.8549,37.7350,2.9174,0.5221,0.2355,0.9930,Edible
3,80.3976,71.1794,55.9146,12.9194,80.3995,52.5784,72.2365,52.3093,2908.0,862.9747,34.4723,2.6061,0.5649,0.2639,0.9937,Edible
4,98.9635,89.0470,68.0217,14.3642,98.9674,56.0236,89.6345,63.7584,31867.5,706.1392,51.5219,3.0583,0.5406,0.2576,0.9936,Edible
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1834,65.6997,53.8509,38.5199,13.2483,65.7217,78.2459,55.6460,38.1312,274.5,94.8112,55.9696,3.6349,0.4470,0.2087,0.9806,Reject
1835,56.1067,52.0908,39.0243,20.3346,57.6599,63.2588,51.8051,35.6666,513.0,153.8234,87.9853,5.1970,0.3650,0.2058,0.9652,Reject
1836,57.3916,46.5022,29.4403,12.4246,57.3923,83.2977,47.8252,37.2503,150.0,172.8528,76.8202,4.5265,0.4297,0.2698,0.9722,Reject
1837,66.6046,58.9452,41.0883,14.9776,66.6068,68.8092,59.2135,41.0479,342.0,171.0538,68.5011,4.4026,0.4286,0.2436,0.9796,Reject


### 3. Data Preprocessing

- set feature and label

In [6]:
# set variabel warna, bentuk, texture
color = ["Mean_R", "Mean_G", "Mean_B", "Mean_H", "Mean_V", "Mean_S", "Mean_Gray", "Standar_Deviasi"]
shape = ["Luas", "Perimeter"]
texture = ["Contrast", "Dissimilarity", "Homogeneity", "Energy", "Correlation"]

In [7]:
# set experiment and feature
# experiment 1
if experiments == 1:
  exp = "Experiments 1"
  x = dataset[color].values

# experiment 2
if experiments == 2:
  exp = "Experiments 2"
  x = dataset[shape].values

# experiment 3
if experiments == 3:
  exp = "Experiments 3"
  x = dataset[texture].values

# experiment 4
if experiments == 4:
  exp = "Experiments 4"
  x = dataset[color + shape].values

# experiment 5
if experiments == 5:
  exp = "Experiments 5"
  x = dataset[color + texture].values

# experiment 6
if experiments == 6:
  exp = "Experiments 6"
  x = dataset[shape + texture].values

# experiment 7
if experiments == 7:
  exp = "Experiments 7"
  x = dataset[color + shape + texture].values

In [8]:
# show feature
print(np.round(x,4))

[[ 99.7735  85.2993  72.5793 ...   0.482    0.2282   0.9927]
 [114.1627 110.4249  91.1458 ...   0.4737   0.2763   0.9946]
 [ 86.7587  77.0958  58.2202 ...   0.5221   0.2355   0.993 ]
 ...
 [ 57.3916  46.5022  29.4403 ...   0.4297   0.2698   0.9722]
 [ 66.6046  58.9452  41.0883 ...   0.4286   0.2436   0.9796]
 [ 58.9383  49.4705  30.5259 ...   0.4488   0.2712   0.9813]]


In [9]:
# set label, dependent variable
y = dataset[["Class"]].values

In [10]:
# show feature
print(np.array(y))

[['Edible']
 ['Edible']
 ['Edible']
 ...
 ['Reject']
 ['Reject']
 ['Reject']]


- Scaling dataset

In [11]:
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(x)

In [12]:
np.round(scaled, 4)

array([[0.6636, 0.6414, 0.6765, ..., 0.5084, 0.1479, 0.9514],
       [0.8016, 0.8869, 0.8975, ..., 0.484 , 0.2742, 0.9767],
       [0.5388, 0.5613, 0.5056, ..., 0.625 , 0.1671, 0.9557],
       ...,
       [0.2573, 0.2623, 0.1631, ..., 0.3562, 0.2571, 0.6834],
       [0.3456, 0.3839, 0.3017, ..., 0.3528, 0.1884, 0.7798],
       [0.2721, 0.2913, 0.176 , ..., 0.4117, 0.2606, 0.8017]])

- Data Splitting

In [13]:
# split validation
if splitting == 80:
  trainX, testX, trainY, testY = train_test_split(scaled, y, train_size=0.8, test_size=0.2, random_state=0, shuffle=True)

# split validation
if splitting == 90:
  trainX, testX, trainY, testY = train_test_split(scaled, y, train_size=0.9, test_size=0.1, random_state=0, shuffle=True)

In [14]:
# reshape labels
trainY = trainY.reshape(-1)
testY = testY.reshape(-1)

In [15]:
# show dimension of data train
print(trainX.shape, trainY.shape)

(1655, 15) (1655,)


In [16]:
# show dimension of data test
print(testX.shape, testY.shape)

(184, 15) (184,)


### 4. Modeling Supervised Learning

In [17]:
# supervised learning algorithm
result_KNN = KNeighborsClassifier(n_neighbors=3).fit(trainX, trainY).predict(testX)
result_SVC = SVC(kernel='linear').fit(trainX, trainY).predict(testX)
result_C45 = DecisionTreeClassifier(criterion="gini", random_state=0).fit(trainX, trainY).predict(testX)
result_GNB = GaussianNB().fit(trainX, trainY).predict(testX)
result_LR = LogisticRegression(max_iter=1000).fit(trainX, trainY).predict(testX)

In [18]:
# join all result with labels
data = {
  "KNN" : result_KNN,
  "SVC" : result_SVC,
  "C45" : result_C45,
  "GNB" : result_GNB,
  "LR" : result_LR,
  "Labels": testY,
}

# convert result to dataframe
result = pd.DataFrame(data)

# show dataframe
np.array(result)

array([['Regular', 'Regular', 'Edible', 'Regular', 'Regular', 'Regular'],
       ['Edible', 'Edible', 'Edible', 'Edible', 'Edible', 'Edible'],
       ['Reject', 'Reject', 'Reject', 'Reject', 'Reject', 'Reject'],
       ...,
       ['Regular', 'Regular', 'Regular', 'Regular', 'Regular', 'Regular'],
       ['Regular', 'Regular', 'Regular', 'Regular', 'Regular', 'Regular'],
       ['Edible', 'Regular', 'Regular', 'Regular', 'Regular', 'Regular']],
      dtype=object)

### 5. Model Evaluation

- Accuracy, Precision and Recall

In [19]:
print("Supervised Learning: "+str(exp)+" and train test "+str(splitting)+"%")
print("")
for i in range(len(result.columns)-1):
    print(result.columns[i])
    print(np.round(accuracy_score(result["Labels"], result.iloc[:,i:i+1]), 3))
    print(np.round(precision_score(result["Labels"], result.iloc[:,i:i+1], average="macro"), 3))
    print(np.round(recall_score(result["Labels"], result.iloc[:,i:i+1], average="macro"), 3))
    print(np.round(f1_score(result["Labels"], result.iloc[:,i:i+1], average="macro"), 3))
    print("------------------------- \n")

Supervised Learning: Experiments 7 and train test 90%

KNN
0.842
0.841
0.837
0.837
------------------------- 

SVC
0.815
0.808
0.808
0.808
------------------------- 

C45
0.815
0.81
0.81
0.81
------------------------- 

GNB
0.799
0.795
0.79
0.791
------------------------- 

LR
0.821
0.813
0.814
0.813
------------------------- 



- confusion matrix

In [20]:
def heatmap_plotly(df):
  # plotting heatmap corr
  fig = px.imshow(
    img=df,
    x = ['1', '2', '3'],
    y = ['1', '2', '3'],
    color_continuous_scale = 'RdYlGn_r',
    aspect=True,
    text_auto=True,
  )
  
  fig.update_traces(
    xgap=3, ygap=3,
    textfont={"size":16}, 
  )

  # customize layout
  fig.update_layout(
    width=450, height=360,
    #title=algorithms,
  )

  # # show heatmap corr
  # fig.show()

  return fig

In [21]:
def heatmap_matplotlib(data):
  # Data sampel
  x = ["1", "2", "3"]
  y = ["1", "2", "3"]
  data = np.array(data)

  # create figure
  fig, ax = plt.subplots(figsize=(3,3))

  # create heatmap
  plt.imshow(data, cmap='RdYlGn_r')

  # Show all ticks and label them with the respective list entries
  ax.set_xticks(np.arange(len(x)), labels=x)
  ax.set_yticks(np.arange(len(y)), labels=y)

  # Loop over data dimensions and create text annotations.
  for i in range(len(x)):
    for j in range(len(y)):
      text = ax.text(j, i, data[i, j], ha="center", va="center", fontsize=12)

  # show heatmap
  return plt.show()

In [22]:
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_KNN)))

In [23]:
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_SVC)))

In [24]:
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_C45)))

In [25]:
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_GNB)))

In [26]:
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_LR)))