## Finding the best model for classifying coconut copra types using supervised learning algorithms

In [None]:
# Lib untuk manipulasi data
import numpy as np
import pandas as pd

# lib untuk analisa statistik
import scipy.stats as sc
import statsmodels.api as sm

# lib untuk visualisasi data
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# lib untuk praproses dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# lib untuk klasifikasi data
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, NuSVC, LinearSVC

# library untuk evaluasi model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# Disable warnings.
# NOTE(review): 'ignore' with no category silences every warning for the rest of
# the session, including any raised by the model-fitting and metric calls.
import warnings
warnings.filterwarnings('ignore')

- config models

In [None]:
# Run configuration for this notebook.
experiments = 7   # experiment number, 1 to 7; selects which feature groups form x
splitting = 90  # training share in percent; only 90 and 80 are handled by the split cell

### 1. Data acquisition

In [None]:
# Load the extracted-feature table from CSV (path is relative to the working directory).
dataset = pd.read_csv("dataset/ekstrasi-fitur-kopra-nonTelungkup_v3.csv")

In [None]:
# Print the DataFrame summary produced by pandas (columns, non-null counts, dtypes).
dataset.info()

In [None]:
# Display the dataset with values rounded to 4 decimal places.
dataset.round(4)

### 3. Data Preprocessing

- set feature and label

In [None]:
# Column names of the dataset, grouped by the kind of feature they describe.

# Colour features.
color = [
  "Mean_R",
  "Mean_G",
  "Mean_B",
  "Mean_H",
  "Mean_V",
  "Mean_S",
  "Mean_Gray",
  "Standar_Deviasi",
]

# Shape features.
shape = [
  "Luas",
  "Perimeter",
]

# Texture features.
texture = [
  "Contrast",
  "Dissimilarity",
  "Homogeneity",
  "Energy",
  "Correlation",
]

In [None]:
# Select the feature columns for the configured experiment.
#
# Each experiment number maps to one combination of the colour, shape and
# texture column groups.
feature_sets = {
  1: color,
  2: shape,
  3: texture,
  4: color + shape,
  5: color + texture,
  6: shape + texture,
  7: color + shape + texture,
}

# Fail fast on an unsupported value. Without this check `x` and `exp` would be
# left undefined, or, when the notebook is re-run, silently keep the values
# of a previous experiment.
if experiments not in feature_sets:
  raise ValueError(f"experiments must be an integer from 1 to 7, got {experiments!r}")

# Label used in the evaluation report.
exp = f"Experiments {int(experiments)}"

# Feature matrix (independent variables) as a NumPy array.
x = dataset[feature_sets[experiments]].values

In [None]:
# Print the selected feature matrix, rounded to 4 decimals for readability.
print(x.round(4))

In [None]:
# Target (dependent variable): the "Class" column, kept as a one-column 2-D array.
y = dataset[["Class"]].to_numpy()

In [None]:
# Show the label array (the "Class" values), not the features.
print(np.array(y))

- Scaling dataset

In [None]:
# Min-max normalisation: rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows of x, i.e. before any
# train/test split is made.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(x)
scaled = scaler.transform(x)

In [None]:
# Display the normalised features rounded to 4 decimal places.
scaled.round(4)

- Data Splitting

In [None]:
# Hold-out split followed by feature scaling.
#
# The raw features are split first and the min-max scaler is fitted on the
# training rows only. Scaling the complete dataset before splitting lets the
# test rows influence the min/max used for training (data leakage) and makes
# the reported scores optimistic. random_state and shuffle are unchanged, so
# the same rows end up in each partition as before.
split_fractions = {
  80: (0.8, 0.2),
  90: (0.9, 0.1),
}

# Reject unsupported ratios instead of leaving the split variables undefined
# (or stale from an earlier run of the notebook).
if splitting not in split_fractions:
  raise ValueError(f"splitting must be 80 or 90, got {splitting!r}")

train_fraction, test_fraction = split_fractions[splitting]

trainX, testX, trainY, testY = train_test_split(
  x, y,
  train_size=train_fraction, test_size=test_fraction,
  random_state=0, shuffle=True,
)

# Learn the scaling from the training partition, then apply it to both parts.
# Test values may fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(trainX)
trainX = scaler.transform(trainX)
testX = scaler.transform(testX)

In [None]:
# Flatten the label arrays to 1-D before they are passed to the classifiers.
trainY = trainY.ravel()
testY = testY.ravel()

In [None]:
# Show the dimensions of the training features and training labels.
print(trainX.shape, trainY.shape)

In [None]:
# Show the dimensions of the test features and test labels.
print(testX.shape, testY.shape)

### 4. Modeling Supervised Learning

In [None]:
# Supervised learning: train each classifier on the training split and keep
# its predictions for the test split.
def _fit_predict(estimator):
  """Fit estimator on (trainX, trainY) and return its predictions for testX."""
  return estimator.fit(trainX, trainY).predict(testX)


result_KNN = _fit_predict(KNeighborsClassifier(n_neighbors=3))
result_SVC = _fit_predict(SVC(kernel='linear'))
# NOTE(review): the variable is named C45, but the tree is built with
# criterion="gini" -- confirm this is the intended algorithm/criterion.
result_C45 = _fit_predict(DecisionTreeClassifier(criterion="gini", random_state=0))
result_GNB = _fit_predict(GaussianNB())
result_LR = _fit_predict(LogisticRegression(max_iter=1000))

In [None]:
# Gather every model's predictions next to the true test labels.
data = dict(
  KNN=result_KNN,
  SVC=result_SVC,
  C45=result_C45,
  GNB=result_GNB,
  LR=result_LR,
  Labels=testY,
)

# One column per model, with the true labels in the last column.
result = pd.DataFrame(data)

# Display the table as a plain array.
result.to_numpy()

### 5. Model Evaluation

- Accuracy, Precision, Recall and F1-score

In [None]:
# Report header: which experiment and which training share produced the scores.
print(f"Supervised Learning: {exp} and train test {splitting}%")
print("")

# Metrics printed per model, in this order: accuracy, then macro-averaged
# precision, recall and F1.
metric_calls = (
    (accuracy_score, {}),
    (precision_score, {"average": "macro"}),
    (recall_score, {"average": "macro"}),
    (f1_score, {"average": "macro"}),
)

# Every column except the last one ("Labels") holds one model's predictions.
for model_name in result.columns[:-1]:
    predicted = result[[model_name]]
    print(model_name)
    for metric, options in metric_calls:
        print(np.round(metric(result["Labels"], predicted, **options), 3))
    print("------------------------- \n")

- Confusion matrix

In [None]:
def heatmap_plotly(df, labels=None):
  """Build a Plotly heatmap figure for a confusion matrix.

  Args:
    df: 2-D table of values to plot (the notebook passes a pandas DataFrame
      wrapping a confusion matrix).
    labels: optional sequence of tick labels used for both axes. When
      omitted, labels "1", "2", ... are generated to match the matrix size,
      which gives the previous fixed ['1', '2', '3'] for a 3x3 input.

  Returns:
    The Plotly figure. It is not shown here; the caller displays it.
  """
  # Derive the tick labels from the input instead of assuming 3 classes, so a
  # matrix of any size can be plotted.
  n_rows, n_cols = np.shape(df)
  if labels is None:
    x_labels = [str(k) for k in range(1, n_cols + 1)]
    y_labels = [str(k) for k in range(1, n_rows + 1)]
  else:
    x_labels = y_labels = [str(label) for label in labels]

  fig = px.imshow(
    img=df,
    x=x_labels,
    y=y_labels,
    color_continuous_scale='RdYlGn_r',
    # px.imshow documents 'equal', 'auto' or None for this argument; the
    # previous value True is not one of them.
    # NOTE(review): "auto" is expected to render like the old call -- confirm.
    aspect="auto",
    text_auto=True,
  )

  # Small gaps between cells and larger cell annotations.
  fig.update_traces(
    xgap=3, ygap=3,
    textfont={"size": 16},
  )

  # Fixed figure size.
  fig.update_layout(
    width=450, height=360,
  )

  return fig

In [None]:
def heatmap_matplotlib(data, labels=None):
  """Draw an annotated heatmap of a confusion matrix with Matplotlib.

  Args:
    data: 2-D array-like of values; converted with np.array.
    labels: optional sequence of tick labels used for both axes. When
      omitted, labels "1", "2", ... are generated to match the matrix size,
      which gives the previous fixed ["1", "2", "3"] for a 3x3 input.

  Returns:
    The return value of plt.show(); the figure is displayed as a side effect.
  """
  matrix = np.array(data)
  n_rows, n_cols = matrix.shape

  # Derive the tick labels from the matrix instead of assuming 3 classes.
  if labels is None:
    col_labels = [str(k) for k in range(1, n_cols + 1)]
    row_labels = [str(k) for k in range(1, n_rows + 1)]
  else:
    col_labels = row_labels = [str(label) for label in labels]

  # Create the figure and draw the heatmap on its axes.
  _fig, ax = plt.subplots(figsize=(3, 3))
  ax.imshow(matrix, cmap='RdYlGn_r')

  # Show all ticks and label them with the respective entries.
  ax.set_xticks(np.arange(n_cols), labels=col_labels)
  ax.set_yticks(np.arange(n_rows), labels=row_labels)

  # Annotate every cell with its value. Rows map to the y axis and columns to
  # the x axis, so the text position is (col, row).
  for row, col in np.ndindex(n_rows, n_cols):
    ax.text(col, row, matrix[row, col], ha="center", va="center", fontsize=12)

  # Show the heatmap.
  return plt.show()

In [None]:
# Confusion matrix heatmap for KNN (true test labels vs. KNN predictions).
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_KNN)))

In [None]:
# Confusion matrix heatmap for the linear SVC (true test labels vs. predictions).
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_SVC)))

In [None]:
# Confusion matrix heatmap for the decision tree (true test labels vs. predictions).
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_C45)))

In [None]:
# Confusion matrix heatmap for Gaussian naive Bayes (true test labels vs. predictions).
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_GNB)))

In [None]:
# Confusion matrix heatmap for logistic regression (true test labels vs. predictions).
heatmap_plotly(pd.DataFrame(confusion_matrix(testY, result_LR)))