# Objective

Memprediksi jumlah mantan seseorang berdasarkan tinggi badan dan status.

# Sample Dataset

In [25]:
import pandas as pd

mantan = {
    "tinggi": [158, 170, 150, 170, 165, 159, 160, 155, 168],
    "status": ["jomblo", "pacaran", "jomblo",
               "jomblo", "pacaran", "pacaran",
               "jomblo", "pacaran", "pacaran"],
    "mantan": [1, 5, 4, 2, 2, 0, 3, 2, 1]
}

mantan_df = pd.DataFrame(mantan)
mantan_df

Unnamed: 0,tinggi,status,mantan
0,158,jomblo,1
1,170,pacaran,5
2,150,jomblo,4
3,170,jomblo,2
4,165,pacaran,2
5,159,pacaran,0
6,160,jomblo,3
7,155,pacaran,2
8,168,pacaran,1


# Split Features and Target

In [2]:
import numpy as np

X_train = np.array(mantan_df[["tinggi", "status"]])
y_train = np.array(mantan_df["mantan"])

print(f"X_train:\n{X_train}\n")
print(f"y_train:\n{y_train}")

X_train:
[[158 'jomblo']
 [170 'pacaran']
 [150 'jomblo']
 [170 'jomblo']
 [165 'pacaran']
 [159 'pacaran']
 [160 'jomblo']
 [155 'pacaran']
 [168 'pacaran']]

y_train:
[1 5 4 2 2 0 3 2 1]


# Preprocessing

## Convert Feature Status to Binary Number

In [3]:
X_train_transposed = X_train.transpose()

print(f"X_train:\n{X_train}\n")
print(f"X_train_transposed:\n{X_train_transposed}\n")

X_train:
[[158 'jomblo']
 [170 'pacaran']
 [150 'jomblo']
 [170 'jomblo']
 [165 'pacaran']
 [159 'pacaran']
 [160 'jomblo']
 [155 'pacaran']
 [168 'pacaran']]

X_train_transposed:
[[158 170 150 170 165 159 160 155 168]
 ['jomblo' 'pacaran' 'jomblo' 'jomblo' 'pacaran' 'pacaran' 'jomblo'
  'pacaran' 'pacaran']]



In [4]:
from sklearn.preprocessing import LabelBinarizer

label_binarizer = LabelBinarizer()
X_train_label_binarizer = label_binarizer.fit_transform(X_train_transposed[1])

print(f"X_train:\n{X_train_transposed}\n")
print(f"X_train_transposed:\n{X_train_label_binarizer}\n")

X_train:
[[158 170 150 170 165 159 160 155 168]
 ['jomblo' 'pacaran' 'jomblo' 'jomblo' 'pacaran' 'pacaran' 'jomblo'
  'pacaran' 'pacaran']]

X_train_transposed:
[[0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]]



In [5]:
status_label_binarizer = X_train_label_binarizer.transpose()
status_label_binarizer

array([[0, 1, 0, 0, 1, 1, 0, 1, 1]])

In [6]:
X_train_transposed[1] = status_label_binarizer
X_train = X_train_transposed.transpose()

print(f"X_train_transposed:\n{X_train_transposed}\n")
print(f"X_train:\n{X_train}\n")

X_train_transposed:
[[158 170 150 170 165 159 160 155 168]
 [0 1 0 0 1 1 0 1 1]]

X_train:
[[158 0]
 [170 1]
 [150 0]
 [170 0]
 [165 1]
 [159 1]
 [160 0]
 [155 1]
 [168 1]]



# Training KNN Regression Model

In [7]:
from sklearn.neighbors import KNeighborsRegressor

k = 3
knn_model = KNeighborsRegressor(n_neighbors=k)
knn_model.fit(X_train, y_train);

# Predict

In [8]:
X_new = np.array([[140, 1]])
X_new

array([[140,   1]])

In [9]:
y_pred = knn_model.predict(X_new)
y_pred

array([2.33333333])

# Evaluation

## Test Set

In [10]:
X_test = np.array([[160, 0], [170, 1], [155, 1], [165, 0]])
y_test = np.array([2, 1, 3, 5])

print(f"X_test:\n{X_test}\n")
print(f"y_test:\n{y_test}")

X_test:
[[160   0]
 [170   1]
 [155   1]
 [165   0]]

y_test:
[2 1 3 5]


In [11]:
y_pred = knn_model.predict(X_test)
y_pred

array([1.33333333, 2.66666667, 1.        , 1.66666667])

## Coefficient of Determination atau $R^2$

In [12]:
from sklearn.metrics import r2_score

r_squared = r2_score(y_test, y_pred)

print(f"r_squared: {r_squared}")

r_squared: -1.095238095238095


## MAE (Mean Absolute Error) or MAD (Mean Absolute Deviation)
- Rata-rata dari nilai absolut dari kesalahan prediksi.
- $MAE = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i|$

In [13]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)

print(f"MAE: {mae}")

MAE: 1.9166666666666665


## MSE (Mean Squared Error) or MSD (Mean Squared Deviation)
- Rata-rata kuadrat kesalahan prediksi.
- $MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$

In [14]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")

MSE: 4.583333333333332


# Scaling Problem
- Model-model Machine Learning yang menggunakan *distance* dalam melakukan prediksi, sangat sensitif dengan skala nilai dari tiap fitur. Apabila skala nilai antara fitur berbeda jauh, maka performa model menjadi jelek. Model tidak konsisten dalam menghasilkan prediksi.

## Before Scaling

In [15]:
from scipy.spatial.distance import euclidean

# tinggi in milimeter
X_train = np.array([[1700, 0], [1600, 1]])
X_new = np.array([[1640, 0]])

mili_distances = [euclidean(X_new[0], d) for d in X_train]

print(f"Tinggi in milimeter")
for idx, distance in enumerate(mili_distances):
    print(f"Distance datapoint-{idx + 1}: {distance}")
    
# tinggi in meter
X_train = np.array([[1.7, 0], [1.6, 1]])
X_new = np.array([[1.64, 0]])

meter_distances = [euclidean(X_new[0], d) for d in X_train]

print(f"\nTinggi in meter")
for idx, distance in enumerate(meter_distances):
    print(f"Distance datapoint-{idx + 1}: {distance}")

Tinggi in milimeter
Distance datapoint-1: 60.0
Distance datapoint-2: 40.01249804748511

Tinggi in meter
Distance datapoint-1: 0.06000000000000005
Distance datapoint-2: 1.0007996802557442


- Dari dua contoh diatas terdapat dua fitur dengan satuan yang berbeda, yaitu milimeter dan meter.
- Dari hasil pengukuran *distance* dari kedua data tersebut menghasilkan *distance* yang berbeda padahal nilai data tersebut sama yang membedakan hanya satuannya saja.
- Pengukuran data pertama (milimeter), prediksi model data baru lebih dekat dengan datapoint-2, sedangkan pada data kedua (meter), prediksi model data baru lebih dekat dengan datapoint-1.
- Terjadi ketidakkonsistenan model dalam melakukan prediksi dikarena skala nilai pada fitur berbeda.

## After Scaling

In [16]:
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()

In [17]:
from scipy.spatial.distance import euclidean

# tinggi in milimeter
X_train = np.array([[1700, 0], [1600, 1]])
X_new = np.array([[1640, 0]])
X_train_scaled = standard_scaler.fit_transform(X_train)
X_new_scaled = standard_scaler.transform(X_new)
print(f"Tinggi in Milimeter")
print(f"X_train_scaled: {X_train_scaled}")
print(f"X_new_scaled: {X_new_scaled}")

# tinggi in meter
X_train = np.array([[1.7, 0], [1.6, 1]])
X_new = np.array([[1.64, 0]])
X_train_scaled = standard_scaler.fit_transform(X_train)
X_new_scaled = standard_scaler.transform(X_new)
print(f"\nTinggi in Meter")
print(f"X_train_scaled: {X_train_scaled}")
print(f"X_new_scaled: {X_new_scaled}")

Tinggi in Milimeter
X_train_scaled: [[ 1. -1.]
 [-1.  1.]]
X_new_scaled: [[-0.2 -1. ]]

Tinggi in Meter
X_train_scaled: [[ 1. -1.]
 [-1.  1.]]
X_new_scaled: [[-0.2 -1. ]]


In [18]:
mili_distances = [euclidean(X_new[0], d) for d in X_train]
print(f"Tinggi in milimeter")
for idx, distance in enumerate(mili_distances):
    print(f"Distance datapoint-{idx + 1}: {distance}")

meter_distances = [euclidean(X_new[0], d) for d in X_train]
print(f"\nTinggi in meter")
for idx, distance in enumerate(meter_distances):
    print(f"Distance datapoint-{idx + 1}: {distance}")

Tinggi in milimeter
Distance datapoint-1: 0.06000000000000005
Distance datapoint-2: 1.0007996802557442

Tinggi in meter
Distance datapoint-1: 0.06000000000000005
Distance datapoint-2: 1.0007996802557442


Hasilnya sudah konsisten setelah dilakukan *scaling* (*Standard Scaler*).

# Training with Feature Scaling

## Dataset

In [19]:
X_train = X_train_transposed.transpose()
y_train = np.array(mantan_df["mantan"])

X_test = np.array([[160, 0], [170, 1], [155, 1], [165, 0]])
y_test = np.array([2, 1, 3, 5])

print("Train Set")
print(f"X_train:\n{X_train}\n")
print(f"y_train:\n{y_train}\n")
print("-" * 20, end="\n\n")
print("Test Set")
print(f"X_test:\n{X_test}\n")
print(f"y_test:\n{y_test}")

Train Set
X_train:
[[158 0]
 [170 1]
 [150 0]
 [170 0]
 [165 1]
 [159 1]
 [160 0]
 [155 1]
 [168 1]]

y_train:
[1 5 4 2 2 0 3 2 1]

--------------------

Test Set
X_test:
[[160   0]
 [170   1]
 [155   1]
 [165   0]]

y_test:
[2 1 3 5]


## Feature Scaling (Standard Scaler)

In [20]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

In [21]:
X_train_scaled = standard_scaler.fit_transform(X_train)
X_test_scaled = standard_scaler.transform(X_test)

print(f'X_train_scaled:\n{X_train_scaled}\n')
print(f'X_test_scaled:\n{X_test_scaled}\n')

X_train_scaled:
[[-0.55417199 -1.11803399]
 [ 1.2594818   0.89442719]
 [-1.76327453 -1.11803399]
 [ 1.2594818  -1.11803399]
 [ 0.50379272  0.89442719]
 [-0.40303418  0.89442719]
 [-0.25189636 -1.11803399]
 [-1.00758544  0.89442719]
 [ 0.95720617  0.89442719]]

X_test_scaled:
[[-0.25189636 -1.11803399]
 [ 1.2594818   0.89442719]
 [-1.00758544  0.89442719]
 [ 0.50379272 -1.11803399]]



## Training and Evaluation

In [22]:
knn_model.fit(X_train_scaled, y_train);

y_pred = knn_model.predict(X_test)
y_pred

array([2.66666667, 2.66666667, 2.66666667, 2.66666667])

In [23]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f"R^2: {r_squared}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

R^2: -0.0031746031746031633
MAE: 1.25
MSE: 2.1944444444444446
