In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
df = pd.read_csv('dataframe/UFC_kombinasi4.csv')
df.head()

Unnamed: 0,B_avg_SIG_STR_att,B_avg_SIG_STR_landed,B_avg_HEAD_att,B_avg_HEAD_landed,B_avg_BODY_att,B_avg_DISTANCE_att,B_avg_DISTANCE_landed,B_avg_opp_DISTANCE_att,B_avg_opp_DISTANCE_landed,R_avg_opp_SIG_STR_att,...,R_avg_opp_HEAD_att,R_avg_opp_HEAD_landed,R_avg_opp_BODY_att,R_avg_opp_BODY_landed,R_avg_DISTANCE_att,R_avg_opp_DISTANCE_att,R_avg_opp_DISTANCE_landed,R_losses,R_age,Winner
0,3.540959,2.484907,3.260497,1.908674,0.0,3.223863,2.484907,4.276666,3.07385,4.369448,...,4.224979,3.022531,1.791759,1.609438,3.713572,4.023117,2.962167,0.0,3.713572,Red
1,4.930588,3.993315,4.773329,3.713,2.682732,4.852274,3.867287,4.496419,3.462067,4.777547,...,4.535284,3.355372,2.922826,2.231358,4.597642,4.71178,3.673924,0.693147,3.912023,Blue
2,4.204693,3.140365,4.025352,2.833213,1.812379,4.110874,2.845602,4.18205,3.448001,4.528559,...,4.516339,3.295837,1.378451,0.693147,4.353499,4.421602,3.218876,0.0,3.931826,Blue
3,3.694333,2.996513,3.445762,2.549811,1.860926,3.284781,2.290006,3.228207,1.995975,4.709515,...,4.539035,3.356482,2.257051,1.742787,4.225975,4.642033,3.538319,1.791759,3.988984,Blue
4,4.32443,3.433554,4.079129,2.978172,2.213743,4.054156,3.06811,4.259439,3.315916,4.844433,...,4.508539,3.600731,3.221996,2.781339,4.488636,4.640929,3.793204,1.098612,3.931826,Red


In [3]:
X = df.drop('Winner', axis=1).values
X

array([[3.54095932, 2.48490665, 3.2604975 , ..., 2.96216741, 0.        ,
        3.71357207],
       [4.93058825, 3.99331493, 4.77332943, ..., 3.67392443, 0.69314718,
        3.91202301],
       [4.20469262, 3.14036515, 4.02535169, ..., 3.21887582, 0.        ,
        3.93182563],
       ...,
       [3.82319179, 2.87638552, 3.4657359 , ..., 2.42036813, 0.        ,
        3.91202301],
       [4.33483675, 3.67432087, 3.69316712, ..., 3.11177769, 1.09861229,
        3.98898405],
       [3.71965111, 2.74084002, 3.61091791, ..., 2.58193174, 1.38629436,
        3.93182563]])

In [4]:
y = df['Winner'].values
y

array(['Red', 'Blue', 'Blue', ..., 'Red', 'Blue', 'Red'], dtype=object)

In [5]:
counter_y = Counter(y)
print(counter_y)

Counter({'Red': 3581, 'Blue': 1730, 'Draw': 99})


In [6]:
# Over Sampling
adasyn = ADASYN()
randomOver = RandomOverSampler()
smote = SMOTE()
borderSmote = BorderlineSMOTE()
svmSmote = SVMSMOTE()

# Melakukan resampling
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
X_randomOver, y_randomOver = randomOver.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)
X_borderSmote, y_borderSmote = borderSmote.fit_resample(X, y)
X_svmSmote, y_svmSmote = svmSmote.fit_resample(X, y)


# Under Sampling
rand_under = RandomUnderSampler(sampling_strategy='majority')
nearmiss = NearMiss()
nearmiss2 = NearMiss(version=2)
nearmiss3 = NearMiss(version=3)
tomek = TomekLinks()

# Melakukan resampling
X_rand_under, y_rand_under = rand_under.fit_resample(X, y)
X_nearmiss, y_nearmiss = nearmiss.fit_resample(X, y)
X_nearmiss2, y_nearmiss2 = nearmiss2.fit_resample(X, y)
X_nearmiss3, y_nearmiss3 = nearmiss3.fit_resample(X, y)
X_tomek, y_tomek = tomek.fit_resample(X, y)


In [7]:
def evaluate_classification(classifier):
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
  print("ADASYN OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_adasyn, y_adasyn, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_smote, y_smote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Borderline SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_borderSmote, y_borderSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("SVM SMOTE OverSampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Random Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_rand_under, y_rand_under , scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 1 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss, y_nearmiss, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 2 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Near Miss 3 Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_nearmiss3, y_nearmiss3, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

  print()

  print("Tomek Links Undersampling")
  for i in range(len(scoring)):
    score = cross_val_score(classifier, X_tomek, y_tomek, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

# Decision Tree

In [8]:
dt = DecisionTreeClassifier()
dt

In [9]:
evaluate_classification(dt)

ADASYN OverSampling
f1_micro score: 0.7220572380672352
f1_macro score: 0.7174411019569777
precision_micro score: 0.7232672760657762
precision_macro score: 0.7203805503459599
recall_micro score: 0.7240127187995202
recall_macro score: 0.7229587944641196

Random OverSampling
f1_micro score: 0.8350548910627523
f1_macro score: 0.8287864749960614
precision_micro score: 0.8342172055254145
precision_macro score: 0.8373458900501944
recall_micro score: 0.8346830148360548
recall_macro score: 0.8356127720249644

SMOTE OverSampling
f1_micro score: 0.7289409651364412
f1_macro score: 0.7268823000140107
precision_micro score: 0.7321050542758184
precision_macro score: 0.7242575327642186
recall_micro score: 0.7296855408041332
recall_macro score: 0.7273561212272321

Borderline SMOTE OverSampling
f1_micro score: 0.751653837521761
f1_macro score: 0.7504865958218662
precision_micro score: 0.7532357521102448
precision_macro score: 0.7468735277706993
recall_micro score: 0.7507228254794358
recall_macro score: 

Karena Random Oversampling menghasilkan score terbaik maka akan digunakan Random Oversampling

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(dt, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8338449826821707
f1_macro score: 0.8322649494597931
precision_micro score: 0.8344966651155576
precision_macro score: 0.8418346506704489
recall_micro score: 0.8358002032906041
recall_macro score: 0.8366356819875905


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_randomOver, y_randomOver, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [12]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_dtc = GridSearchCV(estimator= DecisionTreeClassifier(), param_grid=param_grid_c, cv= 5)
clf_dtc.fit(X_train, y_train)

In [13]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_dtc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [14]:
best_hypeparam_dtc = DecisionTreeClassifier(**clf_dtc.best_params_)
best_hypeparam_dtc.fit(X_train, y_train)

In [15]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_dtc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.801418134937688
f1_macro score: 0.7979606672846633
precision_micro score: 0.8019147209091344
precision_macro score: 0.8004380648613368
recall_micro score: 0.797568977547352
recall_macro score: 0.802440476455334


In [16]:
def classification_metrics(prediction, y_test):
  accuracy = accuracy_score(y_test, prediction)
  f1 = f1_score(y_test, prediction, average="macro")
  recall = recall_score(y_test, prediction, average="macro")
  precision = precision_score(y_test, prediction, average="macro")

  print('Accuracy: ' + str(accuracy))
  print('F1 Score: ' + str(f1))
  print('Recall Score: ' + str(recall))
  print('Precision Score: ' + str(precision))

In [17]:
prediction = best_hypeparam_dtc.predict(X_test)
classification_metrics(prediction, y_test)

Accuracy: 0.8212956068503351
F1 Score: 0.8188763210802437
Recall Score: 0.8230153651793698
Precision Score: 0.8261901759285745


# Random Forest

In [18]:
rf = RandomForestClassifier()

In [19]:
evaluate_classification(rf)

ADASYN OverSampling
f1_micro score: 0.8294517316180648
f1_macro score: 0.826205648046014
precision_micro score: 0.8301038476450184
precision_macro score: 0.8320401436866571
recall_micro score: 0.8314078628617819
recall_macro score: 0.8292571460830853

Random OverSampling
f1_micro score: 0.8776877730146888
f1_macro score: 0.8755109520385883
precision_micro score: 0.8773153335361041
precision_macro score: 0.8785938855760811
recall_micro score: 0.8744299240996419
recall_macro score: 0.8751732986346482

SMOTE OverSampling
f1_micro score: 0.835613636934766
f1_macro score: 0.8282354762568247
precision_micro score: 0.8314252525751442
precision_macro score: 0.8281816890583832
recall_micro score: 0.8314247326503255
recall_macro score: 0.8324502829656988

Borderline SMOTE OverSampling
f1_micro score: 0.8295633151446301
f1_macro score: 0.8306513724844524
precision_micro score: 0.8316113856603001
precision_macro score: 0.8302511658981615
recall_micro score: 0.8324493744871159
recall_macro score: 0

Karena score nya paling tinggi maka akan digunakan Random Oversampling

In [20]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(rf, X_randomOver, y_randomOver, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8793634040517742
f1_macro score: 0.8774134660809472
precision_micro score: 0.8739645913867522
precision_macro score: 0.8760208608967668
recall_micro score: 0.8774085300598866
recall_macro score: 0.877779465962304


## Hyperparameter Tuning

In [21]:
param_grid_c = {'criterion': ['gini', 'entropy'],
               'min_samples_split': [2, 5, 10],
               'max_depth' : [None, 10, 20, 30],
               'min_samples_leaf': [1, 2, 4]
               }

clf_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_c, cv= 5)
clf_rfc.fit(X_train, y_train)

In [22]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_rfc.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [23]:
best_hypeparam_rfc = RandomForestClassifier(**clf_rfc.best_params_)
best_hypeparam_rfc.fit(X_train, y_train)

In [24]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_rfc, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.8384013905639425
f1_macro score: 0.8369622301228381
precision_micro score: 0.8367883333102292
precision_macro score: 0.8392514705643934
recall_micro score: 0.836789103449763
recall_macro score: 0.8363715928259321


In [25]:
prediction_rf = best_hypeparam_rfc.predict(X_test)
classification_metrics(prediction_rf, y_test)

Accuracy: 0.8682055100521221
F1 Score: 0.8690137098900724
Recall Score: 0.8691802229714765
Precision Score: 0.8691264582947485


# Logistic Regression

In [26]:
logistic = LogisticRegression(max_iter=2000)
logistic

In [27]:
evaluate_classification(logistic)

ADASYN OverSampling
f1_micro score: 0.4456962597327352
f1_macro score: 0.4450728893551975
precision_micro score: 0.4456962597327352
precision_macro score: 0.44502366920439596
recall_micro score: 0.4456962597327352
recall_macro score: 0.44567132822143274

Random OverSampling
f1_micro score: 0.44382285988112785
f1_macro score: 0.4431156289955515
precision_micro score: 0.44382285988112785
precision_macro score: 0.4443221750174999
recall_micro score: 0.44382285988112785
recall_macro score: 0.44381968630934293

SMOTE OverSampling
f1_micro score: 0.44708096875858416
f1_macro score: 0.4467085400029891
precision_micro score: 0.44708096875858416
precision_macro score: 0.4471778862782269
recall_micro score: 0.44708096875858416
recall_macro score: 0.44708034979183386

Borderline SMOTE OverSampling
f1_micro score: 0.5012562683435975
f1_macro score: 0.5013357353000851
precision_micro score: 0.5012562683435975
precision_macro score: 0.5062100777307103
recall_micro score: 0.5012562683435975
recall_ma

Karena Nearmiss 2 undersampling menghasilkan nilai terbaik maka akan digunakan

In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6834463276836159
f1_macro score: 0.6857352796553915
precision_micro score: 0.6834463276836159
precision_macro score: 0.7022142923507844
recall_micro score: 0.6834463276836159
recall_macro score: 0.6835087719298245


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_nearmiss2, y_nearmiss2, test_size=0.25, random_state=42)

In [30]:
logistic.fit(X_train, y_train)

In [31]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(logistic, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.639090909090909
f1_macro score: 0.6357718285020517
precision_micro score: 0.639090909090909
precision_macro score: 0.6458901561687941
recall_micro score: 0.639090909090909
recall_macro score: 0.6342063492063492


In [32]:
prediction_logreg = logistic.predict(X_test)
classification_metrics(prediction_logreg, y_test)

Accuracy: 0.6666666666666666
F1 Score: 0.6818478803950959
Recall Score: 0.6702035886818495
Precision Score: 0.7002150537634408


# Softmax Regression

In [33]:
softmax = LogisticRegression(max_iter=2000, multi_class='multinomial')
softmax

In [34]:
evaluate_classification(softmax)

ADASYN OverSampling
f1_micro score: 0.4456962597327352
f1_macro score: 0.4450728893551975
precision_micro score: 0.4456962597327352
precision_macro score: 0.44502366920439596
recall_micro score: 0.4456962597327352
recall_macro score: 0.44567132822143274

Random OverSampling
f1_micro score: 0.44382285988112785
f1_macro score: 0.4431156289955515
precision_micro score: 0.44382285988112785
precision_macro score: 0.4443221750174999
recall_micro score: 0.44382285988112785
recall_macro score: 0.44381968630934293

SMOTE OverSampling
f1_micro score: 0.44708096875858416
f1_macro score: 0.4467085400029891
precision_micro score: 0.44708096875858416
precision_macro score: 0.4471778862782269
recall_micro score: 0.44708096875858416
recall_macro score: 0.44708034979183386

Borderline SMOTE OverSampling
f1_micro score: 0.5012562683435975
f1_macro score: 0.5013357353000851
precision_micro score: 0.5012562683435975
precision_macro score: 0.5062100777307103
recall_micro score: 0.5012562683435975
recall_ma

Karena Nearmiss 2 undersampling menghasilkan nilai terbaik maka akan digunakan

In [35]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(softmax, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6834463276836159
f1_macro score: 0.6857352796553915
precision_micro score: 0.6834463276836159
precision_macro score: 0.7022142923507844
recall_micro score: 0.6834463276836159
recall_macro score: 0.6835087719298245


In [36]:
softmax.fit(X_train, y_train)

In [37]:
prediction_softmax = softmax.predict(X_test)
classification_metrics(prediction_softmax, y_test)

Accuracy: 0.6666666666666666
F1 Score: 0.6818478803950959
Recall Score: 0.6702035886818495
Precision Score: 0.7002150537634408


# KNN

In [38]:
knn = KNeighborsClassifier()
knn

In [39]:
evaluate_classification(knn)

ADASYN OverSampling
f1_micro score: 0.7079917064129152
f1_macro score: 0.6821206219280814
precision_micro score: 0.7079917064129152
precision_macro score: 0.710359595760805
recall_micro score: 0.7079917064129152
recall_macro score: 0.7079156904626932

Random OverSampling
f1_micro score: 0.7332218527867538
f1_macro score: 0.7256677281313025
precision_micro score: 0.7332218527867538
precision_macro score: 0.7262003861353179
recall_micro score: 0.7332218527867538
recall_macro score: 0.7332221728752899

SMOTE OverSampling
f1_micro score: 0.7185139162210478
f1_macro score: 0.6943488841540948
precision_micro score: 0.7185139162210478
precision_macro score: 0.7217162453144781
recall_micro score: 0.7185139162210478
recall_macro score: 0.7185141898402458

Borderline SMOTE OverSampling
f1_micro score: 0.7534204120750806
f1_macro score: 0.7406468539271225
precision_micro score: 0.7534204120750806
precision_macro score: 0.7596462590226534
recall_micro score: 0.7534204120750806
recall_macro score: 

Karena Nearmiss 2 undersampling menghasilkan nilai terbaik maka akan digunakan

In [40]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(knn, X_nearmiss2, y_nearmiss2, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7510734463276836
f1_macro score: 0.7503901570607254
precision_micro score: 0.7510734463276836
precision_macro score: 0.7793611539466118
recall_micro score: 0.7510734463276836
recall_macro score: 0.7514035087719299


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_nearmiss2, y_nearmiss2, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [42]:
tuned_params = [{'n_neighbors': [2,3,4,5,6,7,8,9,10],'metric': ['euclidean', 'manhattan', 'jaccard']}]

clf_knn = GridSearchCV(knn, tuned_params, cv=cv, verbose = 2)
clf_knn.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=2; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=3; total time=   0.0s
[CV] END ....................metric=euclidean, n_neighbors=4; total time=   0.0s
[CV] END ....................metric=euclidean, 

Traceback (most recent call last):
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbors\_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fadlan\AppData\Roaming\Python\Python311\site-packages\sklearn\neighbo

[CV] END ......................metric=jaccard, n_neighbors=3; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=4; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=5; total time=   0.0s
[CV] END ......................metric=jaccard, n_neighbors=6; total time=   0.0s
[CV] END ...................

 0.70727273 0.72535354 0.6940404         nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.33767677 0.33767677 0.30171717 0.29707071 0.30151515 0.30151515
 0.30616162 0.33333333 0.30616162]


In [43]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(clf_knn.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'metric': 'euclidean', 'n_neighbors': 3}


In [44]:
best_hypeparam_knn = KNeighborsClassifier(**clf_knn.best_params_)
best_hypeparam_knn.fit(X_train, y_train)

In [45]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_knn, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7522222222222222
f1_macro score: 0.7468227753120105
precision_micro score: 0.7522222222222222
precision_macro score: 0.7698006335134482
recall_micro score: 0.7522222222222222
recall_macro score: 0.748452380952381


In [46]:
prediction_knn = best_hypeparam_knn.predict(X_test)
classification_metrics(prediction_knn, y_test)

Accuracy: 0.6933333333333334
F1 Score: 0.7084458177481433
Recall Score: 0.7045376121463077
Precision Score: 0.7244623655913979


# Naive Bayes

# Neural Network (MLP)

In [47]:
mlp = MLPClassifier()
mlp

In [48]:
evaluate_classification(mlp)

ADASYN OverSampling
f1_micro score: 0.6222996186268311
f1_macro score: 0.6164049133030252
precision_micro score: 0.6217406992043811
precision_macro score: 0.6098964196882848
recall_micro score: 0.629189662245791
recall_macro score: 0.6145801639814381

Random OverSampling
f1_micro score: 0.6279441392774604
f1_macro score: 0.6025711705642467
precision_micro score: 0.6253345066303413
precision_macro score: 0.6213290913774028
recall_micro score: 0.6096989808607008
recall_macro score: 0.6224618145646172

SMOTE OverSampling
f1_micro score: 0.6268254343755226
f1_macro score: 0.6041581824110387
precision_micro score: 0.6257107155638628
precision_macro score: 0.6190901816918926
recall_micro score: 0.623101342879153
recall_macro score: 0.6268194084081978

Borderline SMOTE OverSampling
f1_micro score: 0.710415913858856
f1_macro score: 0.708953208526478
precision_micro score: 0.7134868498015188
precision_macro score: 0.7076835594874631
recall_micro score: 0.708833436018485
recall_macro score: 0.71

Karena SVM SMOTE Oversampling menghasilkan nilai terbaik maka akan digunakan

In [49]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(mlp, X_svmSmote, y_svmSmote, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.7000515729757607
f1_macro score: 0.710586913680744
precision_micro score: 0.6991232594120681
precision_macro score: 0.709997014614596
recall_micro score: 0.7075812274368231
recall_macro score: 0.7291023400385666


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_svmSmote, y_svmSmote, test_size=0.25, random_state=42)

## Hyperparameter Tuning

In [51]:
param_grid = {
    'hidden_layer_sizes': [(5), (10), (5, 10)],
    'alpha': [1e-5],
    'activation': ['identity', 'logistic', 'relu', 'tanh'],
    'solver': ['sgd', 'adam'],
    'max_iter': [1000],
    'random_state': [42]
}

tuned_mlp = GridSearchCV(mlp, param_grid, cv=cv)
tuned_mlp.fit(X_train, y_train)



In [52]:
print("Hyperparameter terbaik untuk Decision Tree Classifier")
print(tuned_mlp.best_params_)

Hyperparameter terbaik untuk Decision Tree Classifier
{'activation': 'logistic', 'alpha': 1e-05, 'hidden_layer_sizes': 10, 'max_iter': 1000, 'random_state': 42, 'solver': 'adam'}


In [53]:
best_hypeparam_mlp = MLPClassifier(**tuned_mlp.best_params_)
best_hypeparam_mlp.fit(X_train, y_train)



In [54]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1_micro', 'f1_macro', 'precision_micro', 'precision_macro', 'recall_micro', 'recall_macro']
for i in range(len(scoring)):
    score = cross_val_score(best_hypeparam_mlp, X_train, y_train, scoring=scoring[i], cv=cv, n_jobs=-1)
    print(f"{scoring[i]} score: {np.mean(score)}")

f1_micro score: 0.6640052562666325
f1_macro score: 0.6747144809044958
precision_micro score: 0.6640052562666325
precision_macro score: 0.668469096754424
recall_micro score: 0.6640052562666325
recall_macro score: 0.68627620232276


In [55]:
prediction_mlp = best_hypeparam_mlp.predict(X_test)
classification_metrics(prediction_mlp, y_test)

Accuracy: 0.6815181518151815
F1 Score: 0.6946479154080848
Recall Score: 0.707442186009097
Precision Score: 0.6865410121030617


# Kesimpulan

Dari semua model classification, untuk kobinasi pre processing ini yang menghasilkan hasil paling bagus adalah Random Forest dengan data yang di Random Oversampling