# PROJECT 4.1 - ENSEMBLE LEARNING TECHNIQUE
## `Classical Model Processing:`
`KNN, K-Means, Gaussian (NB)`

### IMPORT LIBRARY

In [29]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, silhouette_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [17]:
import joblib
import warnings
warnings.filterwarnings('ignore')

## 1. IMPORT DATA

In [1]:
from utils.data_storage import load_all_vectorizers
Xtrain_tfidf, Xtest_tfidf, Xtrain_bow, Xtest_bow, Xtrain_em, Xtest_em, y_train, y_test = load_all_vectorizers()

Succesfully loaded all vectorizers and vector representations


In [2]:
print(Xtrain_em.shape)
print(Xtest_em.shape)

(1600, 768)
(400, 768)


## 2. FIT TO MODEL AND TUNING

### **GAUSSIAN NB**

In [19]:
def gaussian_nb_tuning(X_train, X_test, y_train, y_test, embedding_type: str):
    """

    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :return:
    """
    print("⏫ Tuning for Gaussian NB")
    param_grid = {
        "var_smoothing": [1e-9, 1e-10, 1e-11, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0]
    }
    gnb = GaussianNB()
    grid_search_nb = GridSearchCV(
        gnb,
        param_grid,
        cv=5,
        scoring="accuracy",
        n_jobs=1,
        verbose=2,
    )
    grid_search_nb.fit(X_train, y_train)

    # Show out best estimator
    print(f"✅ Grid search best parameters: {grid_search_nb.best_params_}")
    print(f"✅ Grid search best score: {grid_search_nb.best_score_}")

    optimized_gnb = grid_search_nb.best_estimator_
    y_pred = optimized_gnb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    joblib.dump(optimized_gnb, f"../pkls/gaussian_nb_{embedding_type}.pkl")
    print(f"✅ Export optimized Gaussian NB of {embedding_type} successfully")

    return y_pred, score, report, grid_search_nb.best_params_

In [22]:
gb_pred_tfidf, gb_tfidf_accuracy, gb_tfidf_report, gb_best_pr_tfidf = gaussian_nb_tuning(Xtrain_tfidf, Xtest_tfidf, y_train, y_test, embedding_type="tfidf")
gb_pred_bow, gb_bow_accuracy, gb_bow_report, gb_best_pr_bow = gaussian_nb_tuning(Xtrain_bow, Xtest_bow, y_train, y_test, embedding_type="bow")
gb_pred_em, gb_embeddings_accuracy, gb_embeddings_report, gb_best_pr_em = gaussian_nb_tuning(Xtrain_em, Xtest_em, y_train, y_test, embedding_type="em")

print("✅ the accuracy of Tfidf with Gaussian NB is: ", gb_tfidf_accuracy)
print("✅ the accuracy of Bow with Gaussian NB is: ", gb_bow_accuracy)
print("✅ the accuracy of Em with Gaussian NB is: ", gb_embeddings_accuracy)

⏫ Tuning for Gaussian NB
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ................................var_smoothing=1e-09; total time=   0.6s
[CV] END ................................var_smoothing=1e-09; total time=   0.6s
[CV] END ................................var_smoothing=1e-09; total time=   0.4s
[CV] END ................................var_smoothing=1e-09; total time=   0.4s
[CV] END ................................var_smoothing=1e-09; total time=   0.4s
[CV] END ................................var_smoothing=1e-10; total time=   0.4s
[CV] END ................................var_smoothing=1e-10; total time=   0.4s
[CV] END ................................var_smoothing=1e-10; total time=   0.4s
[CV] END ................................var_smoothing=1e-10; total time=   0.4s
[CV] END ................................var_smoothing=1e-10; total time=   0.4s
[CV] END ................................var_smoothing=1e-11; total time=   0.4s
[CV] END ..............

### **KNN**

In [23]:
def KNN_tuning(X_train, X_test, y_train, y_test, embedding_type: str):
    """
    Hyperparameters tuning for KNN with GridSearchCV
    :return:
    """
    print("✅ Tuning for KNN")
    knn = KNeighborsClassifier()
    param_grid = {
        "n_neighbors": [3, 5, 7],
        "weights": ["uniform", "distance"],
        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
        "metric": ["euclidean", "manhattan"],
        "p": [1, 2] # only use with Minkowski
    }
    random_search_knn = RandomizedSearchCV(
        knn,
        param_grid,
        n_iter=50,
        cv=5,
        scoring="accuracy",
        n_jobs=1,
        verbose=1,
        random_state=42,
    )
    random_search_knn.fit(X_train, y_train)

    # Best parameters after tuning
    print(f"✅ Random search best parameters: {random_search_knn.best_params_}")
    print(f"✅ Random search best score: {random_search_knn.best_score_}")

    # show result
    optimized_knn = random_search_knn.best_estimator_
    y_pred = optimized_knn.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    joblib.dump(optimized_knn, f"../pkls/knn_{embedding_type}.pkl")
    print(f"✅ Export optimized KNN of {embedding_type} successfully")

    return y_pred, score, report, random_search_knn.best_params_

In [24]:
knn_pred_tfidf, knn_tfidf_accuracy, knn_tfidf_report, knn_best_pr_tfidf = KNN_tuning(Xtrain_tfidf, Xtest_tfidf, y_train, y_test, embedding_type="tfidf")
knn_pred_bow, knn_bow_accuracy, knn_bow_report, knn_best_pr_bow = KNN_tuning(Xtrain_bow, Xtest_bow, y_train, y_test, embedding_type="bow")
knn_pred_em, knn_embeddings_accuracy, knn_embeddings_report, knn_best_pr_em = KNN_tuning(Xtrain_em, Xtest_em, y_train, y_test, embedding_type="em")

print("✅ the accuracy of Tfidf with KNN is: ", knn_tfidf_accuracy)
print("✅ the accuracy of Bow with KNN is: ", knn_bow_accuracy)
print("✅ the accuracy of Em with KNN is: ", knn_embeddings_accuracy)

✅ Tuning for KNN
Fitting 5 folds for each of 50 candidates, totalling 250 fits
✅ Random search best parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 7, 'metric': 'euclidean', 'algorithm': 'ball_tree'}
✅ Random search best score: 0.7831249999999998
✅ Export optimized KNN of tfidf successfully
✅ Tuning for KNN
Fitting 5 folds for each of 50 candidates, totalling 250 fits
✅ Random search best parameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'metric': 'euclidean', 'algorithm': 'brute'}
✅ Random search best score: 0.415625
✅ Export optimized KNN of bow successfully
✅ Tuning for KNN
Fitting 5 folds for each of 50 candidates, totalling 250 fits
✅ Random search best parameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 5, 'metric': 'manhattan', 'algorithm': 'kd_tree'}
✅ Random search best score: 0.82875
✅ Export optimized KNN of em successfully
✅ the accuracy of Tfidf with KNN is:  0.7925
✅ the accuracy of Bow with KNN is:  0.4025
✅ the accuracy of Em with KNN is:  0

### **KMEAN**

In [30]:
def kmean_tuning(X_train, X_test, y_train, y_test, max_k: int=15, embedding_type: str=""):
    """
    Function to optimize KMeans with silouhoute & iters
    :return:
    """
    print("✅ Tuning for Kmean")
    inerts = []
    silhouettes = []
    k_range = range(2, max_k + 1)

    for k in k_range:
        kmeans = KMeans(
            n_clusters=k,
            init="k-means++",
            n_init=10,
            max_iter=100,
            random_state=42
        )
        kmeans.fit(X_train)
        inerts.append(kmeans.inertia_)
        silhouettes_avg = silhouette_score(X_train, kmeans.labels_)
        silhouettes.append(silhouettes_avg)

    optimized_k = k_range[np.argmax(silhouettes)]
    print(f"\n✅ Optimal K (based on Silhouette): {optimized_k}")
    print(f"✅ Best Silhouette Score: {max(silhouettes):.4f}")

    optimized_kmeans = KMeans(
        n_clusters=optimized_k,
        init="k-means++",
        n_init=10,
        max_iter=100,
        random_state=42
    )
    optimized_kmeans.fit(X_train)
    y_pred = optimized_kmeans.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    joblib.dump(optimized_kmeans, f"../pkls/kmeans_{embedding_type}.pkl")
    print(f"✅ Export optimized KMeans of {embedding_type} successfully")
    
    return y_pred, score, report, optimized_kmeans.cluster_centers_

In [31]:
km_pred_tfidf, km_tfidf_accuracy, km_tfidf_report, km_best_pr_tfidf = kmean_tuning(Xtrain_tfidf, Xtest_tfidf, y_train, y_test, embedding_type="tfidf")
km_pred_bow, km_bow_accuracy, km_bow_report, km_best_pr_bow = kmean_tuning(Xtrain_bow, Xtest_bow, y_train, y_test, embedding_type="bow")
km_pred_em, km_embeddings_accuracy, km_embeddings_report, km_best_pr_em = kmean_tuning(Xtrain_em, Xtest_em, y_train, y_test, embedding_type="em")

print("✅ the accuracy of Tfidf with Kmeans is: ", km_tfidf_accuracy)
print("✅ the accuracy of Bow with Kmeans is: ", km_bow_accuracy)
print("✅ the accuracy of Em with Kmeans is: ", km_embeddings_accuracy)

✅ Tuning for Kmean

✅ Optimal K (based on Silhouette): 2
✅ Best Silhouette Score: 0.0033
✅ Export optimized KMeans of tfidf successfully
✅ Tuning for Kmean

✅ Optimal K (based on Silhouette): 2
✅ Best Silhouette Score: 0.1762
✅ Export optimized KMeans of bow successfully
✅ Tuning for Kmean

✅ Optimal K (based on Silhouette): 3
✅ Best Silhouette Score: 0.0410
✅ Export optimized KMeans of em successfully
✅ the accuracy of Tfidf with Kmeans is:  0.025
✅ the accuracy of Bow with Kmeans is:  0.0575
✅ the accuracy of Em with Kmeans is:  0.16
