# PROJECT 4.1 - ENSEMBLE LEARNING TECHNIQUE
## `Classical Model Processing:`
`KNN, K-Means, Gaussian (NB)`

### IMPORT LIBRARY

In [6]:
from gradio.themes.utils.colors import green
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from i_test_module import y_pred

In [9]:
import warnings
warnings.filterwarnings('ignore')

## 1. IMPORT DATA

In [1]:
from utils.data_storage import load_all_vectorizers
Xtrain_tfidf, Xtest_tfidf, Xtrain_bow, Xtest_bow, Xtrain_em, Xtest_em, y_train, y_test = load_all_vectorizers()

Succesfully loaded all vectorizers and vector representations


In [2]:
print(Xtrain_em.shape)
print(Xtest_em.shape)

(1600, 768)
(400, 768)


## 2. FIT TO MODEL AND TUNING

### **GAUSSIAN NB**

In [7]:
def gaussian_nb_tuning(X_train, X_test, y_train, y_test):
    """

    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :return:
    """
    print("⏫ Tuning for Gaussian NB")
    param_grid = {
        "var_smoothing": [1e-9, 1e-10, 1e-11, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0]
    }
    gnb = GaussianNB()
    grid_search_nb = GridSearchCV(
        gnb,
        param_grid,
        cv=5,
        scoring="accuracy",
        n_jobs=1,
        verbose=2,
    )
    grid_search_nb.fit(X_train, y_train)

    # Show out best estimator
    print(f"✅ Grid search best parameters: {grid_search_nb.best_params_}")
    print(f"✅ Grid search best score: {grid_search_nb.best_score_}")

    optimized_gnb = grid_search_nb.best_estimator_
    y_pred = optimized_gnb.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [10]:
_, gb_tfidf_accuracy, gb_tfidf_report = gaussian_nb_tuning(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, gb_bow_accuracy, gb_bow_report = gaussian_nb_tuning(Xtrain_bow, Xtest_bow, y_train, y_test)
_, gb_embeddings_accuracy, gb_embeddings_report = gaussian_nb_tuning(Xtrain_em, Xtest_em, y_train, y_test)

print("✅ the accuracy of Tfidf with Gaussian NB is: ", gb_tfidf_accuracy)
print("✅ the accuracy of Bow with Gaussian NB is: ", gb_bow_accuracy)
print("✅ the accuracy of Em with Gaussian NB is: ", gb_embeddings_accuracy)

⏫ Tuning for Gaussian NB
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ................................var_smoothing=1e-09; total time=   0.5s
[CV] END ................................var_smoothing=1e-09; total time=   0.5s
[CV] END ................................var_smoothing=1e-09; total time=   0.5s
[CV] END ................................var_smoothing=1e-09; total time=   0.5s
[CV] END ................................var_smoothing=1e-09; total time=   0.5s
[CV] END ................................var_smoothing=1e-10; total time=   0.5s
[CV] END ................................var_smoothing=1e-10; total time=   0.5s
[CV] END ................................var_smoothing=1e-10; total time=   0.5s
[CV] END ................................var_smoothing=1e-10; total time=   0.5s
[CV] END ................................var_smoothing=1e-10; total time=   0.5s
[CV] END ................................var_smoothing=1e-11; total time=   0.5s
[CV] END ..............

### **KNN**

In [14]:
def KNN_tuning(X_train, X_test, y_train, y_test):
    """
    Hyperparameters tuning for KNN with GridSearchCV
    :return:
    """
    print("✅ Tuning for KNN")
    knn = KNeighborsClassifier()
    param_grid = {
        "n_neighbors": [3, 5, 7],
        "weights": ["uniform", "distance"],
        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
        "metric": ["euclidean", "manhattan"],
        "p": [1, 2] # only use with Minkowski
    }
    random_search_knn = RandomizedSearchCV(
        knn,
        param_grid,
        n_iter=100,
        cv=5,
        scoring="accuracy",
        n_jobs=1,
        verbose=1,
        random_state=42,
    )
    random_search_knn.fit(X_train, y_train)

    # Best parameters after tuning
    print(f"✅ Random search best parameters: {random_search_knn.best_params_}")
    print(f"✅ Random search best score: {random_search_knn.best_score_}")

    # show result
    optimized_knn = random_search_knn.best_estimator_
    y_pred = optimized_knn.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return y_pred, score, report

In [15]:
_, knn_tfidf_accuracy, knn_tfidf_report = KNN_tuning(Xtrain_tfidf, Xtest_tfidf, y_train, y_test)
_, knn_bow_accuracy, knn_bow_report = KNN_tuning(Xtrain_bow, Xtest_bow, y_train, y_test)
_, knn_embeddings_accuracy, knn_embeddings_report = KNN_tuning(Xtrain_em, Xtest_em, y_train, y_test)

print("✅ the accuracy of Tfidf with KNN is: ", knn_tfidf_accuracy)
print("✅ the accuracy of Bow with KNN is: ", knn_bow_accuracy)
print("✅ the accuracy of Em with KNN is: ", knn_embeddings_accuracy)

✅ Tuning for KNN
Fitting 5 folds for each of 96 candidates, totalling 480 fits
✅ Random search best parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 7, 'metric': 'euclidean', 'algorithm': 'auto'}
✅ Random search best score: 0.7831249999999998
✅ Tuning for KNN
Fitting 5 folds for each of 96 candidates, totalling 480 fits
✅ Random search best parameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'metric': 'euclidean', 'algorithm': 'auto'}
✅ Random search best score: 0.415625
✅ Tuning for KNN
Fitting 5 folds for each of 96 candidates, totalling 480 fits
✅ Random search best parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 5, 'metric': 'manhattan', 'algorithm': 'auto'}
✅ Random search best score: 0.82875
✅ the accuracy of Tfidf with KNN is:  0.7925
✅ the accuracy of Bow with KNN is:  0.4025
✅ the accuracy of Em with KNN is:  0.82
