# PROJECT 4.1 - ENSEMBLE LEARNING TECHNIQUE
## `XGBoost, GradientBoost & AdaBoost test`

## Load Library

In [7]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor
import lightgbm as lgb

In [10]:
import pandas as pd
import numpy as np
from utils.EmbeddingVectorizer import tfidf_vectorizer, bow_vectorizer, EmbeddingVectorizer
from utils.text_preprocessing import category_processing

## 1. Import Data

In [12]:
# Read the pre-split train and test CSV files (paths are relative to the notebook).
train_data = pd.read_csv("../dataset/train_data.csv")
test_data = pd.read_csv("../dataset/test_data.csv")

# For each split, column 0 is used as the model input and column 1 as the
# target; every raw target value is passed through `category_processing`.
X_train, y_train = train_data.iloc[:, 0], train_data.iloc[:, 1].map(category_processing)
X_test, y_test = test_data.iloc[:, 0], test_data.iloc[:, 1].map(category_processing)

Check that the values of `y_train` and `y_test` have been converted by `category_processing` and that none of them is blank. If a blank cell is found, the missing value needs to be filled in (e.g. with the `mean` or a `prediction`).

In [15]:
# Count blank labels in each split after `category_processing`.
# NOTE(review): 0 is presumably the sentinel `category_processing` returns for a
# blank category -- confirm against utils/text_preprocessing. NaN is counted too,
# so a genuinely empty CSV cell is also caught.
check_train = (y_train.isna() | (y_train == 0)).sum()
# Fixed: this count previously used y_train again, so the test labels were
# never actually checked.
check_test = (y_test.isna() | (y_test == 0)).sum()
if check_test == 0 and check_train == 0:
    print("✅ No blank value from train & test target/ label")
else:
    # Report the problem explicitly instead of silently printing nothing.
    print(f"Blank target/ label values found - train: {check_train}, test: {check_test}")
print(y_train[:10])

✅ No blank value from train & test target/ label
0          cs
1     physics
2    astro-ph
3    cond-mat
4    astro-ph
5    astro-ph
6          cs
7        math
8          cs
9     physics
Name: categories, dtype: object


## 2. Embedding Data

In [5]:
# Build two feature representations of the same train/test inputs, one per
# vectorizer helper. Both helpers receive identical keyword arguments.
_split_kwargs = {"X_train": X_train, "X_test": X_test}
Xtrain_tfidf, Xtest_tfidf = tfidf_vectorizer(**_split_kwargs)
Xtrain_bow, Xtest_bow = bow_vectorizer(**_split_kwargs)

In [18]:
# Random forest (100 trees, fixed seed) trained and evaluated on the TF-IDF features.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(Xtrain_tfidf, y_train)
y_pred = rfc.predict(Xtest_tfidf)
# zero_division=0 reports 0.00 for classes the model never predicts, the same
# value as the default, but without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    astro-ph       0.98      0.88      0.93        59
    cond-mat       0.73      0.77      0.75        75
          cs       0.86      0.85      0.85       113
        math       0.65      0.93      0.76       109
     math-ph       0.00      0.00      0.00         9
     physics       0.00      0.00      0.00        35

    accuracy                           0.77       400
   macro avg       0.54      0.57      0.55       400
weighted avg       0.70      0.77      0.73       400

0.7675


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
# Refit the same random-forest estimator on the bag-of-words features.
# Calling fit again replaces the previous TF-IDF fit, so from here on `rfc`
# holds the BoW model.
rfc.fit(Xtrain_bow, y_train)
y_pred = rfc.predict(Xtest_bow)
# zero_division=0 reports 0.00 for classes the model never predicts, the same
# value as the default, but without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    astro-ph       0.98      0.86      0.92        59
    cond-mat       0.68      0.77      0.72        75
          cs       0.85      0.84      0.84       113
        math       0.66      0.92      0.77       109
     math-ph       0.00      0.00      0.00         9
     physics       0.00      0.00      0.00        35

    accuracy                           0.76       400
   macro avg       0.53      0.57      0.54       400
weighted avg       0.69      0.76      0.72       400

0.76


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [20]:
# AdaBoost (100 estimators, fixed seed) trained and evaluated on the TF-IDF features.
adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost.fit(Xtrain_tfidf, y_train)
y_pred = adaboost.predict(Xtest_tfidf)
# zero_division=0 reports 0.00 for classes the model never predicts, the same
# value as the default, but without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    astro-ph       0.94      0.58      0.72        59
    cond-mat       0.56      0.41      0.48        75
          cs       0.81      0.58      0.68       113
        math       0.45      0.94      0.61       109
     math-ph       0.00      0.00      0.00         9
     physics       0.00      0.00      0.00        35

    accuracy                           0.58       400
   macro avg       0.46      0.42      0.41       400
weighted avg       0.60      0.58      0.55       400

0.585


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [21]:
# Refit the same AdaBoost estimator on the bag-of-words features.
# Calling fit again replaces the previous TF-IDF fit, so from here on
# `adaboost` holds the BoW model.
adaboost.fit(Xtrain_bow, y_train)
y_pred = adaboost.predict(Xtest_bow)
# zero_division=0 keeps this report consistent with the other evaluation cells:
# a class that is never predicted is reported as 0.00 without a warning.
print(classification_report(y_test, y_pred, zero_division=0))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    astro-ph       0.98      0.69      0.81        59
    cond-mat       0.71      0.47      0.56        75
          cs       0.76      0.73      0.75       113
        math       0.49      0.85      0.62       109
     math-ph       0.00      0.00      0.00         9
     physics       0.22      0.06      0.09        35

    accuracy                           0.64       400
   macro avg       0.53      0.47      0.47       400
weighted avg       0.65      0.64      0.61       400

0.635
