In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the feature spreadsheet and keep only its first 5000 rows.
# (Presumably BERT-derived features, judging by the file name - confirm.)
df = pd.read_excel("/content/bert_features_output.xlsx").head(5000)

# Target is the 'label' column; the features are every remaining column
# once 'text_' and 'label' have been removed.
y = df['label']
X = df.drop(columns=['text_', 'label'])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train a random forest with a fixed seed on the standardized training features.
rf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict on both splits so the train and test scores can be compared.
y_pred_train = rf.predict(X_train_scaled)
y_pred_test = rf.predict(X_test_scaled)

# Report accuracy for train then test, followed by the per-class reports
# (test report first, train report second).
rf_train_accuracy = accuracy_score(y_train, y_pred_train)
print("Random Forest - Train Accuracy:", rf_train_accuracy)
rf_test_accuracy = accuracy_score(y_test, y_pred_test)
print("Random Forest - Test Accuracy:", rf_test_accuracy)
print(
    "\nClassification Report (Test):\n",
    classification_report(y_test, y_pred_test),
)
print(
    "\nClassification Report (Train):\n",
    classification_report(y_train, y_pred_train),
)



Random Forest - Train Accuracy: 1.0
Random Forest - Test Accuracy: 0.736

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.73      0.75      0.74       502
           1       0.74      0.72      0.73       498

    accuracy                           0.74      1000
   macro avg       0.74      0.74      0.74      1000
weighted avg       0.74      0.74      0.74      1000


Classification Report (Train):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2006
           1       1.00      1.00      1.00      1994

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000



XGBoost

In [None]:
import xgboost as xgb

# Gradient-boosted trees evaluated with log-loss and a fixed seed.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and prints 'Parameters: { "use_label_encoder" } are not used.'
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Predict on both splits so the train and test scores can be compared.
y_pred_train = xgb_model.predict(X_train_scaled)
y_pred_test = xgb_model.predict(X_test_scaled)

# Accuracy for each split, then the per-class reports (test first, then train).
print("XGBoost - Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("XGBoost - Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report (Test):\n", classification_report(y_test, y_pred_test))
print("\nClassification Report (Train):\n", classification_report(y_train, y_pred_train))



Parameters: { "use_label_encoder" } are not used.



XGBoost - Train Accuracy: 1.0
XGBoost - Test Accuracy: 0.734

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.74      0.73      0.73       502
           1       0.73      0.74      0.74       498

    accuracy                           0.73      1000
   macro avg       0.73      0.73      0.73      1000
weighted avg       0.73      0.73      0.73      1000


Classification Report (Train):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2006
           1       1.00      1.00      1.00      1994

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000



AdaBoost



In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with library defaults apart from the fixed seed.
ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train_scaled, y_train)

# Training-split predictions are evaluated first, then test-split predictions.
y_pred_train, y_pred_test = (
    ada_model.predict(X_train_scaled),
    ada_model.predict(X_test_scaled),
)

# Accuracy on each split.
print(
    "AdaBoost - Train Accuracy:",
    accuracy_score(y_train, y_pred_train),
)
print(
    "AdaBoost - Test Accuracy:",
    accuracy_score(y_test, y_pred_test),
)

# Per-class precision / recall / F1: test report first, then train.
print(
    "\nClassification Report (Test):\n",
    classification_report(y_test, y_pred_test),
)
print(
    "\nClassification Report (Train):\n",
    classification_report(y_train, y_pred_train),
)


AdaBoost - Train Accuracy: 0.7505
AdaBoost - Test Accuracy: 0.721

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.73      0.72      0.72       502
           1       0.72      0.73      0.72       498

    accuracy                           0.72      1000
   macro avg       0.72      0.72      0.72      1000
weighted avg       0.72      0.72      0.72      1000


Classification Report (Train):
               precision    recall  f1-score   support

           0       0.75      0.76      0.75      2006
           1       0.75      0.75      0.75      1994

    accuracy                           0.75      4000
   macro avg       0.75      0.75      0.75      4000
weighted avg       0.75      0.75      0.75      4000



Naive Bayes (Gaussian)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the standardized training features.
nb_model = GaussianNB().fit(X_train_scaled, y_train)

# Predict on both splits so the train and test scores can be compared.
y_pred_train = nb_model.predict(X_train_scaled)
y_pred_test = nb_model.predict(X_test_scaled)

# Accuracy on each split.
nb_train_accuracy = accuracy_score(y_train, y_pred_train)
print("Naive Bayes - Train Accuracy:", nb_train_accuracy)
nb_test_accuracy = accuracy_score(y_test, y_pred_test)
print("Naive Bayes - Test Accuracy:", nb_test_accuracy)

# Per-class reports: test first, then train.
nb_test_report = classification_report(y_test, y_pred_test)
print("\nClassification Report (Test):\n", nb_test_report)
nb_train_report = classification_report(y_train, y_pred_train)
print("\nClassification Report (Train):\n", nb_train_report)


Naive Bayes - Train Accuracy: 0.68775
Naive Bayes - Test Accuracy: 0.688

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.68      0.71      0.69       502
           1       0.69      0.67      0.68       498

    accuracy                           0.69      1000
   macro avg       0.69      0.69      0.69      1000
weighted avg       0.69      0.69      0.69      1000


Classification Report (Train):
               precision    recall  f1-score   support

           0       0.67      0.74      0.70      2006
           1       0.71      0.64      0.67      1994

    accuracy                           0.69      4000
   macro avg       0.69      0.69      0.69      4000
weighted avg       0.69      0.69      0.69      4000



MLP (Neural Network)



In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron: two hidden layers (128 and 64 units), at most
# 300 training iterations, fixed seed.
mlp_params = {
    "hidden_layer_sizes": (128, 64),
    "max_iter": 300,
    "random_state": 42,
}
mlp_model = MLPClassifier(**mlp_params)
mlp_model.fit(X_train_scaled, y_train)

# Predict on both splits so the train and test scores can be compared.
y_pred_train = mlp_model.predict(X_train_scaled)
y_pred_test = mlp_model.predict(X_test_scaled)

# Accuracy on each split.
print(
    "MLP - Train Accuracy:",
    accuracy_score(y_train, y_pred_train),
)
print(
    "MLP - Test Accuracy:",
    accuracy_score(y_test, y_pred_test),
)

# Per-class reports: test first, then train.
print(
    "\nClassification Report (Test):\n",
    classification_report(y_test, y_pred_test),
)
print(
    "\nClassification Report (Train):\n",
    classification_report(y_train, y_pred_train),
)


MLP - Train Accuracy: 1.0
MLP - Test Accuracy: 0.784

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.79      0.78      0.78       502
           1       0.78      0.79      0.78       498

    accuracy                           0.78      1000
   macro avg       0.78      0.78      0.78      1000
weighted avg       0.78      0.78      0.78      1000


Classification Report (Train):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2006
           1       1.00      1.00      1.00      1994

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000



K-Nearest Neighbors



In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes over the 5 closest training points.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Training-split predictions are evaluated first, then test-split predictions.
y_pred_train, y_pred_test = (
    knn_model.predict(X_train_scaled),
    knn_model.predict(X_test_scaled),
)

# Accuracy on each split.
knn_train_accuracy = accuracy_score(y_train, y_pred_train)
print("KNN - Train Accuracy:", knn_train_accuracy)
knn_test_accuracy = accuracy_score(y_test, y_pred_test)
print("KNN - Test Accuracy:", knn_test_accuracy)

# Per-class reports: test first, then train.
print(
    "\nClassification Report (Test):\n",
    classification_report(y_test, y_pred_test),
)
print(
    "\nClassification Report (Train):\n",
    classification_report(y_train, y_pred_train),
)


KNN - Train Accuracy: 0.79125
KNN - Test Accuracy: 0.688

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.65      0.83      0.73       502
           1       0.76      0.55      0.64       498

    accuracy                           0.69      1000
   macro avg       0.70      0.69      0.68      1000
weighted avg       0.70      0.69      0.68      1000


Classification Report (Train):
               precision    recall  f1-score   support

           0       0.75      0.89      0.81      2006
           1       0.86      0.70      0.77      1994

    accuracy                           0.79      4000
   macro avg       0.80      0.79      0.79      4000
weighted avg       0.80      0.79      0.79      4000



Decision Tree



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with library defaults apart from the fixed seed.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)

# Predict on both splits so the train and test scores can be compared.
y_pred_train = dt_model.predict(X_train_scaled)
y_pred_test = dt_model.predict(X_test_scaled)

# Accuracy on each split.
dt_train_accuracy = accuracy_score(y_train, y_pred_train)
print("Decision Tree - Train Accuracy:", dt_train_accuracy)
dt_test_accuracy = accuracy_score(y_test, y_pred_test)
print("Decision Tree - Test Accuracy:", dt_test_accuracy)

# Per-class reports: test first, then train.
dt_test_report = classification_report(y_test, y_pred_test)
print("\nClassification Report (Test):\n", dt_test_report)
dt_train_report = classification_report(y_train, y_pred_train)
print("\nClassification Report (Train):\n", dt_train_report)


Decision Tree - Train Accuracy: 1.0
Decision Tree - Test Accuracy: 0.601

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.60      0.64      0.62       502
           1       0.61      0.57      0.59       498

    accuracy                           0.60      1000
   macro avg       0.60      0.60      0.60      1000
weighted avg       0.60      0.60      0.60      1000


Classification Report (Train):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2006
           1       1.00      1.00      1.00      1994

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000

