In [None]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Lift pandas' row-display cap (None = no limit) so frames are rendered in full.
pd.options.display.max_rows = None

In [None]:
# Candidate values for the C / kernel / gamma hyperparameters (the same
# parameter names the SVC in this notebook is constructed with).
# NOTE(review): nothing in the visible notebook reads this mapping.
param_dist = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto'],
)


In [None]:
import os
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve
)
from sklearn.preprocessing import label_binarize

def _plot_fold_confusion_matrix(cm, classes, fold):
    """Show the confusion-matrix heatmap of one validation fold."""
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes, ax=ax)
    ax.set_title(f'Confusion Matrix - Fold {fold}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # one figure per fold: release it once displayed


def _plot_fold_roc_curves(y_true_bin, y_proba, classes, fold):
    """Show one-vs-rest ROC curves (one line per class) of one validation fold."""
    fig, ax = plt.subplots(figsize=(8, 6))
    for col, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_true_bin[:, col], y_proba[:, col])
        ax.plot(fpr, tpr, label=f'Class {cls}')
    ax.plot([0, 1], [0, 1], 'k--', label='Chance')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - Fold {fold}')
    ax.legend()
    fig.tight_layout()
    plt.show()
    plt.close(fig)


def train_phase(phase_path, random_state=42, sample=False):
    """Train an SVC on each of the five folds of a phase and keep the best one.

    For every ``fold1`` .. ``fold5`` directory under ``phase_path`` this loads
    the fold's ``scaler.pkl`` and CSV splits, fits an RBF ``SVC`` with fixed
    hyperparameters, evaluates it on the validation split (accuracy, macro-F1,
    macro one-vs-rest AUC), prints a classification report and shows a
    confusion matrix and ROC curves. The averages over folds are printed at
    the end.

    Args:
        phase_path: Directory containing the ``fold{n}`` sub-directories.
        random_state: Seed forwarded to ``SVC``.
        sample: When True, train on ``X_train_resampled.csv`` /
            ``y_train_resampled.csv`` and feed them to the model without
            applying the scaler. NOTE(review): presumably these files were
            already scaled before resampling - confirm against the code that
            produced them. When False, train on ``X_train.csv`` /
            ``y_train.csv`` transformed with the fold's scaler. The
            validation split is always transformed with the fold's scaler.

    Returns:
        tuple: ``(best_model, best_scaler, best_fold, all_metrics)`` where the
        first three belong to the fold with the highest validation macro-F1
        (earliest fold wins ties) and ``all_metrics`` is a list with one dict
        per fold holding ``fold``, ``accuracy``, ``f1_macro`` and ``auc``.
    """
    best_model = None
    best_scaler = None
    best_fold = None
    # Start below any attainable F1 so the first fold is always selected.
    # Starting at 0 with a strict '>' left best_scaler/best_fold unbound
    # (UnboundLocalError at return) whenever every fold scored F1 == 0.
    best_score = float('-inf')
    all_metrics = []

    for fold in range(1, 6):
        fold_path = os.path.join(phase_path, f"fold{fold}")
        scaler = joblib.load(f"{fold_path}/scaler.pkl")

        if sample:
            X_train = pd.read_csv(f"{fold_path}/X_train_resampled.csv")
            y_train = pd.read_csv(f"{fold_path}/y_train_resampled.csv").values.ravel()
            # Resampled features are used as-is (no scaler applied here).
            X_train_scaled = X_train
        else:
            X_train = pd.read_csv(f"{fold_path}/X_train.csv")
            y_train = pd.read_csv(f"{fold_path}/y_train.csv").values.ravel()
            X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)

        X_val = pd.read_csv(f"{fold_path}/X_val.csv")
        y_val = pd.read_csv(f"{fold_path}/y_val.csv").values.ravel()
        # Re-wrap with the training column names so fit and predict see the
        # same feature names.
        X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_train.columns)

        # SVM model with predefined hyperparameters
        model = SVC(C=0.1, kernel='rbf', gamma='scale', probability=True, random_state=random_state)
        model.fit(X_train_scaled, y_train)

        y_pred = model.predict(X_val_scaled)
        y_proba = model.predict_proba(X_val_scaled)

        acc = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average='macro')

        # AUC: build an indicator matrix whose columns line up with
        # predict_proba's columns (both ordered like model.classes_).
        classes = model.classes_
        y_val_bin = label_binarize(y_val, classes=classes)
        if y_val_bin.shape[1] == 1:
            # Two-class case: label_binarize emits a single column for
            # classes[1]; expand it to one column per class so the AUC call
            # and the per-class ROC loop index valid columns.
            y_val_bin = np.hstack([1 - y_val_bin, y_val_bin])
        auc = roc_auc_score(y_val_bin, y_proba, average='macro', multi_class='ovr')

        all_metrics.append({'fold': fold, 'accuracy': acc, 'f1_macro': f1, 'auc': auc})

        if f1 > best_score:
            best_score = f1
            best_model = model
            best_scaler = scaler
            best_fold = fold

        print(f"\nFold {fold} Classification Report:\n", classification_report(y_val, y_pred))

        # labels=classes pins the matrix rows/columns to the tick labels even
        # when a class is absent from this fold's validation labels/predictions.
        cm = confusion_matrix(y_val, y_pred, labels=classes)
        _plot_fold_confusion_matrix(cm, classes, fold)
        _plot_fold_roc_curves(y_val_bin, y_proba, classes, fold)

    # Average metrics (the mean is taken over every column, 'fold' included)
    avg_metrics = pd.DataFrame(all_metrics).mean().to_dict()
    print(f"\nAverage Accuracy: {avg_metrics['accuracy']:.4f}")
    print(f"Average F1 Macro: {avg_metrics['f1_macro']:.4f}")
    print(f"Average AUC: {avg_metrics['auc']:.4f}")

    return best_model, best_scaler, best_fold, all_metrics


In [None]:
# import os
# import pandas as pd
# import numpy as np
# import joblib
# import matplotlib.pyplot as plt
# import seaborn as sns

# from sklearn.metrics import (
#     accuracy_score, f1_score, classification_report, confusion_matrix,
#     roc_auc_score, roc_curve
# )
# from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.utils import to_categorical


# def create_ann_lstm_model(n_timesteps, n_features, n_outputs):
#     model = Sequential()
#     model._name = 'ANN-LSTM'
#     model.add(LSTM(200, input_shape=(n_timesteps, n_features), recurrent_dropout=0.2, name="LSTM_Layer"))
#     model.add(Dropout(0.5, name="Dropout_layer"))
#     model.add(Dense(100, activation='relu', name="ANN_Hidden_Layer"))
#     model.add(Dense(n_outputs, activation='softmax', name="ANN_Output_Layer"))
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
#     return model


# def train_phase(phase_path, random_state=42, sample=False):
#     best_model = None
#     best_score = 0
#     all_metrics = []

#     for fold in range(1, 6):
#         fold_path = os.path.join(phase_path, f"fold{fold}")
#         scaler = joblib.load(f"{fold_path}/scaler.pkl")

#         if sample:
#             X_train = pd.read_csv(f"{fold_path}/X_train_resampled.csv")
#             y_train = pd.read_csv(f"{fold_path}/y_train_resampled.csv").values.ravel()
#             X_train_scaled = X_train
#         else:
#             X_train = pd.read_csv(f"{fold_path}/X_train.csv")
#             y_train = pd.read_csv(f"{fold_path}/y_train.csv").values.ravel()
#             X_train_scaled = scaler.transform(X_train)
#             X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

#         X_val = pd.read_csv(f"{fold_path}/X_val.csv")
#         y_val = pd.read_csv(f"{fold_path}/y_val.csv").values.ravel()
#         X_val_scaled = scaler.transform(X_val)
#         X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_train.columns)

#         # Encode labels
#         le = LabelEncoder()
#         y_train_enc = le.fit_transform(y_train)
#         y_val_enc = le.transform(y_val)

#         # One-hot encode
#         y_train_cat = to_categorical(y_train_enc)
#         y_val_cat = to_categorical(y_val_enc)

#         # Reshape input to (samples, timesteps, features)
#         X_train_seq = np.expand_dims(X_train_scaled.values, axis=1)
#         X_val_seq = np.expand_dims(X_val_scaled.values, axis=1)

#         n_timesteps = X_train_seq.shape[1]
#         n_features = X_train_seq.shape[2]
#         n_outputs = y_train_cat.shape[1]

#         # Create and train model
#         model = create_ann_lstm_model(n_timesteps, n_features, n_outputs)
#         print(f"\nFold {fold} - Model Summary:")
#         print(model.summary())

#         history = model.fit(
#             X_train_seq, y_train_cat,
#             validation_data=(X_val_seq, y_val_cat),
#             epochs=30, batch_size=32, verbose=0
#         )

#         # Predict & evaluate
#         y_proba = model.predict(X_val_seq)
#         y_pred = np.argmax(y_proba, axis=1)
#         y_val_true = np.argmax(y_val_cat, axis=1)

#         acc = accuracy_score(y_val_true, y_pred)
#         f1 = f1_score(y_val_true, y_pred, average='macro')
#         auc = roc_auc_score(y_val_cat, y_proba, average='macro', multi_class='ovr')

#         all_metrics.append({'fold': fold, 'accuracy': acc, 'f1_macro': f1, 'auc': auc})

#         if f1 > best_score:
#             best_score = f1
#             best_model = model
#             best_scaler = scaler
#             best_fold = fold
#             best_encoder = le

#         print(f"\nFold {fold} Classification Report:\n", classification_report(y_val_true, y_pred, target_names=le.classes_.astype(str)))

#         # Confusion Matrix
#         cm = confusion_matrix(y_val_true, y_pred)
#         plt.figure(figsize=(6, 5))
#         sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#                     xticklabels=le.classes_, yticklabels=le.classes_)
#         plt.title(f'Confusion Matrix - Fold {fold}')
#         plt.xlabel('Predicted')
#         plt.ylabel('True')
#         plt.tight_layout()
#         plt.show()

#         # ROC Curve
#         plt.figure(figsize=(8, 6))
#         for i, cls in enumerate(le.classes_):
#             fpr, tpr, _ = roc_curve(y_val_cat[:, i], y_proba[:, i])
#             plt.plot(fpr, tpr, label=f'Class {cls}')
#         plt.plot([0, 1], [0, 1], 'k--', label='Chance')
#         plt.xlabel('False Positive Rate')
#         plt.ylabel('True Positive Rate')
#         plt.title(f'ROC Curve - Fold {fold}')
#         plt.legend()
#         plt.tight_layout()
#         plt.show()

#     # Average metrics
#     avg_metrics = pd.DataFrame(all_metrics).mean().to_dict()
#     print(f"\nAverage Accuracy: {avg_metrics['accuracy']:.4f}")
#     print(f"Average F1 Macro: {avg_metrics['f1_macro']:.4f}")
#     print(f"Average AUC: {avg_metrics['auc']:.4f}")

#     return best_model, best_scaler, best_fold, all_metrics


In [None]:
# Baseline run: train phases 1-4 on the original (non-resampled) training
# folds and persist each phase's best model and scaler.
phase_paths = [
    f"/kaggle/input/smotesvm-train-filtered-data/outputs/phase{i}"
    for i in range(1, 5)
]
results = {}

sample = False
# Tag embedded in the artifact file names written below.
temp = "sample" if sample else "no_sample"

for i, phase_path in enumerate(phase_paths, start=1):
    print(
        "\n===================================",
        f"\n======= Training Phase {i} ========",
        "\n===================================",
        sep="\n",
    )
    model, scaler, best_fold, metrics = train_phase(phase_path, sample=sample)

    # Save best model & scaler
    joblib.dump(model, f"best_model_{temp}_phase{i}.pkl")
    joblib.dump(scaler, f"best_scaler_{temp}_phase{i}.pkl")
    results[f"phase{i}"] = metrics


## Predict on Test Data

In [None]:
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

def predict_on_test(test_file_path, model_file, scaler_file, school_mapping_file):
    """Evaluate a saved classifier on a test CSV and return its predictions.

    Loads the model, scaler and school mapping with ``joblib``, reads the test
    CSV, drops the identifier/label columns, maps the ``school`` column (when
    present) through the mapping, scales the features and predicts. Prints a
    classification report and the macro one-vs-rest AUC, and shows a confusion
    matrix and per-class ROC curves.

    Args:
        test_file_path: CSV containing at least the columns ``user_id``,
            ``course_id``, ``label_encoded``, ``label`` and ``total_score``;
            the remaining columns are used as features.
        model_file: joblib file holding a fitted classifier that exposes
            ``predict``, ``predict_proba`` and ``classes_``.
        scaler_file: joblib file holding the fitted feature scaler.
        school_mapping_file: joblib file holding the mapping applied to the
            ``school`` column via ``Series.map``.

    Returns:
        The array of predicted labels from ``model.predict``.
    """
    # NOTE: joblib.load unpickles its input - only point these at trusted files.
    model = joblib.load(model_file)
    scaler = joblib.load(scaler_file)
    mapping = joblib.load(school_mapping_file)

    # Load and preprocess test data
    df_test = pd.read_csv(test_file_path)
    y_test = df_test['label_encoded']
    # `columns=` alone is sufficient; the extra `axis=1` was redundant.
    X_test = df_test.drop(columns=['user_id', 'course_id', 'label_encoded', 'label', 'total_score'])

    # Apply school mapping; schools missing from the mapping become 0.
    if 'school' in X_test.columns:
        X_test['school'] = X_test['school'].map(mapping).fillna(0).astype(int)

    # Scale features
    X_test_scaled = scaler.transform(X_test)

    # Predict
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)
    classes = model.classes_

    # Classification Report
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix. labels=classes keeps the matrix aligned with the tick
    # labels even if a class never occurs in y_test / y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=classes)
    fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes, ax=ax_cm)
    ax_cm.set_title('Confusion Matrix')
    ax_cm.set_xlabel('Predicted')
    ax_cm.set_ylabel('True')
    fig_cm.tight_layout()
    plt.show()
    plt.close(fig_cm)

    # AUC Score: indicator columns must line up with predict_proba's columns.
    y_test_bin = label_binarize(y_test, classes=classes)
    if y_test_bin.shape[1] == 1:
        # Two-class case: label_binarize emits a single column for classes[1];
        # expand to one column per class so scoring and plotting do not fail.
        y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
    auc_score = roc_auc_score(y_test_bin, y_proba, average='macro', multi_class='ovr')
    print(f"\nTest AUC (macro-average, OVR): {auc_score:.4f}")

    # Plot ROC Curves (one-vs-rest, one line per class)
    fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
    for col, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, col], y_proba[:, col])
        ax_roc.plot(fpr, tpr, label=f'Class {cls}')

    ax_roc.plot([0, 1], [0, 1], 'k--', label='Chance')
    ax_roc.set_title('ROC Curves for Test Set')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.legend()
    fig_roc.tight_layout()
    plt.show()
    plt.close(fig_roc)

    return y_pred


In [None]:
# import pandas as pd
# import joblib
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
# from sklearn.preprocessing import label_binarize


# def predict_on_test(test_file_path, model_file, scaler_file, school_mapping_file):
#     import numpy as np

#     # Load model, scaler, and mapping
#     model = joblib.load(model_file)  # This is a Keras model saved with joblib
#     scaler = joblib.load(scaler_file)
#     mapping = joblib.load(school_mapping_file)

#     # Load test data
#     df_test = pd.read_csv(test_file_path)
#     y_test = df_test['label_encoded'].values
#     X_test = df_test.drop(columns=['user_id', 'course_id', 'label_encoded', 'label', 'total_score'], axis=1)

#     if 'school' in X_test.columns:
#         X_test['school'] = X_test['school'].map(mapping).fillna(0).astype(int)

#     # Scale
#     X_test_scaled = scaler.transform(X_test)

#     # ✅ ADD THIS LINE: reshape to (batch, time_steps=1, features)
#     X_test_scaled = np.expand_dims(X_test_scaled, axis=1)

#     # Predict
#     y_proba = model.predict(X_test_scaled)
#     y_pred = np.argmax(y_proba, axis=1)

#     # Classification Report
#     print("\nClassification Report:\n")
#     print(classification_report(y_test, y_pred))

#     # Confusion Matrix
#     cm = confusion_matrix(y_test, y_pred)
#     plt.figure(figsize=(8, 6))
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#     plt.title('Confusion Matrix')
#     plt.xlabel('Predicted')
#     plt.ylabel('True')
#     plt.tight_layout()
#     plt.show()

#     # AUC
#     classes = np.unique(y_test)
#     y_test_bin = label_binarize(y_test, classes=classes)
#     auc_score = roc_auc_score(y_test_bin, y_proba, average='macro', multi_class='ovr')
#     print(f"\nTest AUC (macro-average, OVR): {auc_score:.4f}")

#     # ROC Curves
#     plt.figure(figsize=(8, 6))
#     for i, cls in enumerate(classes):
#         fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
#         plt.plot(fpr, tpr, label=f'Class {cls}')
#     plt.plot([0, 1], [0, 1], 'k--', label='Chance')
#     plt.title('ROC Curves for Test Set')
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.legend()
#     plt.tight_layout()
#     plt.show()

#     return y_pred


In [None]:
import glob

# Evaluate the baseline (non-resampled) models on each phase's test file.
# The artifact tag is set explicitly so this cell always loads the
# "no_sample" models instead of depending on whichever training cell last
# assigned `temp`.
temp = "no_sample"

for i in range(1, 5):
    test_dir = f'/kaggle/input/filtered-final-data/phase{i}/user_train_phase_{i}_test.csv'

    # Model and scaler saved by the training loop (paths are built directly;
    # no globbing is involved)
    model_path = f'/kaggle/working/best_model_{temp}_phase{i}.pkl'
    scaler_path = f'/kaggle/working/best_scaler_{temp}_phase{i}.pkl'

    # School mapping path
    school_mapping_file = f'/kaggle/input/smotesvm-train-filtered-data/outputs/phase{i}/mappings/school_mapping.pkl'

    # Predict on test set
    predict_on_test(test_dir, model_path, scaler_path, school_mapping_file)


# Train and Evaluate with SMOTE-Resampled Training Data

In [None]:
# Resampled run: train phases 1-4 on the *_resampled training files and
# persist each phase's best model and scaler.
phase_paths = [
    f"/kaggle/input/smotesvm-train-filtered-data/outputs/phase{i}"
    for i in range(1, 5)
]
results = {}

sample = True
# Tag embedded in the artifact file names written below.
temp = "sample" if sample else "no_sample"

for i, phase_path in enumerate(phase_paths, start=1):
    print(f"\n=== Training Phase {i} ===")
    model, scaler, best_fold, metrics = train_phase(phase_path, sample=sample)

    # Save best model & scaler
    joblib.dump(model, f"best_model_{temp}_phase{i}.pkl")
    joblib.dump(scaler, f"best_scaler_{temp}_phase{i}.pkl")
    results[f"phase{i}"] = metrics


In [None]:
import glob

# Evaluate the models trained on resampled data on each phase's test file.
# The artifact tag is set explicitly so this cell always loads the "sample"
# models instead of depending on whichever training cell last assigned `temp`.
temp = "sample"

for i in range(1, 5):
    test_dir = f'/kaggle/input/filtered-final-data/phase{i}/user_train_phase_{i}_test.csv'

    # Model and scaler saved by the training loop (paths are built directly;
    # no globbing is involved)
    model_path = f'/kaggle/working/best_model_{temp}_phase{i}.pkl'
    scaler_path = f'/kaggle/working/best_scaler_{temp}_phase{i}.pkl'

    # School mapping path
    school_mapping_file = f'/kaggle/input/smotesvm-train-filtered-data/outputs/phase{i}/mappings/school_mapping.pkl'

    # Predict on test set
    predict_on_test(test_dir, model_path, scaler_path, school_mapping_file)
