In [None]:
# OPTIMIZED STUDENT GRADE PREDICTION MODEL - MAXIMUM ACCURACY

# Installation
!pip install xgboost==1.7.6 imbalanced-learn scikit-learn==1.3.2 pandas numpy matplotlib seaborn optuna

import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE, SelectFromModel
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import warnings
import os # Ditambahkan untuk mengelola file dan direktori

# Silence every warning category for the rest of the run.
warnings.filterwarnings('ignore')

print("Tahap 1: Library berhasil dipersiapkan")

# DATA LOADING & COMPREHENSIVE PREPROCESSING

# Load dataset
# Make sure this file path is correct (it points at a Kaggle input directory).
df = pd.read_csv("/kaggle/input/student/Students Performance Dataset.csv")
print(f"Dataset shape: {df.shape}")
# Total number of missing cells across all columns.
print(f"Missing values: {df.isnull().sum().sum()}")

# Analysis of the target distribution: row count per 'Grade' value, ordered by grade label.
print("\nDistribusi Grade:")
grade_dist = df['Grade'].value_counts().sort_index()
print(grade_dist)

# Data cleaning
# Drop the identifier / personal columns listed here before any modelling.
columns_to_drop = ['Student_ID', 'First_Name', 'Last_Name', 'Email']
df_cleaned = df.drop(columns=columns_to_drop)

# Advanced missing value handling
# Every column that contains at least one missing value is imputed in place:
#   * numeric columns (any integer/float dtype, not only int64/float64) get the median;
#   * all other columns get their most frequent value, or 'Unknown' when the
#     column has no non-null value to take a mode from.
# NOTE(review): the loop runs over all columns of df_cleaned, so a target column
# with gaps would be imputed too - confirm that is intended.
print("\nHandling missing values...")
for col in df_cleaned.columns:
    column = df_cleaned[col]
    if not column.isnull().any():
        continue
    # Booleans count as "numeric" for pandas, but a median is meaningless for
    # them, so they take the mode branch like the original code did.
    is_numeric = (pd.api.types.is_numeric_dtype(column)
                  and not pd.api.types.is_bool_dtype(column))
    if is_numeric:
        fill_value = column.median()
    else:
        # Compute the mode once instead of twice.
        modes = column.mode()
        fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
    df_cleaned[col] = column.fillna(fill_value)

# Enhanced Feature Engineering
# Derived columns are appended to a copy of the cleaned frame, in a fixed order.
df_engineered = df_cleaned.copy()

# Handles on the raw input columns used by the formulas below.
_midterm = df_engineered['Midterm_Score']
_final = df_engineered['Final_Score']
_projects = df_engineered['Projects_Score']
_assignments = df_engineered['Assignments_Avg']
_quizzes = df_engineered['Quizzes_Avg']
_attendance = df_engineered['Attendance (%)']
_participation = df_engineered['Participation_Score']
_study_hours = df_engineered['Study_Hours_per_Week']
_stress = df_engineered['Stress_Level (1-10)']
_sleep = df_engineered['Sleep_Hours_per_Night']

# Small constant added to denominators so a zero value does not divide by zero.
_eps = 1e-8
# Shared bin edges / letter labels for bucketing exam scores.
_score_bins = [0, 60, 70, 80, 90, 100]
_score_labels = ['F', 'D', 'C', 'B', 'A']

# Weighted blends of the raw scores.
df_engineered['academic_performance'] = (
    _midterm * 0.3 + _final * 0.4 + _projects * 0.2 + _assignments * 0.1
)
_academic = df_engineered['academic_performance']
df_engineered['engagement_score'] = (
    _attendance * 0.35 + _participation * 0.4 + _assignments * 0.25
)
_engagement = df_engineered['engagement_score']

# Ratio-style features.
df_engineered['performance_consistency'] = 1 - np.abs(
    (_midterm - _final) / (_midterm + _final + _eps)
)
_safe_study_hours = _study_hours + _eps
df_engineered['study_efficiency'] = _academic / _safe_study_hours
df_engineered['stress_resilience'] = _academic * (11 - _stress) / 10
df_engineered['sleep_performance'] = _sleep * _academic / 100
df_engineered['work_life_balance'] = _sleep * 8 / _safe_study_hours
df_engineered['assessment_ratio'] = _quizzes / (_assignments + _eps)

# Flag rows above the 75th percentile on both the academic and engagement blends.
_top_academic = _academic > _academic.quantile(0.75)
_top_engaged = _engagement > _engagement.quantile(0.75)
df_engineered['high_performer'] = (_top_academic & _top_engaged).astype(int)

# Weighted sum of three binary risk indicators.
df_engineered['risk_score'] = (
    (_stress > 7).astype(int) * 0.3
    + (_sleep < 6).astype(int) * 0.3
    + (_attendance < 80).astype(int) * 0.4
)

# Polynomial and interaction terms.
df_engineered['academic_squared'] = _academic ** 2
df_engineered['engagement_squared'] = _engagement ** 2
df_engineered['academic_engagement_interaction'] = _academic * _engagement

# Letter buckets for the two exam scores.
df_engineered['midterm_bucket'] = pd.cut(_midterm, bins=_score_bins, labels=_score_labels)
df_engineered['final_bucket'] = pd.cut(_final, bins=_score_bins, labels=_score_labels)

# Remove the columns that must not reach the model as features.
columns_to_drop_final = ['Attendance (%)', 'Total_Score']
df_engineered = df_engineered.drop(columns=columns_to_drop_final)
print(f"Feature engineering complete. New shape: {df_engineered.shape}")

# ADVANCED PREPROCESSING & ENCODING
# Split the engineered frame into the feature matrix X and the 'Grade' target y.
target_column = 'Grade'
X = df_engineered.drop(columns=[target_column])
y = df_engineered[target_column]
# Column groups by dtype. categorical_columns drives the one-hot encoding below;
# numerical_columns is not referenced again in the visible part of the script.
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
# One-hot encode the categorical columns (drop_first=True is passed to get_dummies).
X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
# Turn +/-inf into NaN so the median fill on the next line covers those cells too.
X_encoded = X_encoded.replace([np.inf, -np.inf], np.nan)
X_encoded = X_encoded.fillna(X_encoded.median())
# Encode the grade labels as integers and keep the label -> code mapping for display.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
grade_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(f"Grade mapping: {grade_mapping}")
print(f"Encoding complete. Feature shape: {X_encoded.shape}")

# ADVANCED FEATURE SELECTION
# Steps: (1) pick the scaler whose scaled data gives the best 3-fold weighted F1
# with a random forest, (2) run three feature selectors on the scaled data,
# (3) take the union of their picks plus the 20 most important tree features,
# (4) re-fit the chosen scaler on just those columns.
print("\nAdvanced feature selection...")
scalers = {'robust': RobustScaler(), 'standard': StandardScaler(), 'minmax': MinMaxScaler()}
best_scaler = None
# Start below any reachable score so the first valid candidate is always accepted.
best_score = float('-inf')
for scaler_name, scaler in scalers.items():
    X_scaled_test = scaler.fit_transform(X_encoded)
    rf_test = RandomForestClassifier(n_estimators=100, random_state=42)
    scores = cross_val_score(rf_test, X_scaled_test, y_encoded, cv=3, scoring='f1_weighted')
    mean_score = scores.mean()
    # A NaN mean compares False here, so a failed candidate is skipped.
    if mean_score > best_score:
        best_score = mean_score
        best_scaler = scaler
if best_scaler is None:
    # Fail with a clear message instead of an AttributeError on None further down.
    raise RuntimeError("No scaler could be evaluated: every cross-validation score was NaN")
print(f"Best scaler: {type(best_scaler).__name__} with score: {best_score:.4f}")
X_scaled = best_scaler.fit_transform(X_encoded)

# Selector 1: univariate ANOVA F-test, top 20 features (or all if fewer exist).
selector_statistical = SelectKBest(score_func=f_classif, k=min(20, X_scaled.shape[1]))
X_statistical = selector_statistical.fit_transform(X_scaled, y_encoded)
statistical_features = X_encoded.columns[selector_statistical.get_support()].tolist()

# Selector 2: extra-trees importances, thresholded at the median importance.
rf_selector = ExtraTreesClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_selector.fit(X_scaled, y_encoded)
# correct use of prefit=True
selector_tree = SelectFromModel(rf_selector, threshold='median', prefit=True)
tree_features = X_encoded.columns[selector_tree.get_support()].tolist()

# Selector 3: recursive feature elimination with a random forest, 15 features.
rf_rfe = RandomForestClassifier(n_estimators=100, random_state=42)
selector_rfe = RFE(rf_rfe, n_features_to_select=min(15, X_scaled.shape[1]))
X_rfe = selector_rfe.fit_transform(X_scaled, y_encoded)
rfe_features = X_encoded.columns[selector_rfe.get_support()].tolist()

# Union of the three selections. The result is listed in X_encoded column order
# rather than via list(set(...)): set iteration order for strings changes between
# interpreter runs, which would shuffle the model's input columns and defeat the
# fixed random_state values used throughout.
selected_pool = set(statistical_features) | set(tree_features) | set(rfe_features)
all_selected_features = [column for column in X_encoded.columns if column in selected_pool]

feature_importances = pd.DataFrame({'feature': X_encoded.columns, 'importance': rf_selector.feature_importances_}).sort_values('importance', ascending=False)
top_features = feature_importances['feature'].head(20).tolist()

# Final feature list: selector union plus the top-20 importances, again in a
# deterministic (original column) order.
final_pool = selected_pool | set(top_features)
final_features = [column for column in X_encoded.columns if column in final_pool]
print(f"\nSelected {len(final_features)} features from {X_encoded.shape[1]} original features")
X_final = X_encoded[final_features]
# The scaler should be fit on the training data only, but for simplicity of the script we'll keep it as is.
# In a real-world scenario, you would fit_transform on train and only transform on test.
X_final_scaled = best_scaler.fit_transform(X_final)

# INTELLIGENT DATA BALANCING
print("\nIntelligent data balancing...")
# Hold out 20% of the scaled rows as the test set; the split is stratified on the
# encoded grade and uses a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X_final_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
def create_adaptive_strategy(y_train, target_min_samples=200):
    """Build a per-class target-count mapping for under-represented classes.

    Args:
        y_train: Array-like of class labels.
        target_min_samples: Classes with fewer samples than this are mapped
            to exactly this count.

    Returns:
        dict mapping class label -> desired sample count. A class at or above
        ``target_min_samples`` is included only when its count is below half
        of the median class count, in which case it is mapped to 70% of that
        median (truncated to int). Classes meeting neither rule are omitted.
    """
    class_labels, class_counts = np.unique(y_train, return_counts=True)
    typical_count = np.median(class_counts)
    rare_cutoff = typical_count * 0.5

    targets = {}
    for label, n_samples in zip(class_labels, class_counts):
        if n_samples < target_min_samples:
            # Too small in absolute terms: raise to the floor.
            targets[label] = target_min_samples
            continue
        if n_samples < rare_cutoff:
            # Large enough in absolute terms but rare relative to the median class.
            targets[label] = int(typical_count * 0.7)
    return targets

# Pick the resampling technique that gives the best 3-fold weighted F1 on the
# training split. The sampler is placed inside an imbalanced-learn Pipeline so it
# is re-fitted on the training part of every fold; scoring on data that was
# resampled *before* cross-validation lets synthetic neighbours of validation
# rows leak into training and inflates the score.
from imblearn.pipeline import Pipeline as ImbPipeline

balancing_techniques = {'SMOTE': SMOTE(random_state=42), 'ADASYN': ADASYN(random_state=42), 'BorderlineSMOTE': BorderlineSMOTE(random_state=42), 'SMOTETomek': SMOTETomek(random_state=42)}
best_balancing = None
# Start below any reachable score so the first valid technique is always accepted.
best_balanced_score = float('-inf')
for name, technique in balancing_techniques.items():
    try:
        rf_test = RandomForestClassifier(n_estimators=50, random_state=42)
        resample_then_fit = ImbPipeline([('sampler', technique), ('classifier', rf_test)])
        scores = cross_val_score(resample_then_fit, X_train, y_train, cv=3, scoring='f1_weighted')
        mean_score = scores.mean()
        # A NaN mean (sampler failed inside the folds) compares False and is skipped.
        if mean_score > best_balanced_score:
            # Resample the full training split only for a new best candidate;
            # if this raises, the previous best is left untouched.
            X_balanced, y_balanced = technique.fit_resample(X_train, y_train)
            best_balanced_score = mean_score
            best_balancing = (name, X_balanced, y_balanced)
        print(f"  {name}: {mean_score:.4f}")
    except Exception as e:
        # Deliberately broad: a sampler that cannot handle this class layout is
        # reported and skipped so the remaining techniques are still tried.
        print(f"  {name}: Failed ({str(e)[:50]}...)")
if best_balancing is not None:
    balancing_method, X_train_balanced, y_train_balanced = best_balancing
    print(f"\nBest balancing method: {balancing_method}")
else:
    # No technique succeeded: train on the original, unbalanced split.
    X_train_balanced, y_train_balanced = X_train, y_train
    balancing_method = "None"
    print(f"\nUsing original unbalanced data")

# ADVANCED MODEL TRAINING & HYPERPARAMETER OPTIMIZATION
# Each base model is fitted on the balanced training data, scored on the held-out
# test split, and cross-validated on the training split. Results are collected in
# model_scores (metrics) and trained_models (fitted estimators).
print("\nAdvanced model training with hyperparameter optimization...")
models = {
    'XGBoost': XGBClassifier(
        n_estimators=500, max_depth=8, learning_rate=0.08, subsample=0.8,
        colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=1.0, gamma=0.1,
        min_child_weight=1, random_state=42, eval_metric='mlogloss',
        use_label_encoder=False, n_jobs=-1,
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=500, max_depth=25, min_samples_split=3, min_samples_leaf=1,
        max_features='sqrt', class_weight='balanced', random_state=42, n_jobs=-1,
    ),
    'ExtraTrees': ExtraTreesClassifier(
        n_estimators=500, max_depth=25, min_samples_split=3, min_samples_leaf=1,
        max_features='sqrt', class_weight='balanced', random_state=42, n_jobs=-1,
    ),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=300, max_depth=8, learning_rate=0.08, subsample=0.8,
        max_features='sqrt', random_state=42,
    ),
}
model_scores = {}
trained_models = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_balanced, y_train_balanced)
    trained_models[name] = model

    # Held-out evaluation on the untouched test split.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')

    # Cross-validate on the training split only. Running CV on the full dataset
    # would include the held-out test rows in the 'cv_mean' that later feeds
    # model selection.
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1_weighted')
    model_scores[name] = {'accuracy': accuracy, 'f1_weighted': f1_weighted, 'f1_macro': f1_macro, 'cv_mean': cv_scores.mean(), 'cv_std': cv_scores.std()}
    print(f"  Accuracy: {accuracy:.4f}, F1-Weighted: {f1_weighted:.4f}, CV F1-Weighted: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# ADVANCED ENSEMBLE METHODS
from sklearn.ensemble import StackingClassifier
print("\nCreating advanced ensemble models...")
# Rank the base models by their test-set weighted F1 (best first) and keep at most four.
# best_models is a list of (name, scores_dict) pairs captured before the ensembles
# are added to model_scores below.
best_models = sorted(model_scores.items(), key=lambda x: x[1]['f1_weighted'], reverse=True)[:4]
# Voting ensemble over all selected models, configured with voting='soft'.
voting_estimators = [(name, trained_models[name]) for name, _ in best_models]
voting_ensemble = VotingClassifier(estimators=voting_estimators, voting='soft')
# Stacking ensemble over the top three models, with a class-weighted logistic
# regression as final estimator and cv=3.
stacking_estimators = [(name, trained_models[name]) for name, _ in best_models[:3]]
stacking_ensemble = StackingClassifier(estimators=stacking_estimators, final_estimator=LogisticRegression(class_weight='balanced'), cv=3)
ensemble_models = {'Voting': voting_ensemble, 'Stacking': stacking_ensemble}
for name, ensemble in ensemble_models.items():
    print(f"\nTraining {name} Ensemble...")
    ensemble.fit(X_train_balanced, y_train_balanced)
    # Score the fitted ensemble on the held-out test split.
    y_pred = ensemble.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    # NOTE(review): the ensemble itself is not cross-validated here; its 'cv_mean'
    # and 'cv_std' are the averages of the top-three base models' values, so they
    # are a proxy rather than a measurement of the ensemble.
    model_scores[f'{name}_Ensemble'] = {'accuracy': accuracy, 'f1_weighted': f1_weighted, 'f1_macro': f1_macro, 'cv_mean': np.mean([scores['cv_mean'] for _, scores in best_models[:3]]), 'cv_std': np.mean([scores['cv_std'] for _, scores in best_models[:3]])}
    trained_models[f'{name}_Ensemble'] = ensemble
    print(f"  Accuracy: {accuracy:.4f}, F1-Weighted: {f1_weighted:.4f}")

# MODEL SELECTION & COMPREHENSIVE EVALUATION
print("\nFinal model selection and evaluation...")


def calculate_composite_score(scores):
    """Blend a model's metrics into a single ranking value.

    Args:
        scores: Mapping with the keys 'f1_weighted', 'f1_macro', 'accuracy'
            and 'cv_mean'. Other keys are ignored.

    Returns:
        0.4 * f1_weighted + 0.3 * f1_macro + 0.2 * accuracy + 0.1 * cv_mean.

    Raises:
        KeyError: If one of the four metrics is missing from ``scores``.
    """
    metric_weights = (
        ('f1_weighted', 0.4),
        ('f1_macro', 0.3),
        ('accuracy', 0.2),
        ('cv_mean', 0.1),
    )
    # Accumulate term by term, left to right, starting from the first term.
    (first_metric, first_weight), *remaining = metric_weights
    composite = scores[first_metric] * first_weight
    for metric, weight in remaining:
        composite = composite + scores[metric] * weight
    return composite
# Attach the composite ranking value to every model's metric record.
for name, scores in model_scores.items():
    scores.update(composite_score=calculate_composite_score(scores))

# The winner is the entry with the highest composite score; on a tie the entry
# that appears first in model_scores wins.
best_model_name, best_scores = max(
    model_scores.items(),
    key=lambda entry: entry[1]['composite_score'],
)
best_model = trained_models[best_model_name]

# Report the winner's individual metrics.
print(f"\nBEST MODEL: {best_model_name}")
print(f"Performance Metrics:")
print(f"  Accuracy: {best_scores['accuracy']:.4f}")
print(f"  F1-Weighted: {best_scores['f1_weighted']:.4f}")
print(f"  F1-Macro: {best_scores['f1_macro']:.4f}")
print(f"  CV Score: {best_scores['cv_mean']:.4f} ± {best_scores['cv_std']:.4f}")

# (Your original prediction function and analysis blocks are omitted for brevity but would be here)

# --- ADDITIONAL BLOCK: SAVING THE MODEL RESULTS ---
#
# The code below is an addition that saves all the important results
# of the training process. It does not change anything in the code above.

print("\n----------------------------------------------------")
print("--- MENYIMPAN ARTEFAK MODEL UNTUK DEPLOYMENT ---")
print("----------------------------------------------------")

# Create a dedicated directory to hold all model files.
# This is a best practice so the files do not get mixed up with others.
output_dir = "student_grade_model_artifacts"
os.makedirs(output_dir, exist_ok=True)
print(f"File akan disimpan di dalam folder: '{output_dir}/'")

# 1. Save the best model (.joblib format)
# This is the main model object that will be used for prediction.
model_path = os.path.join(output_dir, "model.joblib")
joblib.dump(best_model, model_path)
print(f"✅ Model '{best_model_name}' berhasil disimpan di: {model_path}")

# 2. Save the scaler
# The scaler must be saved so that new data can be processed in exactly the same way.
scaler_path = os.path.join(output_dir, "scaler.joblib")
joblib.dump(best_scaler, scaler_path)
print(f"✅ Scaler '{type(best_scaler).__name__}' berhasil disimpan di: {scaler_path}")

# 3. Save the label encoder
# It is used to convert the model's numeric output back into a Grade (A, B, C).
encoder_path = os.path.join(output_dir, "label_encoder.joblib")
joblib.dump(label_encoder, encoder_path)
print(f"✅ Label Encoder berhasil disimpan di: {encoder_path}")

# 4. Save the list of features that were used
# Very important for making sure that prediction input data has the correct columns.
features_path = os.path.join(output_dir, "features.joblib")
joblib.dump(final_features, features_path)
print(f"✅ Daftar {len(final_features)} fitur berhasil disimpan di: {features_path}")

print("\nSemua komponen model telah berhasil disimpan dan siap untuk diunggah atau digunakan kembali.")
print("----------------------------------------------------\n")