# In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier,HistGradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,  classification_report
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from tabulate import tabulate
from collections import Counter
import warnings
warnings.filterwarnings("ignore", category=UserWarning)



# Load dataset. Resetting to a fresh RangeIndex gives every row a unique label,
# so the feature frame and target frame derived below stay row-aligned.
data = pd.read_parquet("Final_Model_Dataset.parquet").reset_index(drop=True)

# Coerce the fight-time columns to numeric; values that cannot be parsed become NaN.
data["red_fightTime"] = pd.to_numeric(data["red_fightTime"], errors="coerce")
data["blue_fightTime"] = pd.to_numeric(data["blue_fightTime"], errors="coerce")

# Keep only bouts whose outcome is 'Red' or 'Blue'.
# The explicit .copy() makes `data` an independent frame: without it the column
# assignments below write into a filtered selection and pandas emits
# SettingWithCopyWarning (chained assignment).
data = data.loc[data["OUTCOME"].isin(["Red", "Blue"])].copy()

# Encode categorical target variables as integers.
# LabelEncoder sorts its classes, so after the filter above: Blue -> 0, Red -> 1.
label_encoder_outcome = LabelEncoder()
label_encoder_method = LabelEncoder()
data["OUTCOME"] = label_encoder_outcome.fit_transform(data["OUTCOME"])
data["METHOD"] = label_encoder_method.fit_transform(data["METHOD"])

# Define features: everything except the targets and the bout-description columns.
X = data.drop(
    columns=[
        "OUTCOME",
        "METHOD",
        "BOUT",
        "EVENT",
        "WEIGHTCLASS",
        "REFEREE",
        "DETAILS",
        "URL",
        "TIME FORMAT",
        "TIME",
        "Unnamed: 0",
        "ROUND",
    ]
)
# Collapse every run of characters outside [A-Za-z0-9_] in a column name into "_".
# NOTE(review): presumably needed because some of the gradient-boosting libraries
# reject special characters in feature names -- confirm.
X.columns = X.columns.str.replace("[^A-Za-z0-9_]+", "_", regex=True)

# Target as a one-column frame. X and y are both derived from `data`, so they
# already share the same index and need no re-alignment.
y = data[["OUTCOME"]]

# Train / validation / test split: 70% train, then the remaining 30% is halved
# into 15% validation and 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Flatten the one-column target frames into 1-D arrays for the estimators.
y_train = y_train.values.ravel()
y_val = y_val.values.ravel()
y_test = y_test.values.ravel()

# Base models
# Every tree ensemble below is seeded with random_state=42 and kept shallow
# (depth 4-5) with explicit regularisation.

# Random forest: 200 shallow trees with fairly large leaves.
# class_weight is not passed, so it stays at the library default.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=15,
    min_samples_split=10,
)

# XGBoost: slow learning rate, row/column subsampling and L1/L2/gamma penalties.
# scale_pos_weight is not passed, so it stays at the library default.
xgb = XGBClassifier(
    random_state=42,
    n_estimators=100,
    learning_rate=0.02,
    max_depth=4,
    subsample=0.6,
    colsample_bytree=0.5,
    gamma=5.0,
    reg_alpha=4.0,
    reg_lambda=10.0,
    eval_metric="logloss",
    use_label_encoder=False,
)

# LightGBM: 300 boosting rounds; verbose=-1 turns the library's logging down.
# scale_pos_weight is not passed, so it stays at the library default.
lgb = LGBMClassifier(
    random_state=42,
    n_estimators=300,
    learning_rate=0.02,
    max_depth=4,
    min_child_samples=30,
    min_split_gain=0.01,
    subsample=0.6,
    colsample_bytree=0.6,
    reg_alpha=2.0,
    reg_lambda=3.0,
    verbose=-1,
)

# CatBoost: 300 iterations of depth-4 trees, trained with verbose=0.
# scale_pos_weight is not passed, so it stays at the library default.
cat = CatBoostClassifier(
    random_state=42,
    iterations=300,
    learning_rate=0.02,
    depth=4,
    l2_leaf_reg=9.0,
    verbose=0,
)

# Histogram gradient boosting: the only base model here that re-weights classes
# (class_weight="balanced"); early stopping is off so all 300 iterations run.
histgb = HistGradientBoostingClassifier(
    random_state=42,
    max_iter=300,
    learning_rate=0.02,
    max_depth=4,
    min_samples_leaf=20,
    l2_regularization=2.0,
    class_weight="balanced",
    early_stopping=False,
)

# Strongly regularised (C=0.06) L2 logistic regression, used further down the
# script as the stacking meta-model.
log_reg = LogisticRegression(
    penalty="l2",
    C=0.06,
    solver="liblinear",
    max_iter=1000,
)


# Base learners whose class-1 probabilities feed the stacking meta-model.
# The list order fixes the column order of every meta-feature matrix.
base_models = [rf, xgb, lgb, cat, histgb]


def _meta_features(fitted_models, features):
    """Return an (n_samples, n_models) array of class-1 probabilities.

    Column i holds ``fitted_models[i].predict_proba(features)[:, 1]``.
    """
    return np.column_stack(
        [fitted.predict_proba(features)[:, 1] for fitted in fitted_models]
    )


# Fit all models on the training split only.
for base_model in base_models:
    base_model.fit(X_train, y_train)

# Meta-features for stacking.
# train_meta holds in-sample predictions (the base models have already seen
# these rows); it is kept only for the training-accuracy report later in the
# script and must NOT be used to fit the meta-model.
train_meta = _meta_features(base_models, X_train)
# val_meta / test_meta are predictions on rows the base models never saw.
val_meta = _meta_features(base_models, X_val)
test_meta = _meta_features(base_models, X_test)

# Define voting classifier using soft voting (averaged class probabilities).
# XGBoost is deliberately left out of the vote; the entry is kept for reference.
voting_clf = VotingClassifier(
    estimators=[
        ("rf", rf),
        # ("xgb", xgb),
        ("lgb", lgb),
        ("cat", cat),
        ("histgb", histgb),
    ],
    voting="soft",
)

# Fit voting classifier (VotingClassifier fits its own clones of the estimators).
voting_clf.fit(X_train, y_train)

# Predict with voting classifier
y_pred_voting = voting_clf.predict(X_test)

# Fit the stacking meta-model on HELD-OUT base-model predictions.
# Fitting it on train_meta would leak the training labels: the base models are
# over-confident on rows they were trained on, so the meta-model would learn
# from probabilities that look nothing like the ones it gets at test time.
# The validation split is otherwise unused, so it serves as the blending set.
log_reg.fit(val_meta, y_val)
y_pred_stack = log_reg.predict(test_meta)

def _print_weighted_metrics(title, y_true, y_pred):
    """Print weighted precision/recall/F1 and the classification report."""
    print(f"\n🔍 {title} Detailed Metrics:")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"Recall:    {recall_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1 Score:  {f1_score(y_true, y_pred, average='weighted'):.4f}")
    print(classification_report(y_true, y_pred))


# Store all models for iteration (insertion order is the reporting order).
models = {
    "Random Forest": rf,
    "XGBoost": xgb,
    "LightGBM": lgb,
    "CatBoost": cat,
    "HistGradientBoosting": histgb,
    "Voting Classifier": voting_clf,
}

# Predict on the test split once per model; both the accuracy table and the
# per-model classification reports below reuse these arrays.
test_predictions = {name: model.predict(X_test) for name, model in models.items()}

# Evaluate base models + ensembles
results = []
for name, model in models.items():
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, test_predictions[name])
    results.append([name, f"{train_acc:.4f}", f"{test_acc:.4f}"])

# Add stacking results (the meta-model consumes meta-features, not raw X,
# so it is evaluated separately from the `models` loop).
train_acc_stack = accuracy_score(y_train, log_reg.predict(train_meta))
test_acc_stack = accuracy_score(y_test, y_pred_stack)
results.append(["Stacking Model", f"{train_acc_stack:.4f}", f"{test_acc_stack:.4f}"])

# Print performance table
print("\n📊 Model Accuracy Summary:")
print(tabulate(results, headers=["Model", "Train Accuracy", "Test Accuracy"], tablefmt="fancy_grid"))

# Print detailed metrics for ensemble models
_print_weighted_metrics("Voting Classifier", y_test, y_pred_voting)
_print_weighted_metrics("Stacking Model", y_test, y_pred_stack)

# Confusion Matrix for stacking
print("\n🧮 Confusion Matrix (Stacking Model):")
print(confusion_matrix(y_test, y_pred_stack))

# Confusion Matrix for voting
print("\n🧮 Confusion Matrix (Voting Classifier):")
print(confusion_matrix(y_test, y_pred_voting))

# Count class distribution of the encoded training labels
counter = Counter(y_train)
neg = counter[0]
pos = counter[1]
print(f"\n🔢 Class Distribution in Training Data: 0 -> {neg}, 1 -> {pos}")

# Detailed reports for all models
print("\n📄 Detailed Classification Reports:\n")
for name, y_pred_model in test_predictions.items():
    print(f"🔹 {name} Classification Report:")
    print(classification_report(y_test, y_pred_model))
    print("-" * 60)

# Reprint stacking model so it appears alongside the per-model reports
print("🔹 Stacking Model Classification Report:")
print(classification_report(y_test, y_pred_stack))
print("-" * 60)

