# Test Pre-Processed Data

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold
from xgboost import XGBClassifier

# Load dataset; reset_index(drop=True) gives the frame a fresh 0..n-1 index.
data = pd.read_csv("Pre-Processed_Dataset.csv").reset_index(drop=True)

# Drop the ELO columns, which this experiment treats as irrelevant.
data = data.drop(columns=["RED_ELO_BEFORE", "BLUE_ELO_BEFORE", "RED_ELO_PEAK", "BLUE_ELO_PEAK"])

# Convert fightTime columns to numeric; values that cannot be parsed become NaN.
data["red_fightTime"] = pd.to_numeric(data["red_fightTime"], errors="coerce")
data["blue_fightTime"] = pd.to_numeric(data["blue_fightTime"], errors="coerce")

# Keep only rows whose outcome is 'Red' or 'Blue'.
# The explicit .copy() makes `data` an independent frame, so the column
# assignments below write to it directly instead of to a filtered slice of the
# previous frame (which raises pandas' SettingWithCopyWarning).
data = data.loc[data['OUTCOME'].isin(['Red', 'Blue'])].copy()

# Encode categorical labels as integers. LabelEncoder numbers classes in sorted
# order, so after the filter above 'Blue' -> 0 and 'Red' -> 1.
label_encoder_outcome = LabelEncoder()
label_encoder_method = LabelEncoder()
data['OUTCOME'] = label_encoder_outcome.fit_transform(data['OUTCOME'])
data['METHOD'] = label_encoder_method.fit_transform(data['METHOD'])

# Define features and target: X is every column except the target, the bout
# metadata / result columns listed here, and the leftover "Unnamed" index columns.
X = data.drop(columns=[
    "OUTCOME", "METHOD", "BOUT", "EVENT", "WEIGHTCLASS", "REFEREE", "DETAILS",
    "URL", "TIME FORMAT", "TIME", "Unnamed: 0", "ROUND", "Unnamed: 0.1",
])
y = data["OUTCOME"]

# Train-test split: hold out 30% of the rows, then halve that hold-out into
# validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fit an initial Random Forest on the training rows; its importances are used
# below to rank the features.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=7,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
)
rf.fit(X_train, y_train)

# Pair each feature with its importance, least important first.
importances = rf.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by="Importance", ascending=True)

# Collect the names of the features whose importance falls below the cut-off.
threshold = 0.005  # tune this threshold
low_importance_features = importance_df.loc[
    importance_df["Importance"] < threshold, "Feature"
].tolist()
print(f"\nDropping {len(low_importance_features)} low-importance features (below {threshold}):")
print(low_importance_features)

# Remove those features from the full feature matrix.
X = X.drop(low_importance_features, axis=1)

# Redo the split on the reduced feature set. The row count, test sizes and
# random_state are the same as in the first split, so each row lands in the
# same train/validation/test partition as before; only the columns differ.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Retrain Random Forest on the reduced features and predict the test split.
rf = RandomForestClassifier(n_estimators=300, max_depth=7, min_samples_split=20, min_samples_leaf=10, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf_test = rf.predict(X_test)

# Train XGBoost with a regularised configuration: shallow trees, a small
# learning rate, row/column subsampling, L2 (reg_lambda) and L1 (reg_alpha)
# penalties, and a minimum split-loss reduction (gamma).
# The deprecated `use_label_encoder` argument is deliberately not passed: the
# target is already integer-encoded, and XGBoost 2.x removed the option, so
# supplying it only triggers an "unused parameter" warning on fit.
xgb = XGBClassifier(
    n_estimators=400,
    max_depth=3,
    learning_rate=0.03,
    subsample=0.7,
    colsample_bytree=0.6,
    reg_lambda=5.0,
    reg_alpha=2.0,
    gamma=4.0,
    random_state=42,
    eval_metric='logloss'
)

xgb.fit(X_train, y_train)
y_pred_xgb_test = xgb.predict(X_test)

# Evaluation: accuracy of both models on the held-out test split.
rf_test_accuracy = accuracy_score(y_test, y_pred_rf_test)
xgb_test_accuracy = accuracy_score(y_test, y_pred_xgb_test)
print(f"\nRandom Forest Accuracy: {rf_test_accuracy:.4f}")
print(f"XGBoost Accuracy: {xgb_test_accuracy:.4f}")

# Per-class precision / recall / F1 for each model on the same test split.
rf_report = classification_report(y_test, y_pred_rf_test)
print("\nClassification Report - Random Forest:")
print(rf_report)

xgb_report = classification_report(y_test, y_pred_xgb_test)
print("\nClassification Report - XGBoost:")
print(xgb_report)

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the Random Forest configuration on the full feature matrix
# and target (not just the training split).
cv_scores = cross_val_score(
    rf,
    X,
    y.values.ravel(),
    cv=cv,
    scoring="accuracy",
)

# Report the per-fold scores together with their mean and spread.
mean_cv_accuracy = cv_scores.mean()
std_cv_accuracy = cv_scores.std()
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_cv_accuracy:.4f}")
print(f"Standard Deviation: {std_cv_accuracy:.4f}")

# Feature Importance Plot: importances of the current Random Forest, paired
# with the remaining feature names and sorted ascending so the most important
# feature is drawn at the top of the horizontal bar chart.
final_importance_df = pd.DataFrame(
    {"Feature": X.columns, "Importance": rf.feature_importances_}
)
final_importance_df = final_importance_df.sort_values(by="Importance", ascending=True)

# Draw the chart through the figure/axes objects.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(
    final_importance_df["Feature"],
    final_importance_df["Importance"],
    color="skyblue",
)
ax.set_xlabel("Feature Importance")
ax.set_title("Random Forest Feature Importance")
fig.tight_layout()
plt.show()
