In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [None]:
# ----------------------------------
# Feature Engineering I
# ----------------------------------

def create_region_interaction_features(df):
    df["East_West_Diff"] = df["RegionWinner_East"] - df["RegionWinner_West"]
    df["South_Midwest_Diff"] = df["RegionWinner_South"] - df["RegionWinner_Midwest"]
    return df

def haversine_vectorized(df, lat1_col, lon1_col, lat2_col, lon2_col):
    R = 6371
    lat1, lon1 = np.radians(df[lat1_col]), np.radians(df[lon1_col])
    lat2, lon2 = np.radians(df[lat2_col]), np.radians(df[lon2_col])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    return R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

def create_enhanced_features(df_bracket, df_institutions):
    df = df_bracket.copy()
    for region in ["East", "West", "South", "Midwest"]:
        df = df.merge(
            df_institutions[["InstitutionID", "InstitutionLatitude", "InstitutionLongitude", "RegularSeasonWins", "InstitutionConference"]],
            how="left",
            left_on=f"RegionWinner_{region}",
            right_on="InstitutionID"
        ).rename(columns={
            "RegularSeasonWins": f"{region}_Wins",
            "InstitutionConference": f"{region}_Conference"
        })
        df[f"DistanceTo{region}"] = haversine_vectorized(
            df, "CustomerPostalCodeLatitude", "CustomerPostalCodeLongitude",
            "InstitutionLatitude", "InstitutionLongitude"
        )
        df.drop(["InstitutionID", "InstitutionLatitude", "InstitutionLongitude"], axis=1, inplace=True)
    return df

In [None]:
# ----------------------------------
# Data Loading
# ----------------------------------

train_data_path = "bracket_training.csv"
test_data_path = "bracket_test.csv"
institutions_path = "CCAC 2025 - Institutions.csv"
submission_template_path = "submission_template.csv"

df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(test_data_path)
df_institutions = pd.read_csv(institutions_path)
df_submission = pd.read_csv(submission_template_path)

In [None]:
# ----------------------------------
# Feature Engineering II and Preprocessing
# ----------------------------------

df_train_attempt = create_region_interaction_features(df_train.copy())
df_test_attempt = create_region_interaction_features(df_test.copy())
df_train_attempt = create_enhanced_features(df_train_attempt, df_institutions)
df_test_attempt = create_enhanced_features(df_test_attempt, df_institutions)

enhanced_features = [
    "RegionWinner_East", "RegionWinner_West", "RegionWinner_South", "RegionWinner_Midwest",
    "CustomerPostalCode", "CustomerPostalCodeLatitude", "CustomerPostalCodeLongitude",
    "East_West_Diff", "South_Midwest_Diff",
    "DistanceToEast", "DistanceToWest", "DistanceToSouth", "DistanceToMidwest",
    "East_Wins", "West_Wins", "South_Wins", "Midwest_Wins",
    "East_Conference", "West_Conference", "South_Conference", "Midwest_Conference"
]

for feature in enhanced_features:
    if feature not in df_test_attempt.columns:
        df_test_attempt[feature] = np.nan

df_train_attempt.replace("Unknown", np.nan, inplace=True)
df_test_attempt.replace("Unknown", np.nan, inplace=True)
df_train_attempt["CustomerPostalCode"] = pd.to_numeric(df_train_attempt["CustomerPostalCode"], errors="coerce")
df_test_attempt["CustomerPostalCode"] = pd.to_numeric(df_test_attempt["CustomerPostalCode"], errors="coerce")
median_zip = df_train_attempt["CustomerPostalCode"].median()
df_train_attempt["CustomerPostalCode"] = df_train_attempt["CustomerPostalCode"].fillna(median_zip)
df_test_attempt["CustomerPostalCode"] = df_test_attempt["CustomerPostalCode"].fillna(median_zip)

numeric_features = [col for col in enhanced_features if df_train_attempt[col].dtype in ["int64", "float64"]]
categorical_features = [col for col in enhanced_features if df_train_attempt[col].dtype == "object"]

numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

In [None]:
# ----------------------------------
# Configuration for Four Runs
# ----------------------------------

ATTEMPTS = [
    {"name": "Run 1: Default", "params": {"learning_rate": 0.0292, "subsample": 0.7546, "random_state": 42}},       # Private Score: 0.63439, Public Score: 0.62916
    {"name": "Run 2: High LR", "params": {"learning_rate": 0.05, "subsample": 0.7546, "random_state": 42}},         # Private Score: 0.63361, Public Score: 0.63002
    {"name": "Run 3: Seed 123", "params": {"learning_rate": 0.0292, "subsample": 0.7546, "random_state": 123}},     # Private Score: 0.63458, Public Score: 0.62900
    {"name": "Run 4: High Subsample", "params": {"learning_rate": 0.0292, "subsample": 0.9, "random_state": 42}}    # Private Score: 0.63324, Public Score: 0.63070 (*Final Selection*)
]

TARGETS = ["SemifinalWinner_East_West", "SemifinalWinner_South_Midwest", "NationalChampion"]
base_xgb_params = {
    'max_depth': 9, 'colsample_bytree': 0.5541, 'min_child_weight': 9, 'gamma': 3.7034,     # Did the best parameter search during a different session
    'n_estimators': 50, 'eval_metric': 'mlogloss', 'n_jobs': -1
}

In [7]:
# ----------------------------------
# Loop Over Attempts
# ----------------------------------

for attempt in ATTEMPTS:
    print(f"\n=== Running {attempt['name']} ===")
    df_submission_final = df_submission.copy()
    
    for tgt in TARGETS:
        print(f"Training for target: {tgt}")
        X = df_train_attempt[enhanced_features].copy()
        X_test = df_test_attempt[enhanced_features].copy()
        le_tgt = LabelEncoder()
        y = le_tgt.fit_transform(df_train_attempt[tgt].astype(str))
        
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
        X_train_enc = preprocessor.fit_transform(X_train)
        X_val_enc = preprocessor.transform(X_val)
        X_full_enc = preprocessor.fit_transform(X)
        X_test_enc = preprocessor.transform(X_test)
        
        # Combine base params with attempt-specific params
        xgb_params = {**base_xgb_params, **attempt["params"]}
        xgb_model = XGBClassifier(**xgb_params)
        xgb_model.fit(X_full_enc, y)
        
        val_preds = xgb_model.predict(X_val_enc)
        print(f"Validation accuracy for {tgt}: {accuracy_score(y_val, val_preds):.4f}")
        
        test_preds = xgb_model.predict(X_test_enc)
        test_preds_orig = le_tgt.inverse_transform(test_preds)
        df_submission_final[tgt] = test_preds_orig
    
    output_filename = f"{attempt['name'].replace(' ', '_').lower()}_submission.csv"
    df_submission_final.to_csv(output_filename, index=False)
    print(f"Submission file created: {output_filename}")


=== Running Run 1: Default ===
Training for target: SemifinalWinner_East_West
Validation accuracy for SemifinalWinner_East_West: 0.7005
Training for target: SemifinalWinner_South_Midwest
Validation accuracy for SemifinalWinner_South_Midwest: 0.6534
Training for target: NationalChampion
Validation accuracy for NationalChampion: 0.4868
Submission file created: run_1:_default_submission.csv

=== Running Run 2: High LR ===
Training for target: SemifinalWinner_East_West
Validation accuracy for SemifinalWinner_East_West: 0.7000
Training for target: SemifinalWinner_South_Midwest
Validation accuracy for SemifinalWinner_South_Midwest: 0.6555
Training for target: NationalChampion
Validation accuracy for NationalChampion: 0.4871
Submission file created: run_2:_high_lr_submission.csv

=== Running Run 3: Seed 123 ===
Training for target: SemifinalWinner_East_West
Validation accuracy for SemifinalWinner_East_West: 0.7004
Training for target: SemifinalWinner_South_Midwest
Validation accuracy for Sem