# Final Analysis and Baseline

In [3]:
# Full Classification Pipeline for Tournament Prediction

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Load data
def load_data(path):
    """Read a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file; handed straight to
            ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(path)


# NOTE(review): absolute, machine-specific path -- this line only works on
# the original author's machine; consider passing the path in instead.
df = load_data('/home/lambjos3/cmse492_project/data/raw/cbb.csv')

# Preprocess features
def preprocess(df, target_col):
    """Separate the target column from the remaining feature columns.

    Args:
        df: ``pandas.DataFrame`` holding both the features and the target.
        target_col: Label of the column to use as the target.

    Returns:
        A ``(X, y)`` tuple where ``X`` is a copy of ``df`` without
        ``target_col`` and ``y`` is the ``target_col`` column.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    # No encoding, scaling or filtering happens here: every other column is
    # passed through unchanged as a feature.
    return df.drop(columns=[target_col]), df[target_col]

# Build model pipelines
def build_models():
    """Create the unfitted candidate classifiers, keyed by a short name.

    Returns:
        A dict with the keys ``"logreg"``, ``"svm"``, ``"rf"`` and ``"gb"``
        (in that order). The first two are ``Pipeline`` objects that run a
        ``StandardScaler`` step named ``'scaler'`` before the classifier step
        named ``'clf'``; the tree ensembles are returned bare, without
        scaling.
    """
    def scaled(classifier):
        # Standardize the features before handing them to the classifier.
        steps = [('scaler', StandardScaler()), ('clf', classifier)]
        return Pipeline(steps)

    candidates = {}
    candidates["logreg"] = scaled(LogisticRegression(max_iter=500))
    # probability=True makes the SVC expose predict_proba.
    candidates["svm"] = scaled(SVC(probability=True))
    candidates["rf"] = RandomForestClassifier()
    candidates["gb"] = GradientBoostingClassifier()
    return candidates

# Train a model
def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and hand the same object back.

    Args:
        model: Any object with a ``fit(X, y)`` method; it is fitted in place.
        X_train: Training features, passed to ``fit`` unchanged.
        y_train: Training targets, passed to ``fit`` unchanged.

    Returns:
        The ``model`` object that was passed in (not the return value of
        ``fit``).
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

# Evaluate model
def evaluate(model, X_test, y_test):
    """Print a classification report for ``model`` and return its accuracy.

    Args:
        model: Fitted object with a ``predict(X)`` method.
        X_test: Held-out features, passed to ``predict`` unchanged.
        y_test: True labels matching ``X_test``.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(report)
    accuracy = accuracy_score(y_test, predictions)
    return accuracy

# Ensemble prediction
def ensemble_predict(models, X):
    """Average the models' probabilities and threshold the mean at 0.5.

    Args:
        models: Iterable of fitted classifiers exposing ``predict_proba``,
            or a dict mapping names to such classifiers (as returned by
            ``build_models`` once fitted), in which case the values are used.
        X: Feature matrix, passed unchanged to every ``predict_proba``.

    Returns:
        A 1-D integer ``numpy.ndarray`` with one entry per row of ``X``:
        ``1`` where the mean of the models' column-1 probabilities is
        strictly greater than 0.5, otherwise ``0``.

    Raises:
        ValueError: If ``models`` is empty. Averaging zero models would
            otherwise yield NaN and a meaningless all-zero answer.
    """
    # Iterating a dict yields its keys (strings), so unwrap the estimators.
    if isinstance(models, dict):
        estimators = list(models.values())
    else:
        estimators = list(models)
    if not estimators:
        raise ValueError("ensemble_predict requires at least one model")

    # Column 1 of predict_proba is taken as the positive-class probability;
    # assumes a binary target whose positive label sorts second -- TODO confirm.
    positive_probs = np.array(
        [estimator.predict_proba(X)[:, 1] for estimator in estimators]
    )
    mean_probs = positive_probs.mean(axis=0)
    return (mean_probs > 0.5).astype(int)

# Tournament simulation
def simulate_tournament(model, matchups_df):
    """Pick a winner for every matchup using a single fitted model.

    Args:
        model: Fitted object with a ``predict(X)`` method that returns one
            label per row.
        matchups_df: ``pandas.DataFrame`` with ``'team1'`` and ``'team2'``
            columns; every other column is used as a feature, in column
            order.

    Returns:
        A list with one entry per row of ``matchups_df``: the row's
        ``'team1'`` value when the prediction equals ``1``, otherwise its
        ``'team2'`` value. An empty frame yields an empty list.

    Raises:
        KeyError: If ``'team1'`` or ``'team2'`` is missing from
            ``matchups_df``.
    """
    # Keep the "no rows -> no winners" behavior without calling predict on
    # a zero-row matrix, which estimators commonly reject.
    if len(matchups_df) == 0:
        return []

    # One batched predict call instead of one call per row; the features are
    # passed as a plain array, as the per-row version did.
    features = matchups_df.drop(columns=['team1', 'team2']).to_numpy()
    picks = model.predict(features)

    return [
        first if pick == 1 else second
        for pick, first, second in zip(
            picks, matchups_df['team1'], matchups_df['team2']
        )
    ]

# Full pipeline
def run_pipeline(data_path, target_col, matchups_path):
    """Train every candidate model, report accuracy, and pick matchup winners.

    The training data is split 80/20 (``random_state=42``); each model from
    ``build_models`` is fitted on the training part and evaluated on the
    held-out part, printing its classification report and accuracy. The
    fitted models are then combined by ``ensemble_predict`` to choose a
    winner for every matchup.

    Relies on a ``load_data(path)`` callable that returns a DataFrame being
    in scope when this function is called.

    Args:
        data_path: Passed to ``load_data`` to obtain the training data.
        target_col: Label of the target column in the training data.
        matchups_path: Passed to ``load_data`` to obtain the matchups. The
            result must have ``'team1'`` and ``'team2'`` columns; every
            other column is used as a feature, in column order.

    Returns:
        A list with one winner per matchup row: the ``'team1'`` value when
        the ensemble predicts ``1``, otherwise the ``'team2'`` value.
    """
    df = load_data(data_path)
    X, y = preprocess(df, target_col)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    trained = {}
    for name, model in build_models().items():
        trained[name] = train_model(model, X_train, y_train)
        print(f"Model {name} accuracy:", evaluate(trained[name], X_test, y_test))

    matchups = load_data(matchups_path)
    # No matchups means no winners; skip predicting on a zero-row matrix.
    if len(matchups) == 0:
        return []

    # Score all matchups in one batched call through the shared ensemble
    # helper instead of re-implementing the averaging row by row.
    features = matchups.drop(columns=['team1', 'team2']).to_numpy()
    picks = ensemble_predict(list(trained.values()), features)

    return [
        first if pick == 1 else second
        for pick, first, second in zip(
            picks, matchups['team1'], matchups['team2']
        )
    ]