In [1]:
import pandas as pd
import json
import numpy as np
from typing import Dict, List, Tuple
#from collections import Counter
#from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib


class DataProcessor:
    def __init__(self, games_path: str = None, champion_info_path: str = None, champion_info_path_2: str = None):
        if games_path and champion_info_path:
            self.games_df = pd.read_csv(games_path)
            with open(champion_info_path, "r") as f:
                self.champion_info = json.load(f)["data"]
            
            with open(champion_info_path_2, "r") as f:
                self.champion_info_2 = json.load(f)["data"]
            
            self.all_tags = sorted({
                tag
                for info in self.champion_info_2.values()
                for tag in info.get("tags", [])
            })
            # Remove gameDuration < 17min
            self.games_df = self.games_df[self.games_df["gameDuration"] >= 1020]
            # Remove outliers
            # Calculate the IQR for gameDuration
            # IQR = Q3 - Q1
            durations = self.games_df["gameDuration"]
            Q1 = durations.quantile(0.25)
            Q3 = durations.quantile(0.75)
            IQR = Q3 - Q1
            upper_fence = Q3 + 1.5 * IQR
            self.games_df = self.games_df[durations <= upper_fence]

        else:
            self.games_df = None
            self.champion_info = None
        self.stats_cols = ["firstBlood"]

        # Initialize scaler
        self.scaler = StandardScaler()

        if games_path and champion_info_path:
            # Create mappings
            # Create a list of all champion keys from the dictionary
            champion_keys = [
                champion["key"]
                for champion in self.champion_info.values()
                if champion["key"] != "None"
            ]
            champion_keys.sort()

            print(f"Number of champions: {len(champion_keys)}")
            print(f"Number of games: {len(self.games_df)}")

            # Create mappings
            self.champion_id_to_key = {
                champion["id"]: champion["key"]
                for champion in self.champion_info.values()
                if champion["key"] != "None"
            }
            self.champion_key_to_id = {
                champion["key"]: champion["id"]
                for champion in self.champion_info.values()
                if champion["key"] != "None"
            }

    def save(self, path: str):
        """Save the data processor with its fitted scaler."""
        # Create a dictionary of attributes to save
        processor_state = {
            'champion_info': self.champion_info,
            'champion_info_2': self.champion_info_2,
            'all_tags': self.all_tags,
            'champion_id_to_key': self.champion_id_to_key,
            'champion_key_to_id': self.champion_key_to_id,
            'scaler': self.scaler,
            'games_df': self.games_df
        }
        joblib.dump(processor_state, path)

    @classmethod
    def load(cls, path: str):
        """Load a saved data processor."""
        processor_state = joblib.load(path)
        
        # Create a new instance
        processor = cls()
        
        # Restore the state
        processor.champion_info = processor_state['champion_info']
        processor.champion_info_2 = processor_state['champion_info_2']
        processor.all_tags = processor_state['all_tags']
        processor.champion_id_to_key = processor_state['champion_id_to_key']
        processor.champion_key_to_id = processor_state['champion_key_to_id']
        processor.scaler = processor_state['scaler']
        processor.games_df = processor_state['games_df']
        
        return processor

    def prepare_train_test_split(self, test_size=0.2, random_state=42):
        X, y = self.process_data()

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        
        # Fit and transform the training data
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test

    def process_data(self):
        """Process the data to create features and labels.
        
        Returns:
            X: pd.DataFrame - Features
            y: pd.Series - Labels
        """
        # Create features for both teams
        team1_features = self._create_team_features("t1")
        team2_features = self._create_team_features("t2")
        team1_tag_counts = self._create_tag_count_features("t1")
        team2_tag_counts = self._create_tag_count_features("t2")
        stats_cols = [
            "firstBlood"
        ]
        game_stats = self.games_df[stats_cols]
        # Combine features
        X = pd.concat([team1_features, team2_features, team1_tag_counts, team2_tag_counts, game_stats], axis=1)
        y = self.games_df["winner"].map(
            {1: 1, 2: 0}
        )  # Convert to binary (1 = team1 wins, 0 = team2 wins)
        
        # Remove All 0 Columns
        # variances = X.var(axis=0)
        # zero_var = variances[variances == 0].index.tolist()
        # if zero_var:
        #     print(f"Dropping zero‐variance columns: {zero_var}")
        #     X = X.drop(columns=zero_var)
        # self._dropped_cols = zero_var

        return X, y

    def _create_tag_count_features(self, team_prefix: str):
        """
        count the number of each tag in each team
        for_example team1_tank_num, team2_fighter_num。
        """
        cols = [f"{team_prefix}_{tag.lower()}_num" for tag in self.all_tags]
        df = pd.DataFrame(0, index=self.games_df.index, columns=cols)

        for tag in self.all_tags:
            col = f"{team_prefix}_{tag.lower()}_num"
            def count_tag(row):
                cnt = 0
                for i in range(1, 6):
                    champ_id = row[f"{team_prefix}_champ{i}id"]
                    champ_key = self.champion_id_to_key.get(champ_id)
                    info = self.champion_info_2.get(champ_key, {})
                    if tag in info.get("tags", []):
                        cnt += 1
                return cnt
            df[col] = self.games_df.apply(count_tag, axis=1)

        return df
    
    def _create_team_features(self, team_prefix):
        """Create features for a specific team.
        
        Args:
            team_prefix: str - Prefix for the team (either "t1" or "t2")
            
        Returns:
            pd.DataFrame: DataFrame with features in the format expected by the model
        """
        # Initialize the feature columns
        feature_cols = [
            f"{champ_key}_picked_{team_prefix}"
            for champ_key in self.champion_id_to_key.values()
        ]
        # start all zeros
        feature_df = pd.DataFrame(0, index=self.games_df.index, columns=feature_cols)

        # for each pick‐slot, ADD into the correct champion column
        for i in range(1, 6):
            slot_col = f"{team_prefix}_champ{i}id"
            # vectorized per‐champ:
            for champ_id, champ_key in self.champion_id_to_key.items():
                mask = (self.games_df[slot_col] == champ_id)
                feature_df.loc[mask, f"{champ_key}_picked_{team_prefix}"] += 1

        return feature_df

    def get_champion_name(self, champion_id: int) -> str:
        """Convert champion ID to name."""
        return self.champion_id_to_key.get(champion_id, "Unknown")

    def get_all_champions(self) -> List[Dict[str, str]]:
        """Get list of all champions with their IDs and names."""
        return [
            {"id": str(id), "name": name}
            for id, name in self.champion_id_to_key.items()
        ]

    def prepare_prediction_data(
        self,
        team1_champs: List[str],
        team2_champs: List[str],
    ) -> np.ndarray:
        # 1) build the same pick+tag columns you used in process_data()
        feature_cols = []
        for t in ("t1", "t2"):
            # champion‐pick columns, in the same order as your training code
            for champ_key in self.champion_key_to_id:
                feature_cols.append(f"{champ_key}_picked_{t}")
            # tag‐count columns
            for tag in self.all_tags:
                feature_cols.append(f"{t}_{tag.lower()}_num")

        # 2) initialize a zero‐row DataFrame
        features = pd.DataFrame(0, index=[0], columns=feature_cols)

        # 3) fill in picks
        for champ in team1_champs:
            features.at[0, f"{champ}_picked_t1"] += 1
        for champ in team2_champs:
            features.at[0, f"{champ}_picked_t2"] += 1

        # 4) fill in tag‐counts
        for champ in team1_champs:
            for tag in self.champion_info_2[champ]["tags"]:
                features.at[0, f"t1_{tag.lower()}_num"] += 1
        for champ in team2_champs:
            for tag in self.champion_info_2[champ]["tags"]:
                features.at[0, f"t2_{tag.lower()}_num"] += 1

        # 5) make sure your stats columns are present (and in the same order!)
        #    you probably should have done `self.stats_cols = [...]` in __init__
        for stat in self.stats_cols:
            if stat not in features.columns:
                features[stat] = 0.0

        # 6) **THIS IS THE KEY**: reorder to exactly what the scaler saw during fit
        features = features.loc[:, self.scaler.feature_names_in_]

        # 7) finally scale and return
        return self.scaler.transform(features)


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import joblib
import pandas as pd
import numpy as np
from typing import List
import json


class LogisticRegressionScratch:
    def __init__(self, C: float = 1.0, lr: float = 0.01, n_iter: int = 1000):
        """
        C     : inverse regularization strength (bigger C => less reg)
        lr    : learning rate
        n_iter: number of gradient steps
        """
        self.C = C
        self.lr = lr
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        n_samples, n_features = X.shape
        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        # lambda = 1/C
        lambda_param = 1.0 / self.C

        for _ in range(self.n_iter):
            # linear model
            linear = X.dot(self.weights) + self.bias
            # sigmoid
            y_pred = 1.0 / (1.0 + np.exp(-linear))

            # gradients w/ L2 regularization
            dw = (1.0 / n_samples) * X.T.dot(y_pred - y) \
                 + lambda_param * self.weights
            db = (1.0 / n_samples) * np.sum(y_pred - y)

            # update
            self.weights -= self.lr * dw
            self.bias    -= self.lr * db

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        linear = X.dot(self.weights) + self.bias
        probs  = 1.0 / (1.0 + np.exp(-linear))
        return np.vstack([1-probs, probs]).T

    def predict(self, X: np.ndarray) -> np.ndarray:
        return (self.predict_proba(X)[:,1] >= 0.5).astype(int)


class SVMScratch:
    def __init__(self, C=1.0, lr=1e-3, n_iter=1000, kernel='linear', gamma='scale', n_components=100):
        self.C = C
        self.lr = lr
        self.n_iter = n_iter
        self.kernel = kernel
        self.gamma = gamma
        self.n_components = n_components
        self.w = None
        self.b = None
        self.W = None
        self.basis = None

    def _compute_gamma(self, X):
        n_features = X.shape[1]
        if self.gamma == 'scale':
            var = np.var(X, axis=0).mean()
            return 1.0 / (n_features * var) if var > 0 else 1.0
        elif self.gamma == 'auto':
            return 1.0 / n_features
        else:
            return float(self.gamma)

    def _rbf_feature_map(self, X):
        proj = X.dot(self.W) + self.basis
        return np.sqrt(2.0/self.n_components) * np.cos(proj)

    def fit(self, X, y):
        # map {0,1} → {-1,+1}
        y_signed = np.where(y<=0, -1, +1)
        n_samples, n_features = X.shape

        # 1) transform to RFF if needed
        if self.kernel == 'rbf':
            γ = self._compute_gamma(X)
            self.W     = np.random.normal(scale=np.sqrt(2*γ), size=(n_features, self.n_components))
            self.basis = np.random.uniform(0, 2*np.pi, size=self.n_components)
            X_train = self._rbf_feature_map(X)
        else:
            X_train = X

        # 2) init
        D = X_train.shape[1]
        self.w = np.zeros(D)
        self.b = 0.0

        # 3) SGD
        for _ in range(self.n_iter):
            for i in range(n_samples):
                xi = X_train[i]
                yi = y_signed[i]
                margin = yi * (xi.dot(self.w) + self.b)

                if margin >= 1:
                    # only regularizer ∂(½‖w‖²)/∂w = w
                    grad_w = self.w
                    grad_b = 0.0
                else:
                    # ∂hinge/∂w = –C yi xi, ∂hinge/∂b = –C yi
                    grad_w = self.w - self.C * yi * xi
                    grad_b = -self.C * yi

                self.w -= self.lr * grad_w
                self.b -= self.lr * grad_b

    def decision_function(self, X):
        if self.kernel == 'rbf':
            Z = self._rbf_feature_map(X)
            return Z.dot(self.w) + self.b
        else:
            return X.dot(self.w) + self.b

    def predict(self, X):
        return (self.decision_function(X) >= 0).astype(int)

    def predict_proba(self, X):
        scores = self.decision_function(X)
        probs = 1.0/(1.0 + np.exp(-scores))
        return np.vstack([1-probs, probs]).T

    
    
class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # for leaf nodes


class DecisionTreeScratch:
    def __init__(self, min_samples_split=2, max_depth=100):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.root = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        num_labels = len(np.unique(y))

        # stopping criteria
        if (depth >= self.max_depth
            or n_samples < self.min_samples_split
            or num_labels == 1):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        best_feat, best_thresh = self._best_criteria(X, y, n_features)
        if best_feat is None:
            return Node(value=self._most_common_label(y))

        left_idx, right_idx = self._split(X[:, best_feat], best_thresh)
        left  = self._grow_tree(X[left_idx, :], y[left_idx], depth + 1)
        right = self._grow_tree(X[right_idx, :], y[right_idx], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, n_features):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat in range(n_features):
            thresholds = np.unique(X[:, feat])
            for t in thresholds:
                left_idx, right_idx = self._split(X[:, feat], t)
                # **skip** thresholds that put everything on one side
                if len(left_idx) == 0 or len(right_idx) == 0:
                    continue
                gain = self._information_gain(y, X[:, feat], t)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat
                    split_thresh = t
        return split_idx, split_thresh

    def _information_gain(self, y, X_col, split_thresh):
        # parent Gini
        parent_gini = self._gini(y)
        left_idx, right_idx = self._split(X_col, split_thresh)
        if len(left_idx) == 0 or len(right_idx) == 0:
            return 0
        n = len(y)
        n_l, n_r = len(left_idx), len(right_idx)
        gini_l = self._gini(y[left_idx])
        gini_r = self._gini(y[right_idx])
        # weighted
        child_gini = (n_l/n) * gini_l + (n_r/n) * gini_r
        return parent_gini - child_gini

    def _gini(self, y):
        counts = np.bincount(y)
        ps = counts / counts.sum()
        return 1 - np.sum(ps**2)

    def _split(self, X_col, split_thresh):
        left_idx  = np.where(X_col <= split_thresh)[0]
        right_idx = np.where(X_col >  split_thresh)[0]
        return left_idx, right_idx

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

    def _traverse(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._traverse(x, node.left)
        return self._traverse(x, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return np.array([self._traverse(x, self.root) for x in X])
    
class RandomForestScratch:
    def __init__(self,
                 n_estimators=100,
                 max_depth=10,
                 min_samples_split=2,
                 max_features='sqrt'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees: List[DecisionTreeScratch] = []
        self.features_idxs: List[np.ndarray] = []

    def fit(self, X: np.ndarray, y: np.ndarray):
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if self.max_features == 'sqrt':
            max_feats = int(np.sqrt(n_features))
        elif self.max_features == 'log2':
            max_feats = int(np.log2(n_features))
        else:
            max_feats = n_features

        for _ in range(self.n_estimators):
            # bootstrap sample
            idxs = np.random.choice(n_samples, n_samples, replace=True)
            X_samp, y_samp = X[idxs], y[idxs]
            feat_idxs = np.random.choice(n_features, max_feats, replace=False)

            tree = DecisionTreeScratch(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth
            )
            # train on subset of features
            tree.fit(X_samp[:, feat_idxs], y_samp)
            self.trees.append(tree)
            self.features_idxs.append(feat_idxs)

    def predict(self, X: np.ndarray) -> np.ndarray:
        # collect each tree's predictions
        all_preds = np.array([
            tree.predict(X[:, fi]) for tree, fi in zip(self.trees, self.features_idxs)
        ])
        # majority vote
        return np.apply_along_axis(lambda row: np.bincount(row).argmax(),
                                   axis=0,
                                   arr=all_preds)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        # fraction of trees voting class 1
        all_votes = np.array([
            tree.predict(X[:, fi]) for tree, fi in zip(self.trees, self.features_idxs)
        ])
        proba1 = all_votes.mean(axis=0)
        return np.vstack([1 - proba1, proba1]).T
    
class DecisionTreeRegressorScratch:
    def __init__(self, min_samples_split=2, max_depth=3):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.root = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        if (depth >= self.max_depth) or (n_samples < self.min_samples_split):
            leaf_val = y.mean()
            return Node(value=leaf_val)

        best_feat, best_thresh = None, None
        best_loss = float('inf')

        for feat in range(n_features):
            for t in np.unique(X[:, feat]):
                left_idx, right_idx = np.where(X[:, feat] <= t)[0], np.where(X[:, feat] > t)[0]
                if len(left_idx) == 0 or len(right_idx) == 0:
                    continue
                loss = (
                    len(left_idx) * y[left_idx].var() +
                    len(right_idx)* y[right_idx].var()
                ) / n_samples
                if loss < best_loss:
                    best_loss, best_feat, best_thresh = loss, feat, t

        if best_feat is None:
            return Node(value=y.mean())

        left_idx, right_idx = np.where(X[:, best_feat] <= best_thresh)[0], np.where(X[:, best_feat] > best_thresh)[0]
        left = self._grow_tree(X[left_idx], y[left_idx], depth+1)
        right= self._grow_tree(X[right_idx], y[right_idx], depth+1)
        return Node(best_feat, best_thresh, left, right)

    def _traverse(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._traverse(x, node.left)
        return self._traverse(x, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return np.array([self._traverse(x, self.root) for x in X])


class XGBoostScratch:
    def __init__(self,
                 n_estimators=100,
                 learning_rate=0.1,
                 max_depth=3,
                 min_samples_split=2):
        self.n_estimators      = n_estimators
        self.lr                = learning_rate
        self.max_depth         = max_depth
        self.min_samples_split = min_samples_split
        self.trees: List[DecisionTreeRegressorScratch] = []
        self.init_score = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        # initialize log-odds
        if hasattr(y, "to_numpy"):
            y = y.to_numpy()

        p = np.clip(y.mean(), 1e-5, 1 - 1e-5)
        self.init_score = np.log(p / (1 - p))
        y_pred = np.full_like(y, self.init_score, dtype=float)

        for _ in range(self.n_estimators):
            # pseudo-residuals
            pred_proba = 1 / (1 + np.exp(-y_pred))
            grad = y - pred_proba

            tree = DecisionTreeRegressorScratch(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth
            )
            tree.fit(X, grad)
            self.trees.append(tree)

            update = tree.predict(X)
            y_pred += self.lr * update

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        y_pred = np.full(X.shape[0], self.init_score, dtype=float)
        for tree in self.trees:
            y_pred += self.lr * tree.predict(X)
        probs = 1 / (1 + np.exp(-y_pred))
        return np.vstack([1 - probs, probs]).T

    def predict(self, X: np.ndarray) -> np.ndarray:
        return (self.predict_proba(X)[:, 1] >= 0.5).astype(int)
    



class DraftPredictor:
    def __init__(
        self,
        champion_info_path: str,
        model_type: str = 'xgboost',
        # shared hyper-params
        lr: float = 0.01,
        n_iter: int = 1000,
        C: float = 1.0,
        kernel: str = 'linear',
        gamma: str = 'scale',
        # RF / XGBoost
        n_estimators: int = 100,
        max_depth: int = None,
        learning_rate: float = 0.1,
        n_components: int = 100
    ):
        mt = model_type.lower()

        if mt == 'logistic_regression':
            # our scratch logistic takes C, lr, n_iter
            self.model = LogisticRegressionScratch(
                C=C,
                lr=lr,
                n_iter=n_iter
            )

        elif mt == 'svm':
            # scratch SVM takes C, lr, n_iter, kernel, gamma
            self.model = SVMScratch(
                C=C,
                lr=lr,
                n_iter=n_iter,
                kernel=kernel,
                gamma=gamma,
                n_components=n_components
            )

        elif mt == 'random_forest':
            self.model = RandomForestScratch(
                n_estimators=n_estimators,
                max_depth=max_depth
            )

        elif mt == 'xgboost':
            self.model = XGBoostScratch(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate
            )

        else:
            raise ValueError(f"Unknown model_type '{model_type}'")

        # load champion info
        with open(champion_info_path, 'r') as f:
            data = json.load(f)["data"]

        self.champion_info = data
        self.id_to_name = {
            int(champ["id"]): champ["key"]
            for champ in data.values()
            if champ["key"] != "None"
        }

    def train(self, X: np.ndarray, y: np.ndarray):
        print(f"Training {self.model.__class__.__name__} on {X.shape[0]} samples, {X.shape[1]} features")
        self.model.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        return self.model.predict_proba(X)[:, 1]

    def save(self, path: str):
        joblib.dump({'model': self.model}, path)

    def load(self, path: str):
        state = joblib.load(path)
        self.model = state['model']

In [5]:
import argparse
import os
from data_processor import DataProcessor
from model import DraftPredictor
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
)
def main():
    # Parse command line arguments
    # determine the directory this file lives in
    HERE = os.path.dirname(os.path.abspath(__file__))

    # build absolute defaults
    DEFAULT_GAMES   = os.path.join(HERE, 'data', 'games.csv')
    DEFAULT_CHAMPS  = os.path.join(HERE, 'data', 'champion_info.json')
    DEFAULT_CHAMPS2  = os.path.join(HERE, 'data', 'champion_info_2.json')
    parser = argparse.ArgumentParser(
        description='Train a League of Legends draft prediction model'
    )
    parser.add_argument(
        '--games_path', 
        type=str,
        default=DEFAULT_GAMES,
        help='Path to the games data CSV file'
    )
    parser.add_argument(
        '--champion_info_path',
        type=str,
        default=DEFAULT_CHAMPS,
        help='Path to the champion info JSON file'
    )
    parser.add_argument(
        '--champion_info_path_2',
        type=str,
        default=DEFAULT_CHAMPS2,
        help='Path to the champion info 2 JSON file'
    )
    parser.add_argument(
        '--test_size',
        type=float,
        default=0.2,
        help='Proportion of data to use for testing'
    )
    parser.add_argument(
        '--random_state',
        type=int,
        default=42,
        help='Random seed for reproducibility'
    )
    parser.add_argument(
        '--model_type',
        type=str,
        default='xgboost',
        choices=['logistic_regression', 'svm', 'random_forest', 'xgboost'],
        help='Which algorithm to use'
    )
    parser.add_argument(
        '--model_name',
        type=str,
        default='draft_predictor',
        help='Name to save the model under'
    )
    
    # RF & XGB
    parser.add_argument('--n_estimators',   type=int,   nargs='+', default=[50,100])
    parser.add_argument('--max_depth',      type=int,   nargs='+', default=[5,10])
    parser.add_argument('--learning_rate',  type=float, nargs='+', default=[0.1,0.3])
    # LR & SVM
    parser.add_argument('--C',              type=float, nargs='+', default=[0.01,0.1,1,10])
    parser.add_argument('--kernel',         type=str,   nargs='+', default=['linear','rbf'])
    parser.add_argument('--gamma',          type=str,   nargs='+', default=['scale'])
    args = parser.parse_args()

    # Create checkpoints directory if it doesn't exist
    os.makedirs('checkpoints', exist_ok=True)

    # Initialize data processor and process data
    print("Processing data...")
    data_processor = DataProcessor(args.games_path, args.champion_info_path, args.champion_info_path_2)
    X_train, X_test, y_train, y_test = data_processor.prepare_train_test_split(
        test_size=args.test_size,
        random_state=args.random_state
    )
    print("Data processing complete.")
    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    grid = []
    if args.model_type in ('random_forest','xgboost'):
        for n in args.n_estimators:
            for d in args.max_depth:
                for lr in (args.learning_rate if args.model_type=='xgboost' else [None]):
                    params = {'n_estimators':n, 'max_depth':d}
                    if lr is not None: params['learning_rate'] = lr
                    grid.append(params)

    elif args.model_type == 'logistic_regression':
        for C in args.C:
            grid.append({'C':C})

    elif args.model_type == 'svm':
        for C in args.C:
            for kernel in args.kernel:
                for gamma in args.gamma:
                    grid.append({'C':C, 'kernel':kernel, 'gamma':gamma})
                    
                    
    # 3) sweep
    best_acc, best_params = 0.0, None
    for params in grid:
        print("→ trying", args.model_type, params)
        model = DraftPredictor(
            model_type=args.model_type,
            **params,champion_info_path=args.champion_info_path
        )
        model.train(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        print("   ↳ acc =", acc)
        if acc > best_acc:
            best_acc, best_params = acc, params.copy()

    print(f"Best validation accuracy = {best_acc:.4f} with {best_params}")

    # Initialize and train model
    print(f"Training a {args.model_type} model...")
    final_model = DraftPredictor(model_type=args.model_type, **best_params,champion_info_path=args.champion_info_path)
    final_model.train(X_train, y_train)

    # 5) evaluate on test set
    y_pred      = final_model.predict(X_test)
    print("Test set predictions:", y_pred)
    # for AUC we need scores / probabilities if available
    try:
        y_score = final_model.model.predict_proba(X_test)[:,1]
    except AttributeError:
        y_score = final_model.model.decision_function(X_test)
    print("\n=== TEST SET METRICS ===")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("AUC-ROC  :", roc_auc_score(y_test, y_score))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall   :", recall_score(y_test, y_pred))
    

    # Save model and processor
    model_path = os.path.join('checkpoints', f'{args.model_name}.joblib')
    processor_path = os.path.join('checkpoints', f'{args.model_name}_processor.joblib')
    
    model.save(model_path)
    data_processor.save(processor_path)
    
    print(f"Model saved to {model_path}")
    print(f"Data processor saved to {processor_path}")

In [11]:
!python train_model.py --model_type logistic_regression  --games_path data/games.csv --champion_info_path data/champion_info.json 

Processing data...
Number of champions: 138
Number of games: 48555
Data processing complete.
Training set size: 38844
Test set size: 9711
→ trying logistic_regression {'C': 0.01}
Training LogisticRegressionScratch on 38844 samples, 295 features
   ↳ acc = 0.5019050561219236
→ trying logistic_regression {'C': 0.1}
Training LogisticRegressionScratch on 38844 samples, 295 features
   ↳ acc = 0.5191020492225311
→ trying logistic_regression {'C': 1}
Training LogisticRegressionScratch on 38844 samples, 295 features
   ↳ acc = 0.5922150139017609
→ trying logistic_regression {'C': 10}
Training LogisticRegressionScratch on 38844 samples, 295 features
   ↳ acc = 0.5938626300072083
Best validation accuracy = 0.5939 with {'C': 10}
Training a logistic_regression model...
Training LogisticRegressionScratch on 38844 samples, 295 features
Test set predictions: [1 0 1 ... 0 0 1]

=== TEST SET METRICS ===
Accuracy : 0.5938626300072083
AUC-ROC  : 0.6269684704544176
Precision: 0.5903614457831325
Recall   

In [13]:
!python train_model.py --model_type svm  --games_path data/games.csv --champion_info_path data/champion_info.json 

Processing data...
Number of champions: 138
Number of games: 48555
Data processing complete.
Training set size: 38844
Test set size: 9711
→ trying svm {'C': 0.01, 'kernel': 'linear', 'gamma': 'scale'}
Training SVMScratch on 38844 samples, 295 features
   ↳ acc = 0.5019050561219236
→ trying svm {'C': 0.01, 'kernel': 'rbf', 'gamma': 'scale'}
Training SVMScratch on 38844 samples, 295 features
   ↳ acc = 0.5019050561219236
→ trying svm {'C': 0.1, 'kernel': 'linear', 'gamma': 'scale'}
Training SVMScratch on 38844 samples, 295 features
   ↳ acc = 0.5019050561219236
→ trying svm {'C': 0.1, 'kernel': 'rbf', 'gamma': 'scale'}
Training SVMScratch on 38844 samples, 295 features
   ↳ acc = 0.5019050561219236
→ trying svm {'C': 1, 'kernel': 'linear', 'gamma': 'scale'}
Training SVMScratch on 38844 samples, 295 features
   ↳ acc = 0.5024199361548759
→ trying svm {'C': 1, 'kernel': 'rbf', 'gamma': 'scale'}
Training SVMScratch on 38844 samples, 295 features
   ↳ acc = 0.5019050561219236
→ trying svm {'

In [17]:
!python train_model.py --model_type random_forest  --games_path data/games.csv --champion_info_path data/champion_info.json 

Processing data...
Number of champions: 138
Number of games: 48555
Data processing complete.
Training set size: 38844
Test set size: 9711
→ trying random_forest {'n_estimators': 50, 'max_depth': 5}
Training RandomForestScratch on 38844 samples, 295 features
   ↳ acc = 0.5151889609720935
→ trying random_forest {'n_estimators': 50, 'max_depth': 10}
Training RandomForestScratch on 38844 samples, 295 features
   ↳ acc = 0.513953248893008
→ trying random_forest {'n_estimators': 100, 'max_depth': 5}
Training RandomForestScratch on 38844 samples, 295 features
   ↳ acc = 0.5028318401812377
→ trying random_forest {'n_estimators': 100, 'max_depth': 10}
Training RandomForestScratch on 38844 samples, 295 features
   ↳ acc = 0.5094223046030275
Best validation accuracy = 0.5152 with {'n_estimators': 50, 'max_depth': 5}
Training a random_forest model...
Training RandomForestScratch on 38844 samples, 295 features
Test set predictions: [1 1 1 ... 1 1 1]

=== TEST SET METRICS ===
Accuracy : 0.5060240963

In [18]:
!python train_model.py --model_type xgboost  --games_path data/games.csv --champion_info_path data/champion_info.json 

Processing data...
Number of champions: 138
Number of games: 48555
Data processing complete.
Training set size: 38844
Test set size: 9711
→ trying xgboost {'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.1}
Training XGBoostScratch on 38844 samples, 295 features
   ↳ acc = 0.5923179899083514
→ trying xgboost {'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.3}
Training XGBoostScratch on 38844 samples, 295 features
   ↳ acc = 0.5938626300072083
→ trying xgboost {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.1}
Training XGBoostScratch on 38844 samples, 295 features
   ↳ acc = 0.5862424055195139
→ trying xgboost {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.3}
Training XGBoostScratch on 38844 samples, 295 features
   ↳ acc = 0.5839769333745237
→ trying xgboost {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}
Training XGBoostScratch on 38844 samples, 295 features
   ↳ acc = 0.5921120378951704
→ trying xgboost {'n_estimators': 100, 'max_depth': 5,

In [None]:
from typing import List, Tuple

def recommend_pick(
    team1_champs: List[str],
    team2_champs_partial: List[str],
    processor,
    model,
    top_k: int = 5
) -> List[Tuple[str, float]]:
    """
    Recommend the final pick for team2 and return the win rate for team2.

    Parameters:
        team1_champs (List[str]): Full list of 5 champions for team1
        team2_champs_partial (List[str]): First 4 champions for team2
        processor: Trained DataProcessor
        model: Trained DraftPredictor model
        top_k (int): Number of recommendations to return

    Returns:
        List[Tuple[str, float]]: List of (champion_name, team2_win_probability), sorted descending
    """
    all_champs = list(processor.champion_key_to_id.keys())

    # Filter out already-picked champions
    used = set(team1_champs + team2_champs_partial)
    remaining = [c for c in all_champs if c not in used]

    recommendations = []

    # Try each remaining champion as the 5th pick for team2
    for champ in remaining:
        team2_full = team2_champs_partial + [champ]

        feats = processor.prepare_prediction_data(
            team1_champs=team1_champs,
            team2_champs=team2_full
        )
        
        
        prob_team1_win = model.predict_proba(feats).item()
        prob_team2_win = 1.0 - prob_team1_win

        recommendations.append((champ, prob_team2_win))

    # Sort by team2 win probability (descending)
    recommendations.sort(key=lambda x: x[1], reverse=True)

    return recommendations[:top_k]

if __name__ == "__main__":
    from data_processor import DataProcessor
    from model import DraftPredictor

    PROC_PATH  = "checkpoints/draft_predictor_processor.joblib"
    CHAMP_JSON = "data/champion_info.json"
    MODEL_PATH = "checkpoints/draft_predictor.joblib"

    processor = DataProcessor.load(PROC_PATH)
    model = DraftPredictor(CHAMP_JSON,model_type='logistic_regression')
    model.load(MODEL_PATH)

    team1 = ["Aatrox", "LeeSin", "Ahri", "Jinx", "Thresh"]
    team2_partial = ["Ornn", "MasterYi", "Yasuo", "TahmKench"]

    top_recs = recommend_pick(team1, team2_partial, processor, model, top_k=8)

    print("Top 8 recommended 5th picks for team2 (sorted by win probability):\n")
    for i, (champ, prob) in enumerate(top_recs, 1):
        print(f"{i:>2}. {champ:<12}  →  team2 win rate: {prob:6.2%}")