<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

Classes we will use for project.

In [379]:
from collections import namedtuple
from os.path import join as path_join

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [380]:
class Config:
    """Key configurations for the project."""
    TEAMS_FNAME   = 'teams.csv'
    REG_FNAME     = 'RegularSeasonDetailedResults.csv'
    TOURNEY_FNAME = 'TourneyDetailedResults.csv'
    SEEDS_FNAME   = 'TourneySeeds.csv'
    SLOTS_FNAME   = 'TourneySlots.csv'
    
    DEFAULT_DATA_DIR = '../data/'
    
    PREDICTION_COLUMNS = ['game_id', 'Prediction']

In [381]:
class Team:
    """Immutable notion of a team."""
    def __init__(self, team_id, dm):
        """
        team_id: id of this team from teams.csv file
        dm: DataManager from which this team comes from
        """
        self.team_id = team_id
        self.dm = dm
    
    @property
    def name(self):
        return None
    
    def __str__(self):
        return str(self.team_id)
    
    def __repr__(self):
        return str(self.team_id)

In [382]:
class Season:
    """Immutable notion of a season of college basketball."""
    def __init__(self, yr):
        """yr: year that this season represents"""
        self.yr = yr

In [383]:
class Data:
    """Immutable data obejct storing all data about tournaments, for all years"""
    def __init__(self, teams, reg, tourney, seeds, slots, **kwargs):
        self.teams = teams
        self.reg = reg
        self.tourney = tourney
        self.seeds = seeds
        self.slots = slots
        self.kwargs = kwargs if kwargs is not None else {}

In [384]:
TourneyResult = namedtuple('TourneyResult', ['winner', 'loser', 'season'])

In [385]:
class DataManager:
    """Immutable loader that can load data from directory and query on specific fields."""
    def __init__(self, data_dir=Config.DEFAULT_DATA_DIR):
        """data_dir: path to dir with all the csv's we need"""
        self.data_dir = data_dir
        self._data = None # full dataframe
    
    @property
    def data(self):
        """get the full dataframe for this loader"""
        if self._data is None:
            self._data = self.load()
        return self._data
    
    def load(self):
        """loads data into local memory for later use"""
        teams   = pd.read_csv(path_join(self.data_dir, Config.TEAMS_FNAME))
        reg     = pd.read_csv(path_join(self.data_dir, Config.REG_FNAME))
        tourney = pd.read_csv(path_join(self.data_dir, Config.TOURNEY_FNAME))
        seeds   = pd.read_csv(path_join(self.data_dir, Config.SEEDS_FNAME))
        slots   = pd.read_csv(path_join(self.data_dir, Config.SLOTS_FNAME))
        return Data(teams, reg, tourney, seeds, slots, key='value')
    
    # QUERIES
    
    def get_teams_in_season(self, season):
        reg = self.data.reg
        reg_season = reg[reg.Season == season.yr]
        reg_season_teams_winners = reg_season.Wteam.unique()
        reg_season_teams_losers = reg_season.Lteam.unique()
        reg_season_teams_all = np.union1d(reg_season_teams_winners, reg_season_teams_losers)
        return map(lambda t: Team(t, self), reg_season_teams_all)
    
    def get_team_in_season(self, season, team):
        reg = self.data.reg
        reg_season = reg[reg.Season == season.yr]
        reg_season_team = reg_season[(reg.Wteam == team.team_id) | (reg.Lteam == team.team_id)]
        return self.team_win_lose_score_helper(reg_season_team, team)
    
    def get_training_data(self):
        """
        gets all data for a model to train on
        need to iterate over all tourney games we have and give TourneyResults
        """
        tourney = self.data.tourney
        result = []
        for _, row in tourney.iterrows():
            winner = Team(row.Wteam, self)
            loser = Team(row.Lteam, self)
            season = Season(row.Season)
            result.append(TourneyResult(winner=winner, loser=loser, season=season))
        return result
    
    # HELPERS
    
    def team_win_lose_score_helper(self, df, team):
        """
        df: filtered on team and year
        team: team object
        returns: updated df with removing 'W'/'L' from relevant stats
        """
        rename_cols = {
            'team', 'score', 'fgm', 'fga', 'fgm3', 'fga3', 'ftm', 'fta', 'or', 'dr', 'ast', 'to', 'stl', 'blk', 'pf'
        }
        
        def get_rename_dict(mode):
            """
            mode: string 'W' or 'L'
            """
            result = {}
            for col in rename_cols:
                result[mode + col] = col
            # Extra cols
            result['Season'] = 'season'
            return result
        
        def internal_update_func(row):
            """
            row: row of table we want to update
            """
            if row.Wteam == team.team_id:
                # Must add W to everything in rename cols
                rename_dict = get_rename_dict('W')
            else:
                # Must add L to everything in rename cols
                rename_dict = get_rename_dict('L')

            # Select subset
            subset = row[rename_dict.keys()]
            
            return subset.rename(rename_dict)

        return df.apply(internal_update_func, axis=1)

In [386]:
class Model:
    """Represents one of our models."""
    def __init__(self, name, features):
        """
        name: name of this model, for debugging purposes
        features: list[Feature] for this model
        """
        self.name = name
        self.features = features
        self._sklearn_model = None
    
    @property
    def sklearn_model(self):
        if self._sklearn_model is None:
            #if self.name == 'default_linear':
            self._sklearn_model = LogisticRegression()
        return self._sklearn_model
    
    @classmethod
    def default(cls):
        return cls('default_linear', [Feature('ppg', lambda df: df['score'].mean())])
    
    def get_vector(self, season, team, dm):
        """
        season: Season object
        team: Team object
        dm: DataManager
        returns: FeatureVector filtered on season and team, aggregated accordingly
        """
        filt = dm.get_team_in_season(season, team)
        # compute each feature and append to result dict
        result = {}
        for f in self.features:
            result[f] = f.compute(filt)
        return FeatureVector(result)
    
    def get_X_y(self, trainer):
        X, y = [], []
        # X is list of FeatureVector
        # y is 1/0 values
        data_raw = trainer.dm.get_training_data()
        for result in data_raw:
            vect_a = self.get_vector(result.season, result.winner, trainer.dm)
            vect_b = self.get_vector(result.season, result.loser, trainer.dm)
            
            vect_combo_a = self.combine_vectors(vect_a, vect_b)
            X.append(vect_combo_a.to_list())
            y.append(1)
            
            vect_combo_b = self.combine_vectors(vect_b, vect_a)
            X.append(vect_combo_b.to_list())
            y.append(0)
        return np.array(X), np.array(y)
    
    def train(self, trainer):
        # Need to do the following:
        # 1. Parse data
        X, y = self.get_X_y(trainer)
        # 2. Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # 3. Fit on training
        self.sklearn_model.fit(X_train, y_train)
        # 3. Evaluate on testing
        # 4. Return evaluation result
        return self.sklearn_model.score(X_test, y_test)
    
    def predict(self, a, b, runner):
        """
        a: Team a
        b: Team b
        runner: runner calling me
        returns: output in [0, 1] range, P(a)
        """
        vect_a = self.get_vector(runner.season, a, runner.dm)
        vect_b = self.get_vector(runner.season, b, runner.dm)
        vect_combo = self.combine_vectors(vect_a, vect_b)
    
    def combine_vectors(self, a, b):
        """
        a, b: FeatureVector's
        returns: combination of a and b
        """
        return a - b

In [387]:
class Feature:
    """Represents a particular entry in a team vector. Computed on a team in a given season."""
    def __init__(self, name, compute_func, **kwargs):
        """
        name: string of name describing this feature
        compute_func: df (team, season) -> numeric value
        """
        self.name = name
        self.func = compute_func
        self.kwargs = kwargs if kwargs is not None else {}
    
    def compute(self, df):
        return self.func(df, **self.kwargs)
    
    def __str__(self):
        return self.name
    
    def __repr__(self):
        return self.name

In [388]:
class FeatureVector:
    """Represents a vector of features. Immutable."""
    def __init__(self, feature_dict):
        """feature_dict: dict[Feature -> numeric]"""
        self.feature_dict = feature_dict
        
    def __getitem__(self, k):
        return self.feature_dict[k]
    
    def __str__(self):
        return str(self.feature_dict)
    
    def __sub__(self, other):
        """computes self - other"""
        final = {}
        for k in self.feature_dict:
            final[k] = self[k] - other[k]
        return FeatureVector(final)
    
    def to_list(self):
        """uses sorted key order so it is deterministic"""
        result = []
        for k in sorted(self.feature_dict, key = lambda f: f.name):
            result.append(self[k])
        return result

In [389]:
class Trainer:
    """Trains a model."""
    def __init__(self, model, dm=None):
        self.model = model
        self.dm = dm if dm is not None else DataManager()
    
    def run(self):
        """trains and returns eval result"""
        return self.model.train(self)

In [390]:
class Runner:
    """Main class that will be used to get predictions for a particular year."""
    def __init__(self, season, model, dm=None):
        """
        season: Season object we want to run on
        model: Model object being used to make the predictions for this run
        dm: DataManager
        """
        self.season = season
        self.model = model
        self.dm = dm if dm is not None else DataManager()
    
    def run(self):
        """Generates all predictions for this season"""
        # Create result
        result = pd.DataFrame(columns=Config.PREDICTION_COLUMNS)
        # Get all teams in this season
        teams = self.dm.get_teams_in_season(self.season)
        # Iterate over all pairs
        for a in teams:
            for b in teams:
                if a != b:
                    game_id = self.get_game_id(a, b)
                    prediction = self.run_pair(a, b)
                    result = result.append({
                        result.columns[0]: game_id,
                        result.columns[1]: prediction
                    }, ignore_index=True)
                    break
            break
        return result
    
    def run_pair(self, a, b):
        """
        a: first team
        b: second team
        returns: 1 if a wins, 0 else
        """
        return self.model.predict(a, b, self)
    
    def get_game_id(self, a, b):
        return str(self.season.yr) + '_' + str(a.team_id) + '_' + str(b.team_id)

In [391]:
class Main:
    """Immutable class we will call main() on to run the project."""
    
    @classmethod
    def main(self):
        """Runs runners on each season that we need to. Returns dataframe with predictions."""
        result = pd.DataFrame()
        for season in [2011, 2012, 2013]:
            r = Runner(Season(season), Model.default())
            current = r.run()
            result = result.append(current)
            break
        return result

In [392]:
out = Main.main()



In [393]:
t = Trainer(Model.default())

In [394]:
print(out)

          game_id Prediction
0  2011_1102_1103       None


In [395]:
t_new = Trainer(Model('base_model', [
    #Feature('FGP', lambda df: (df['fgm']/df['fga']).mean()),
    Feature('blocks', lambda df: df['blk'].mean()), # function that takes a dataframe and returns the value you want out of it
    #Feature('assists', lambda df: df['ast'].mean()),
    Feature('OR', lambda df: df['or'].mean()),
    #Feature('DR', lambda df: df['dr'].mean()),
    #Feature('turnovers', lambda df: df['to'].mean()),
    #Feature('3PP', lambda df: (df['fgm3']/df['fga3']).mean()),
    Feature('3PM', lambda df: df['fgm3'].mean()),
    #Feature('FTP', lambda df: (df['ftm']/df['fta']).mean()),
    Feature('steals', lambda df: df['stl'].mean()),
    Feature('ppg', lambda df: df['score'].mean()),
    #Feature('fouls', lambda df: df['pf'].mean())
    
]))

In [396]:
t_new.run()



0.58391608391608396