# Examples of Basic Machine Learning Tools

In [1]:
# The goal of this notebook is to look at how well the seed number alone can
# determine the winner. One caveat here is that the model should always be
# correct when two teams of the same seed go head to head. I could fix that, but
# it probably isn't worth the effort. The accuracies seem to be about 70%, or
# maybe 65% considering the imperfect data. I also broke the learning down by
# season just to see whether we are potentially getting better at seeding teams.

In [106]:
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [109]:
def test_model( data, feat_cols, label_col, model ):
    accs = []
    for i in range(10):
        train = data.sample( frac=.7 )
        test = data.drop( train.index )

        model.fit( train[feat_cols], train[label_col] )
        accs += [model.score( test[feat_cols], test[label_col] )]
    return sum(accs) / len(accs)

def test_models( data, feat_cols, label_col ):
    """Run a fixed line-up of classifiers through `test_model` and print each result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the label column.
    feat_cols : list of str
        Names of the feature columns.
    label_col : str
        Name of the label column.

    One line is printed per classifier: its display name followed by the
    value `test_model` returned for it. Nothing is returned.
    """
    # ( display name, estimator ) pairs, evaluated in the order listed.
    candidates = [
        ( 'LogReg', LogisticRegression() ),
        ( 'DecisionTree', DecisionTreeClassifier( max_depth=5 ) ),
        ( 'NaiveBayes', GaussianNB() ),
        ( 'NeuralNet', MLPClassifier() ),
        ( 'RandomForest', RandomForestClassifier() ),
        ( 'KNN', KNeighborsClassifier( 5 ) ),
        ( 'SVC', SVC() ),
        ( 'BoostClassifier', AdaBoostClassifier() ),
    ]
    for label, estimator in candidates:
        score = test_model( data, feat_cols, label_col, estimator )
        print( label, ": ", score )

In [108]:
# BASIC TESTING ON DIFFERENT TEAM STRENGTH RANKINGS ##############
# PRE-PROCESSING COULD STILL BE ADDED, BUT THIS LOOKS DECENT FOR NOW...
data = pandas.read_csv( 'cleaned/TourneyResultsWithRankings.csv' )

# DISASSOCIATE THE RANKINGS FROM WINNING OR LOSING ###############
# A random half of the games gets the label 'A' and the remaining games get
# the label 'B'; the W*/L* columns are mapped onto generic slots further down.
half = data.sample( frac=.5 )
rest = data.drop( half.index ).assign( Winner='B' )
half = half.assign( Winner='A' )

def rename( half, rest, winner_name, loser_name, generic_name ):
    """Map a winner/loser column pair onto generic '<name>1' / '<name>2' columns.

    In `half` the winner column becomes `generic_name + '1'` and the loser
    column `generic_name + '2'`; in `rest` the assignment is reversed. The
    index labels of both frames are converted to strings as part of the
    rename. The inputs are not modified.

    Returns
    -------
    tuple
        `( half, rest )` as renamed copies.
    """
    first_slot = generic_name + '1'
    second_slot = generic_name + '2'
    half_columns = { winner_name: first_slot, loser_name: second_slot }
    rest_columns = { loser_name: first_slot, winner_name: second_slot }
    return (
        half.rename( index=str, columns=half_columns ),
        rest.rename( index=str, columns=rest_columns ),
    )

# Ranking systems used as features; each is looked up as a 'W<name>' /
# 'L<name>' column pair and mapped onto '<name>1' / '<name>2'.
systems = ['RPI', 'POM', 'MOR', 'RTH', 'WLK' ]
for name in systems:
    half, rest = rename( half, rest, 'W' + name, 'L' + name, name )

# Stack the two halves back into a single frame.
data = pandas.concat( [rest, half] )

# Feature list: the '1' and '2' slot of every system, in system order.
columns = [name + slot for name in systems for slot in ( '1', '2' )]

test_models( data, columns, 'Winner' )
data.head()

LogReg :  0.700680272109
DecisionTree :  0.652721088435
Gaussian :  0.678571428571
NeuralNet :  0.683333333333
RandomForest :  0.675170068027
KNN :  0.659183673469
SVC :  0.482653061224
BoostClassifier :  0.686734693878


Unnamed: 0,LBIH,LBOB,LCNG,LCOL,LDOK,LDOL,LDUN,LMAS,LPGH,LPIG,...,WMAS,WPGH,WPIG,WSAG,WSE,WSEL,WTeamID,WWIL,WWOL,Winner
0,234.0,239.0,,212,,239,247.0,249.0,,,...,265.0,,,251,210.0,233.0,1421,,220,B
2,19.0,22.0,,19,,21,19.0,18.0,,,...,40.0,,,32,44.0,34.0,1113,,39,B
3,17.0,24.0,,18,,18,27.0,19.0,,,...,36.0,,,48,32.0,48.0,1141,,49,B
5,31.0,26.0,,23,,36,61.0,31.0,,,...,24.0,,,23,28.0,37.0,1163,,32,B
6,85.0,76.0,,75,,91,45.0,84.0,,,...,10.0,,,10,12.0,8.0,1181,,9,B


In [93]:
# BASIC TESTING ON SEEDING DATA ##################################
data = pandas.read_csv( 'cleaned/TourneyResultsWithSeeds.csv')

# DISASSOCIATE THE SEEDS FROM WINNING OR LOSING ##################
# A random half of the games is labelled 'A' with the winner's seed in
# Seed_One; the remaining games are labelled 'B' with the loser's seed there.
half = data.sample( frac=.5 )
rest = (
    data.drop( half.index )
    .assign( Winner='B' )
    .rename( index=str, columns={ 'LSeed':'Seed_One', 'WSeed':'Seed_Two'} )
)
half = (
    half.assign( Winner='A' )
    .rename( index=str, columns={ 'WSeed':'Seed_One', 'LSeed':'Seed_Two'} )
)
data = pandas.concat( [rest, half] )

# TRAINING AND TESTING DATA ############
feature_columns = ['Seed_One', 'Seed_Two']
test_models( data, feature_columns, 'Winner' )

LogReg :  0.710236220472
DecisionTree :  0.712440944882
Gaussian :  0.71842519685
NeuralNet :  0.72031496063
RandomForest :  0.708976377953
KNN :  0.680157480315
SVC :  0.71937007874
