# Examples of Basic Machine Learning Tools

In [1]:
# The goal of this notebook is to look at how well the seed number alone can
# determine the winner. One caveat here is that the model should always be
# correct when two teams of the same seed go head to head. I could fix it, but
# it probably isn't worth the effort. The accuracies seem to be about 70%, or
# maybe 65% considering the imperfect data. I also broke the learning down by
# season just to see if we are potentially getting better at seeding teams.

In [1]:
import matplotlib.pyplot as plt
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier

In [4]:
def predict( train, test, feat_cols, label_col, model ):
    """Fit `model` on the training rows and return its score on the test rows.

    Parameters
    ----------
    train, test : objects indexable by `feat_cols` and `label_col`
        (DataFrames in the calls visible in this notebook).
    feat_cols : column selector for the feature matrix.
    label_col : column selector for the target labels.
    model : estimator exposing `fit(X, y)` and `score(X, y)`.

    Returns
    -------
    Whatever `model.score` returns for the test features and labels.

    Note: `model` is fitted in place, so the caller's object is modified.
    """
    train_features, train_labels = train[feat_cols], train[label_col]
    model.fit( train_features, train_labels )

    test_features, test_labels = test[feat_cols], test[label_col]
    return model.score( test_features, test_labels )

def test_model( data, feat_cols, label_col, model,
                n_trials=10, train_frac=.7, random_state=None ):
    """Average a model's accuracy over repeated random train/test splits.

    Each trial samples `train_frac` of the rows of `data` for training, uses
    the remaining rows (selected by dropping the sampled index labels) for
    testing, and scores the model through `predict`. The same `model` object
    is refitted on every trial.

    Parameters
    ----------
    data : pandas.DataFrame holding both feature and label columns.
    feat_cols : feature column names passed through to `predict`.
    label_col : label column name passed through to `predict`.
    model : estimator passed through to `predict`.
    n_trials : int, number of random splits to average over (default 10).
    train_frac : float, fraction of rows used for training (default .7).
    random_state : int or None. When an int is given, trial `i` is sampled
        with seed `random_state + i`, making the result reproducible. The
        default None keeps the splits unseeded.

    Returns
    -------
    float : mean of the per-trial scores.

    Raises
    ------
    ValueError : if `n_trials` is smaller than 1.
    """
    if n_trials < 1:
        raise ValueError( 'n_trials must be at least 1' )

    accs = []
    for trial in range(n_trials):
        # A distinct seed per trial keeps the splits different from each other
        # while still being repeatable from run to run.
        seed = None if random_state is None else random_state + trial
        train = data.sample( frac=train_frac, random_state=seed )
        # The test set is everything not sampled; rows are matched by index
        # label, so this relies on the index labels of `data` being unique.
        test = data.drop( train.index )
        accs.append( predict( train, test, feat_cols, label_col, model ) )
    return sum(accs) / len(accs)

# Registry of the candidate classifiers, keyed by the short name used when
# printing results and when picking members for the voting ensemble.
models = dict(
    LogReg=LogisticRegression(),
    DecisionTree=DecisionTreeClassifier( max_depth=5 ),
    NaiveBayes=GaussianNB(),
    NeuralNet=MLPClassifier(),
    RandomForest=RandomForestClassifier(),
    KNN=KNeighborsClassifier( n_neighbors=5 ),
    SVC=SVC(),
    BoostClassifier=AdaBoostClassifier(),
)

def test_models( data, feat_cols, label_col ):
    """Evaluate every classifier in the module-level `models` registry.

    For each registered model, runs `test_model` on `data` with the given
    feature and label columns and prints one "<name> :  <accuracy>" line.
    Nothing is returned.
    """
    for label, estimator in models.items():
        mean_accuracy = test_model( data, feat_cols, label_col, estimator )
        print( label, ": ", mean_accuracy )
        
def test_ensemble( model_names, data, feat_cols, label_col ):
    """Evaluate a hard-voting ensemble built from registered models.

    Parameters
    ----------
    model_names : sequence of keys into the module-level `models` registry.
        Must be non-empty, because the first name is looked up by position.
    data, feat_cols, label_col : forwarded unchanged to `test_model`.

    The estimator list contains the first model once under its own name and
    then every listed model twice (names suffixed with '1' and '2'), so the
    first model has three entries and every other model has two.
    NOTE(review): presumably the extra entry acts as a tie-breaker - confirm.

    Prints the averaged accuracy as "Ensemble:  <accuracy>"; returns nothing.
    """
    lead = model_names[0]
    estimators = [ (lead, models[lead]) ]
    for name in model_names:
        estimators.extend( (name + suffix, models[name]) for suffix in ('1', '2') )

    ensemble = VotingClassifier( estimators=estimators, voting='hard' )
    print( 'Ensemble: ', test_model( data, feat_cols, label_col, ensemble ) )
        
def normalize( data ):
    """Min-max scale every column of a DataFrame with sklearn's MinMaxScaler.

    NOTE: still a work in progress; it is not called by any cell visible in
    this notebook.

    Parameters
    ----------
    data : pandas.DataFrame whose values MinMaxScaler can process.

    Returns
    -------
    pandas.DataFrame with the same column labels and the same index as
    `data`, holding the scaled values.
    """
    normalizer = preprocessing.MinMaxScaler()
    x_scaled = normalizer.fit_transform( data.values )  # plain numpy array
    # Carry the original index over. Without it the result silently gets a
    # fresh 0..n-1 index and no longer lines up with `data` for index-based
    # operations such as drop() or joins.
    return pandas.DataFrame( x_scaled, columns=data.columns, index=data.index )

In [None]:
# SOME BASIC TESTING ON SEEDING DATA AND RPI RANKINGS ############
# Load the games and split them at random into two disjoint halves. Rows in
# `half` are labelled 'A' and rows in `rest` are labelled 'B'; the rename step
# further down decides which team's columns land in slot 1 and slot 2.
data = pandas.read_csv( 'cleaned/TourneySeedsAndRankings.csv' )
half = data.sample( frac=.5 ).assign( Winner='A' )
rest = data.drop( half.index ).assign( Winner='B' )

def rename( half, rest, winner_name, loser_name, generic_name ):
    """Map winner/loser columns onto neutral '<generic_name>1' / '<generic_name>2'.

    In `half` the winner column becomes slot 1 and the loser column slot 2;
    in `rest` the assignment is reversed. Index labels of both frames are
    converted to strings. Returns the two renamed frames as `(half, rest)`.
    """
    slot_one, slot_two = generic_name + '1', generic_name + '2'
    winner_first = { winner_name: slot_one, loser_name: slot_two }
    loser_first = { loser_name: slot_one, winner_name: slot_two }
    return ( half.rename( index=str, columns=winner_first ),
             rest.rename( index=str, columns=loser_first ) )

# Ranking/seed systems used as features; each has a 'W<name>' and 'L<name>'
# column that gets mapped onto '<name>1' / '<name>2'.
cols = [ 'SAG', 'Seed', 'RPI', 'POM' ]
for name in cols:
    half, rest = rename( half, rest, 'W' + name, 'L' + name, name )

data = pandas.concat( [rest, half] )

# Feature column names, in the order '<name>1', '<name>2' for each system.
columns = [ name + slot for name in cols for slot in ('1', '2') ]

# Rescale each feature column independently: shift so the minimum is 0, then
# divide by the new maximum.
for col in columns:
    shifted = data[col] - data[col].min()
    data[col] = shifted / shifted.max()

print( 'All Data' )
test_models( data, columns, 'Winner' )
test_ensemble( ['LogReg', 'NeuralNet', 'SVC'], data, columns, 'Winner' )

years = {}
for year in range(2004, 2018):
    train = data.loc[ data['Season'] < year ]
    test = data.loc[ data['Season'] == year ]
    print( year )
    results = {}
    for model_name in models.keys():
        acc = predict( train, test, columns, 'Winner', models[model_name])
        results[model_name] = acc
    best = max( results.keys(), key=(lambda k: results[k]) )
    print(best, results[best])
    years[year] = (best, results[best])
    
import matplotlib.pyplot as plt
%matplotlib inline 
plt.scatter( [year for year in years.keys()], [years[year][1] for year in years.keys()] )
    
print(data.head())

All Data
LogReg :  0.715306122449
DecisionTree :  0.66768707483
NaiveBayes :  0.67074829932
NeuralNet :  0.712585034014
RandomForest :  0.65612244898
KNN :  0.665306122449
SVC :  0.697619047619
BoostClassifier :  0.686394557823
Ensemble:  0.719727891156
2004
NeuralNet 0.765625
2005




KNN 0.71875
2006
NaiveBayes 0.6875
2007
LogReg 0.8125
2008
LogReg 0.765625
2009


In [23]:
# BASIC TESTING ON DIFFERENT TEAM STRENGTH RANKINGS ##############
# I THINK WE CAN ADD PRE-PROCESSING BUT THIS LOOKS DECENT FOR NOW...
data = pandas.read_csv( 'cleaned/TourneyResultsWithRankings.csv' )

# DISASSOCIATE THE SEEDS AS WINNING OR LOSING ####################
# Random 50/50 split of the games: rows in `half` are labelled 'A', rows in
# `rest` are labelled 'B'.
half = data.sample( frac=.5 ).assign( Winner='A' )
rest = data.drop( half.index ).assign( Winner='B' )

def rename( half, rest, winner_name, loser_name, generic_name ):
    """Map winner/loser columns onto neutral '<generic_name>1' / '<generic_name>2'.

    Behaves the same as the `rename` defined in the earlier seeding/rankings
    cell; running this cell rebinds the name to this copy.

    In `half` the winner column becomes slot 1 and the loser column slot 2;
    in `rest` the assignment is reversed. Index labels of both frames are
    converted to strings. Returns the two renamed frames as `(half, rest)`.
    """
    slot_one, slot_two = generic_name + '1', generic_name + '2'
    winner_first = { winner_name: slot_one, loser_name: slot_two }
    loser_first = { loser_name: slot_one, winner_name: slot_two }
    return ( half.rename( index=str, columns=winner_first ),
             rest.rename( index=str, columns=loser_first ) )

# Ranking systems used as features; each has a 'W<name>' and 'L<name>' column
# that gets mapped onto '<name>1' / '<name>2'.
systems = [ 'RPI', 'POM', 'MOR', 'RTH', 'WLK', 'DOL', 'COL', 'SAG' ]
for name in systems:
    half, rest = rename( half, rest, 'W' + name, 'L' + name, name )

data = pandas.concat( [rest, half] )

# Feature column names, in the order '<name>1', '<name>2' for each system.
columns = [ name + slot for name in systems for slot in ('1', '2') ]

# Rescale each feature column independently: shift so the minimum is 0, then
# divide by the new maximum.
for col in columns:
    shifted = data[col] - data[col].min()
    data[col] = shifted / shifted.max()

test_models( data, columns, 'Winner' )
test_ensemble( ['LogReg', 'NeuralNet', 'SVC'], data, columns, 'Winner' )
data.head()

LogReg :  0.707482993197
DecisionTree :  0.664965986395
NaiveBayes :  0.665306122449
NeuralNet :  0.714965986395
RandomForest :  0.679591836735
KNN :  0.67925170068
SVC :  0.69387755102
BoostClassifier :  0.684013605442
Ensemble:  0.721768707483


Unnamed: 0,COL1,COL2,DOL1,DOL2,LBIH,LBOB,LCNG,LDOK,LDUN,LMAS,...,WLK2,WMAS,WPGH,WPIG,WSE,WSEL,WTeamID,WWIL,WWOL,Winner
3,0.065385,0.116438,0.057823,0.120401,17.0,24.0,,,27.0,19.0,...,0.108392,36.0,,,32.0,48.0,1141,,49,B
4,0.211538,0.092466,0.163265,0.080268,49.0,47.0,,,23.0,48.0,...,0.111888,30.0,,,30.0,28.0,1143,,26,B
7,0.173077,0.133562,0.180272,0.16388,52.0,40.0,,,67.0,54.0,...,0.129371,49.0,,,38.0,40.0,1211,,43,B
8,0.2,0.047945,0.183673,0.036789,56.0,63.0,,,42.0,51.0,...,0.031469,11.0,,,16.0,10.0,1228,,14,B
12,0.180769,0.071918,0.170068,0.053512,41.0,59.0,,,44.0,38.0,...,0.066434,23.0,,,18.0,20.0,1323,,18,B


In [93]:
# BASIC TESTING ON SEEDING DATA ##################################
data = pandas.read_csv( 'cleaned/TourneyResultsWithSeeds.csv')

# DISASSOCIATE THE SEEDS AS WINNING OR LOSING ####################
# Random 50/50 split of the games. In `half` (label 'A') the winner's seed is
# Seed_One; in `rest` (label 'B') the loser's seed is Seed_One.
half = data.sample( frac=.5 )
rest = data.drop( half.index )

winner_first = { 'WSeed':'Seed_One', 'LSeed':'Seed_Two' }
loser_first = { 'LSeed':'Seed_One', 'WSeed':'Seed_Two' }
half = half.assign( Winner='A' ).rename( index=str, columns=winner_first )
rest = rest.assign( Winner='B' ).rename( index=str, columns=loser_first )
data = pandas.concat( [rest, half] )

# TRAINING AND TESTING DATA ############
feature_columns = ['Seed_One', 'Seed_Two']
test_models( data, feature_columns, 'Winner' )

LogReg :  0.710236220472
DecisionTree :  0.712440944882
Gaussian :  0.71842519685
NeuralNet :  0.72031496063
RandomForest :  0.708976377953
KNN :  0.680157480315
SVC :  0.71937007874
