March Madness Bracket Prediction Data Crunch
===================

## Overview
Use machine learning and statistical methods to predict NCAA Men's Basketball game outcome and championship based on seasonal performance, game seed, and other stats.

## Method

Instead of analyzing the stats of each team, transform the features into **Difference** and **Quotient** between **two teams in each of 63 games** and predict the probability, using logistic regression and other machine learning methods.

- Feature Engineering
- Data Preprocessing
- Feature Selection
- Model Comparision
- Prediction

## Result

The accuracy of the model is **70.89%** with a current log loss of **0.49**.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (auc, classification_report, roc_auc_score, accuracy_score,
                             f1_score, log_loss, roc_curve, confusion_matrix, precision_score, recall_score)
from sklearn.preprocessing import StandardScaler
from math import sin, cos, sqrt, atan2, radians
import random
import statsmodels.api as sm

In [2]:
ncaa_tour = pd.read_csv('testncaa.csv')

In [3]:
ncaa_tour.columns

Index(['team1_score', 'team2_score', 'team1_seed', 'team2_seed', 'season',
       'host_lat', 'host_long', 'team1_lat', 'team1_long', 'team2_lat',
       'team2_long', 'team1_pt_school_ncaa', 'team1_pt_overall_ncaa',
       'team1_pt_school_s16', 'team1_pt_overall_s16', 'team1_pt_school_ff',
       'team1_pt_overall_ff', 'team1_pt_career_school_wins',
       'team1_pt_career_school_losses', 'team1_pt_career_overall_wins',
       'team1_pt_career_overall_losses', 'team1_pt_team_season_wins',
       'team1_pt_team_season_losses', 'team1_pt_coach_season_wins',
       'team1_pt_coach_season_losses', 'team2_pt_school_ncaa',
       'team2_pt_overall_ncaa', 'team2_pt_school_s16', 'team2_pt_overall_s16',
       'team2_pt_school_ff', 'team2_pt_overall_ff',
       'team2_pt_career_school_wins', 'team2_pt_career_school_losses',
       'team2_pt_career_overall_wins', 'team2_pt_career_overall_losses',
       'team2_pt_team_season_wins', 'team2_pt_team_season_losses',
       'team2_pt_coach_season_w

In [4]:
##replace all NaN ap ranking with 45 'team1_ap_final', 
## 'team1_ap_preseason', 'team1_coaches_before_final',
##       'team1_coaches_preseason', 'team2_ap_final', 'team2_ap_preseason',
##       'team2_coaches_before_final', 'team2_coaches_preseason'
ncaa_tour['team1_ap_final'].fillna(45.0,inplace =True)
ncaa_tour['team1_ap_preseason'].fillna(45.0,inplace =True)
ncaa_tour['team1_coaches_before_final'].fillna(45.0,inplace =True)
ncaa_tour['team1_coaches_preseason'].fillna(45.0,inplace =True)

ncaa_tour['team2_ap_final'].fillna(45.0,inplace =True)
ncaa_tour['team2_ap_preseason'].fillna(45.0,inplace =True)
ncaa_tour['team2_coaches_before_final'].fillna(45.0,inplace =True)
ncaa_tour['team2_coaches_preseason'].fillna(45.0,inplace =True)

In [67]:
for i in list(ncaa_tour.columns[:-2]):
    ncaa_tour[i] = ncaa_tour[i].astype(float).replace(0,0.1)

In [48]:
##replace all 0 with 0.1, so ata later feature engineering we can devide two variables withour geting
##NaN

ncaa_tour['team1_pt_school_ncaa'] = ncaa_tour['team1_pt_school_ncaa'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_overall_ncaa'] = ncaa_tour['team1_pt_overall_ncaa'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_school_s16'] = ncaa_tour['team1_pt_school_s16'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_overall_s16'] = ncaa_tour['team1_pt_overall_s16'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_school_ff'] = ncaa_tour['team1_pt_school_ff'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_overall_ff'] = ncaa_tour['team1_pt_overall_ff'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_career_school_wins'] = ncaa_tour['team1_pt_career_school_wins'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_career_overall_wins'] = ncaa_tour['team1_pt_career_overall_wins'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_team_season_losses'] = ncaa_tour['team1_pt_team_season_losses'].astype(float).replace(0,0.1)
ncaa_tour['team1_pt_coach_season_losses'] = ncaa_tour['team1_pt_coach_season_losses'].astype(float).replace(0,0.1)

ncaa_tour['team2_pt_school_ncaa'] = ncaa_tour['team2_pt_school_ncaa'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_overall_ncaa'] = ncaa_tour['team2_pt_overall_ncaa'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_school_s16'] = ncaa_tour['team2_pt_school_s16'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_overall_s16'] = ncaa_tour['team2_pt_overall_s16'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_school_ff'] = ncaa_tour['team2_pt_school_ff'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_overall_ff'] = ncaa_tour['team2_pt_overall_ff'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_career_school_wins'] = ncaa_tour['team2_pt_career_school_wins'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_career_overall_wins'] = ncaa_tour['team2_pt_career_overall_wins'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_team_season_losses'] = ncaa_tour['team2_pt_team_season_losses'].astype(float).replace(0,0.1)
ncaa_tour['team2_pt_coach_season_losses'] = ncaa_tour['team2_pt_coach_season_losses'].astype(float).replace(0,0.1)

In [68]:
##Calculate Distance
def distance(lat1, lon1, lat2, lon2):

    # approximate radius of earth in km
    R = 6373.0

    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    
    return distance
ncaa_tour['outcome'] = 1

In [69]:
def shuffle(df):     
    df = df.reindex(np.random.permutation(df.index)).copy()# random generate and random order
    return df.reset_index(drop=True)

ncaa_tour = shuffle(ncaa_tour)

In [70]:
my_randoms = random.sample(range(len(ncaa_tour)), round(len(ncaa_tour)/2))
ncaa_tour_1 = ncaa_tour[~ncaa_tour.index.isin(my_randoms)].reset_index(drop = True)
ncaa_tour_2 = ncaa_tour[ncaa_tour.index.isin(my_randoms)].reset_index(drop = True)

In [71]:
ncaa_tour_2['outcome'] = ncaa_tour_2['outcome'].apply(lambda x: 1 if x==0 else 0)
ncaa_tour_2.columns = ['team2_score', 'team1_score', 'team2_seed', 'team1_seed', 'season',
       'host_lat', 'host_long', 'team2_lat', 'team2_long', 'team1_lat',
       'team1_long', 'team2_pt_school_ncaa', 'team2_pt_overall_ncaa',
       'team2_pt_school_s16', 'team2_pt_overall_s16', 'team2_pt_school_ff',
       'team2_pt_overall_ff', 'team2_pt_career_school_wins',
       'team2_pt_career_school_losses', 'team2_pt_career_overall_wins',
       'team2_pt_career_overall_losses', 'team2_pt_team_season_wins',
       'team2_pt_team_season_losses', 'team2_pt_coach_season_wins',
       'team2_pt_coach_season_losses', 'team1_pt_school_ncaa',
       'team1_pt_overall_ncaa', 'team1_pt_school_s16', 'team1_pt_overall_s16',
       'team1_pt_school_ff', 'team1_pt_overall_ff',
       'team1_pt_career_school_wins', 'team1_pt_career_school_losses',
       'team1_pt_career_overall_wins', 'team1_pt_career_overall_losses',
       'team1_pt_team_season_wins', 'team1_pt_team_season_losses',
       'team1_pt_coach_season_wins', 'team1_pt_coach_season_losses',
       'team2_ap_final', 'team2_ap_preseason', 'team2_coaches_before_final',
       'team2_coaches_preseason', 'team1_ap_final', 'team1_ap_preseason',
       'team1_coaches_before_final', 'team1_coaches_preseason', 'team2_fg2pct',
       'team2_fg3pct', 'team2_ftpct', 'team2_blockpct', 'team2_oppfg2pct',
       'team2_oppfg3pct', 'team2_oppftpct', 'team2_oppblockpct',
       'team2_f3grate', 'team2_oppf3grate', 'team2_arate', 'team2_opparate',
       'team2_stlrate', 'team2_oppstlrate', 'team1_fg2pct', 'team1_fg3pct',
       'team1_ftpct', 'team1_blockpct', 'team1_oppfg2pct', 'team1_oppfg3pct',
       'team1_oppftpct', 'team1_oppblockpct', 'team1_f3grate',
       'team1_oppf3grate', 'team1_arate', 'team1_opparate', 'team1_stlrate',
       'team1_oppstlrate', 'team2_tempo', 'team2_adjtempo', 'team2_oe',
       'team2_adjoe', 'team2_de', 'team2_adjde', 'team1_tempo',
       'team1_adjtempo', 'team1_oe', 'team1_adjoe', 'team1_de', 'team1_adjde',
       'game_id', 'outcome']

In [72]:
ncaa_tour_2 = ncaa_tour_2[['team1_score', 'team2_score', 'team1_seed', 'team2_seed', 'season',
       'host_lat', 'host_long', 'team1_lat', 'team1_long', 'team2_lat',
       'team2_long', 'team1_pt_school_ncaa', 'team1_pt_overall_ncaa',
       'team1_pt_school_s16', 'team1_pt_overall_s16', 'team1_pt_school_ff',
       'team1_pt_overall_ff', 'team1_pt_career_school_wins',
       'team1_pt_career_school_losses', 'team1_pt_career_overall_wins',
       'team1_pt_career_overall_losses', 'team1_pt_team_season_wins',
       'team1_pt_team_season_losses', 'team1_pt_coach_season_wins',
       'team1_pt_coach_season_losses', 'team2_pt_school_ncaa',
       'team2_pt_overall_ncaa', 'team2_pt_school_s16', 'team2_pt_overall_s16',
       'team2_pt_school_ff', 'team2_pt_overall_ff',
       'team2_pt_career_school_wins', 'team2_pt_career_school_losses',
       'team2_pt_career_overall_wins', 'team2_pt_career_overall_losses',
       'team2_pt_team_season_wins', 'team2_pt_team_season_losses',
       'team2_pt_coach_season_wins', 'team2_pt_coach_season_losses',
       'team1_ap_final', 'team1_ap_preseason', 'team1_coaches_before_final',
       'team1_coaches_preseason', 'team2_ap_final', 'team2_ap_preseason',
       'team2_coaches_before_final', 'team2_coaches_preseason', 'team1_fg2pct',
       'team1_fg3pct', 'team1_ftpct', 'team1_blockpct', 'team1_oppfg2pct',
       'team1_oppfg3pct', 'team1_oppftpct', 'team1_oppblockpct',
       'team1_f3grate', 'team1_oppf3grate', 'team1_arate', 'team1_opparate',
       'team1_stlrate', 'team1_oppstlrate', 'team2_fg2pct', 'team2_fg3pct',
       'team2_ftpct', 'team2_blockpct', 'team2_oppfg2pct', 'team2_oppfg3pct',
       'team2_oppftpct', 'team2_oppblockpct', 'team2_f3grate',
       'team2_oppf3grate', 'team2_arate', 'team2_opparate', 'team2_stlrate',
       'team2_oppstlrate', 'team1_tempo', 'team1_adjtempo', 'team1_oe',
       'team1_adjoe', 'team1_de', 'team1_adjde', 'team2_tempo',
       'team2_adjtempo', 'team2_oe', 'team2_adjoe', 'team2_de', 'team2_adjde',
       'game_id', 'outcome']]
ncaa_tour_2.head()

Unnamed: 0,team1_score,team2_score,team1_seed,team2_seed,season,host_lat,host_long,team1_lat,team1_long,team2_lat,...,team1_de,team1_adjde,team2_tempo,team2_adjtempo,team2_oe,team2_adjoe,team2_de,team2_adjde,game_id,outcome
0,45.0,58.0,16.0,1.0,2006.0,42.6875,-83.2342,40.280066,-74.006446,40.039388,...,94.761,96.8515,66.7668,67.3832,112.5262,115.8837,95.1165,92.4318,2006-1437-1284,0
1,65.0,78.0,5.0,12.0,2010.0,30.3369,-81.6614,39.980943,-75.158267,42.445291,...,89.3644,86.5396,65.5326,65.7817,113.4091,114.0527,97.4265,102.0349,2010-1165-1396,0
2,77.0,108.0,9.0,1.0,2008.0,35.7806,-78.6389,36.06153,-94.178214,35.912165,...,96.6896,95.5965,75.893,74.4187,115.3951,118.7927,94.9042,93.24,2008-1314-1116,0
3,52.0,59.0,9.0,5.0,2010.0,29.7604,-95.3698,42.514923,-92.459703,42.72476,...,91.6915,89.2475,66.673,67.1159,108.3923,109.2147,95.4326,91.952,2010-1277-1320,0
4,56.0,58.0,14.0,3.0,2007.0,38.0297,-84.4947,39.509642,-84.731821,44.044515,...,96.3595,96.5727,67.2739,66.9999,112.6453,115.7993,97.8219,97.5042,2007-1332-1275,0


In [73]:
ncaa_shuffle = pd.concat([ncaa_tour_2, ncaa_tour_1])

In [74]:
ncaa_shuffle

Unnamed: 0,team1_score,team2_score,team1_seed,team2_seed,season,host_lat,host_long,team1_lat,team1_long,team2_lat,...,team1_de,team1_adjde,team2_tempo,team2_adjtempo,team2_oe,team2_adjoe,team2_de,team2_adjde,game_id,outcome
0,45.0,58.0,16.0,1.0,2006.0,42.6875,-83.2342,40.280066,-74.006446,40.039388,...,94.7610,96.8515,66.7668,67.3832,112.5262,115.8837,95.1165,92.4318,2006-1437-1284,0
1,65.0,78.0,5.0,12.0,2010.0,30.3369,-81.6614,39.980943,-75.158267,42.445291,...,89.3644,86.5396,65.5326,65.7817,113.4091,114.0527,97.4265,102.0349,2010-1165-1396,0
2,77.0,108.0,9.0,1.0,2008.0,35.7806,-78.6389,36.061530,-94.178214,35.912165,...,96.6896,95.5965,75.8930,74.4187,115.3951,118.7927,94.9042,93.2400,2008-1314-1116,0
3,52.0,59.0,9.0,5.0,2010.0,29.7604,-95.3698,42.514923,-92.459703,42.724760,...,91.6915,89.2475,66.6730,67.1159,108.3923,109.2147,95.4326,91.9520,2010-1277-1320,0
4,56.0,58.0,14.0,3.0,2007.0,38.0297,-84.4947,39.509642,-84.731821,44.044515,...,96.3595,96.5727,67.2739,66.9999,112.6453,115.7993,97.8219,97.5042,2007-1332-1275,0
5,67.0,72.0,10.0,3.0,2004.0,38.6272,-90.1978,39.552523,-119.825260,33.773732,...,94.7882,92.4162,72.6362,71.1581,106.9236,112.3931,92.1012,90.5671,2004-1210-1305,0
6,75.0,94.0,9.0,8.0,2006.0,42.6875,-83.2342,43.073858,-89.405356,32.232071,...,96.9050,94.9980,69.2926,69.6354,103.1360,107.7834,96.4165,94.3050,2006-1112-1458,0
7,68.0,82.0,4.0,1.0,2016.0,33.8361,-117.8897,36.001590,-78.942260,44.044515,...,105.0370,98.8273,69.4792,69.0826,112.7070,118.6600,99.6220,95.9110,2016-1332-1181,0
8,65.0,87.0,8.0,1.0,2010.0,42.9047,-78.8494,47.668144,-117.403062,43.037587,...,97.1099,96.2094,71.3215,70.7441,114.2660,115.7085,94.1878,91.8891,2010-1393-1211,0
9,69.0,76.0,10.0,7.0,2004.0,39.0997,-94.5783,39.736860,-84.173662,41.923332,...,100.0119,97.0935,64.7191,64.0949,109.1876,113.5851,103.8945,101.7404,2004-1177-1173,0


In [75]:
ncaa_shuffle['dist1'] = ncaa_shuffle.apply(lambda row: distance(row['host_lat'], row['host_long'], row['team1_lat'], row['team1_long']), axis=1)
ncaa_shuffle['dist2'] = ncaa_shuffle.apply(lambda row: distance(row['host_lat'], row['host_long'], row['team2_lat'], row['team2_long']), axis=1)

In [76]:
ncaa_tour2 = pd.DataFrame()

In [77]:
ncaa_tour2['d_team_seed'] = ncaa_shuffle['team1_seed'] - ncaa_shuffle['team2_seed']
ncaa_tour2['q_team_seed'] = ncaa_shuffle['team1_seed'] / ncaa_shuffle['team2_seed']
ncaa_tour2['season'] = ncaa_shuffle['season']
ncaa_tour2['diff_dist'] = ncaa_shuffle['dist1'] - ncaa_shuffle['dist2']
ncaa_tour2['d_pt_school_ncaa'] = ncaa_shuffle['team1_pt_school_ncaa'] - ncaa_shuffle['team2_pt_school_ncaa']
ncaa_tour2['q_pt_school_ncaa'] = ncaa_shuffle['team1_pt_school_ncaa'] / ncaa_shuffle['team2_pt_school_ncaa']
ncaa_tour2['d_pt_overall_ncaa'] = ncaa_shuffle['team1_pt_overall_ncaa'] - ncaa_shuffle['team2_pt_overall_ncaa']
ncaa_tour2['q_pt_overall_ncaa'] = ncaa_shuffle['team1_pt_overall_ncaa'] / ncaa_shuffle['team2_pt_overall_ncaa']
ncaa_tour2['d_pt_school_s16'] = ncaa_shuffle['team1_pt_school_s16'] - ncaa_shuffle['team2_pt_school_s16']
ncaa_tour2['q_pt_school_s16'] = ncaa_shuffle['team1_pt_school_s16'] / ncaa_shuffle['team2_pt_school_s16']
ncaa_tour2['d_pt_overall_s16'] = ncaa_shuffle['team1_pt_overall_s16'] - ncaa_shuffle['team2_pt_overall_s16']
ncaa_tour2['q_pt_overall_s16'] = ncaa_shuffle['team1_pt_overall_s16'] / ncaa_shuffle['team2_pt_overall_s16']
ncaa_tour2['d_pt_school_ff'] = ncaa_shuffle['team1_pt_school_ff'] - ncaa_shuffle['team2_pt_school_ff']
ncaa_tour2['q_pt_school_ff'] = ncaa_shuffle['team1_pt_school_ff'] / ncaa_shuffle['team2_pt_school_ff']
ncaa_tour2['d_pt_overall_ff'] = ncaa_shuffle['team1_pt_overall_ff'] - ncaa_shuffle['team2_pt_overall_ff']
ncaa_tour2['q_pt_overall_ff'] = ncaa_shuffle['team1_pt_overall_ff'] / ncaa_shuffle['team2_pt_overall_ff']
ncaa_tour2['d_pt_career_school_wins'] = ncaa_shuffle['team1_pt_career_school_wins'] - ncaa_shuffle['team2_pt_career_school_wins']
ncaa_tour2['q_pt_career_school_wins'] = ncaa_shuffle['team1_pt_career_school_wins'] / ncaa_shuffle['team2_pt_career_school_wins']
ncaa_tour2['d_pt_career_school_losses'] = ncaa_shuffle['team1_pt_career_school_losses'] - ncaa_shuffle['team2_pt_career_school_losses']
ncaa_tour2['q_pt_career_school_losses'] = ncaa_shuffle['team1_pt_career_school_losses'] / ncaa_shuffle['team2_pt_career_school_losses']
ncaa_tour2['d_pt_career_overall_wins'] = ncaa_shuffle['team1_pt_career_overall_wins'] - ncaa_shuffle['team2_pt_career_overall_wins']
ncaa_tour2['q_pt_career_overall_wins'] = ncaa_shuffle['team1_pt_career_overall_wins'] / ncaa_shuffle['team2_pt_career_overall_wins']
ncaa_tour2['d_pt_career_overall_losses'] = ncaa_shuffle['team1_pt_career_overall_losses'] - ncaa_shuffle['team2_pt_career_overall_losses']
ncaa_tour2['q_pt_career_overall_losses'] = ncaa_shuffle['team1_pt_career_overall_losses'] / ncaa_shuffle['team2_pt_career_overall_losses']
ncaa_tour2['d_pt_team_season_wins'] = ncaa_shuffle['team1_pt_team_season_wins'] - ncaa_shuffle['team2_pt_team_season_wins']
ncaa_tour2['q_pt_team_season_wins'] = ncaa_shuffle['team1_pt_team_season_wins'] / ncaa_shuffle['team2_pt_team_season_wins']
ncaa_tour2['d_pt_team_season_losses'] = ncaa_shuffle['team1_pt_team_season_losses'] - ncaa_shuffle['team2_pt_team_season_losses']
ncaa_tour2['q_pt_team_season_losses'] = ncaa_shuffle['team1_pt_team_season_losses'] / ncaa_shuffle['team2_pt_team_season_losses']
ncaa_tour2['d_pt_coach_season_wins'] = ncaa_shuffle['team1_pt_coach_season_wins'] - ncaa_shuffle['team2_pt_coach_season_wins']
ncaa_tour2['q_pt_coach_season_wins'] = ncaa_shuffle['team1_pt_coach_season_wins'] / ncaa_shuffle['team2_pt_coach_season_wins']
ncaa_tour2['d_pt_coach_season_losses'] = ncaa_shuffle['team1_pt_coach_season_losses'] - ncaa_shuffle['team2_pt_coach_season_losses']
ncaa_tour2['q_pt_coach_season_losses'] = ncaa_shuffle['team1_pt_coach_season_losses'] / ncaa_shuffle['team2_pt_coach_season_losses']

In [78]:
ncaa_tour2['d_ap_final'] = ncaa_shuffle['team1_ap_final'] - ncaa_shuffle['team2_ap_final']
ncaa_tour2['q_ap_final'] = ncaa_shuffle['team1_ap_final'] / ncaa_shuffle['team2_ap_final']
ncaa_tour2['d_ap_preseason'] = ncaa_shuffle['team1_ap_preseason'] - ncaa_shuffle['team2_ap_preseason']
ncaa_tour2['q_ap_preseason'] = ncaa_shuffle['team1_ap_preseason'] / ncaa_shuffle['team2_ap_preseason']
ncaa_tour2['d_coaches_before_final'] = ncaa_shuffle['team1_coaches_before_final'] - ncaa_shuffle['team2_coaches_before_final']
ncaa_tour2['q_coaches_before_final'] = ncaa_shuffle['team1_coaches_before_final'] / ncaa_shuffle['team2_coaches_before_final']
ncaa_tour2['d_coaches_preseason'] = ncaa_shuffle['team1_coaches_preseason'] - ncaa_shuffle['team2_coaches_preseason']
ncaa_tour2['q_coaches_preseason'] = ncaa_shuffle['team1_coaches_preseason'] / ncaa_shuffle['team2_coaches_preseason']

ncaa_tour2['d_fg2pct'] = ncaa_shuffle['team1_fg2pct'] - ncaa_shuffle['team2_fg2pct']
ncaa_tour2['q_fg2pct'] = ncaa_shuffle['team1_fg2pct'] / ncaa_shuffle['team2_fg2pct']
ncaa_tour2['d_fg3pct'] = ncaa_shuffle['team1_fg3pct'] - ncaa_shuffle['team2_fg3pct']
ncaa_tour2['q_fg3pct'] = ncaa_shuffle['team1_fg3pct'] / ncaa_shuffle['team2_fg3pct']
ncaa_tour2['d_ftpct'] = ncaa_shuffle['team1_ftpct'] - ncaa_shuffle['team2_ftpct']
ncaa_tour2['q_ftpct'] = ncaa_shuffle['team1_ftpct'] / ncaa_shuffle['team2_ftpct']
ncaa_tour2['d_blockpct'] = ncaa_shuffle['team1_blockpct'] - ncaa_shuffle['team2_blockpct']
ncaa_tour2['q_blockpct'] = ncaa_shuffle['team1_blockpct'] / ncaa_shuffle['team2_blockpct']
ncaa_tour2['d_oppfg2pct'] = ncaa_shuffle['team1_oppfg2pct'] - ncaa_shuffle['team2_oppfg2pct']
ncaa_tour2['q_oppfg2pct'] = ncaa_shuffle['team1_oppfg2pct'] / ncaa_shuffle['team2_oppfg2pct']
ncaa_tour2['d_oppfg3pct'] = ncaa_shuffle['team1_oppfg3pct'] - ncaa_shuffle['team2_oppfg3pct']
ncaa_tour2['q_oppfg3pct'] = ncaa_shuffle['team1_oppfg3pct'] / ncaa_shuffle['team2_oppfg3pct']
ncaa_tour2['d_oppftpct'] = ncaa_shuffle['team1_oppftpct'] - ncaa_shuffle['team2_oppftpct']
ncaa_tour2['q_oppftpct'] = ncaa_shuffle['team1_oppftpct'] / ncaa_shuffle['team2_oppftpct']
ncaa_tour2['d_oppblockpct'] = ncaa_shuffle['team1_oppblockpct'] - ncaa_shuffle['team2_oppblockpct']
ncaa_tour2['q_oppblockpct'] = ncaa_shuffle['team1_oppblockpct'] / ncaa_shuffle['team2_oppblockpct']
ncaa_tour2['d_f3grate'] = ncaa_shuffle['team1_f3grate'] - ncaa_shuffle['team2_f3grate']
ncaa_tour2['q_f3grate'] = ncaa_shuffle['team1_f3grate'] / ncaa_shuffle['team2_f3grate']
ncaa_tour2['d_oppf3grate'] = ncaa_shuffle['team1_oppf3grate'] - ncaa_shuffle['team2_oppf3grate']
ncaa_tour2['q_oppf3grate'] = ncaa_shuffle['team1_oppf3grate'] / ncaa_shuffle['team2_oppf3grate']
ncaa_tour2['d_arate'] = ncaa_shuffle['team1_arate'] - ncaa_shuffle['team2_arate']
ncaa_tour2['q_arate'] = ncaa_shuffle['team1_arate'] / ncaa_shuffle['team2_arate']
ncaa_tour2['d_opparate'] = ncaa_shuffle['team1_opparate'] - ncaa_shuffle['team2_opparate']
ncaa_tour2['q_opparate'] = ncaa_shuffle['team1_opparate'] / ncaa_shuffle['team2_opparate']
ncaa_tour2['d_stlrate'] = ncaa_shuffle['team1_stlrate'] - ncaa_shuffle['team2_stlrate']
ncaa_tour2['q_stlrate'] = ncaa_shuffle['team1_stlrate'] / ncaa_shuffle['team2_stlrate']
ncaa_tour2['d_oppstlrate'] = ncaa_shuffle['team1_oppstlrate'] - ncaa_shuffle['team2_oppstlrate']
ncaa_tour2['q_oppstlrate'] = ncaa_shuffle['team1_oppstlrate'] / ncaa_shuffle['team2_oppstlrate']
ncaa_tour2['d_tempo'] = ncaa_shuffle['team1_tempo'] - ncaa_shuffle['team2_tempo']
ncaa_tour2['q_tempo'] = ncaa_shuffle['team1_tempo'] / ncaa_shuffle['team2_tempo']
ncaa_tour2['d_adjtempo'] = ncaa_shuffle['team1_adjtempo'] - ncaa_shuffle['team2_adjtempo']
ncaa_tour2['q_adjtempo'] = ncaa_shuffle['team1_adjtempo'] / ncaa_shuffle['team2_adjtempo']
ncaa_tour2['d_oe'] = ncaa_shuffle['team1_oe'] - ncaa_shuffle['team2_oe']
ncaa_tour2['q_oe'] = ncaa_shuffle['team1_oe'] / ncaa_shuffle['team2_oe']
ncaa_tour2['d_adjoe'] = ncaa_shuffle['team1_adjoe'] - ncaa_shuffle['team2_adjoe']
ncaa_tour2['q_adjoe'] = ncaa_shuffle['team1_adjoe'] / ncaa_shuffle['team2_adjoe']
ncaa_tour2['d_de'] = ncaa_shuffle['team1_de'] - ncaa_shuffle['team2_de']
ncaa_tour2['q_de'] = ncaa_shuffle['team1_de'] / ncaa_shuffle['team2_de']
ncaa_tour2['d_adjde'] = ncaa_shuffle['team1_adjde'] - ncaa_shuffle['team2_adjde']
ncaa_tour2['q_adjde'] = ncaa_shuffle['team1_adjde'] / ncaa_shuffle['team2_adjde']
ncaa_tour2['outcome'] = ncaa_shuffle['outcome']

In [79]:
ncaa_tour2['game_id'] = ncaa_shuffle['game_id']

In [81]:
np.sum(ncaa_tour2==np.inf)

d_team_seed                   0
q_team_seed                   0
season                        0
diff_dist                     0
d_pt_school_ncaa              0
q_pt_school_ncaa              0
d_pt_overall_ncaa             0
q_pt_overall_ncaa             0
d_pt_school_s16               0
q_pt_school_s16               0
d_pt_overall_s16              0
q_pt_overall_s16              0
d_pt_school_ff                0
q_pt_school_ff                0
d_pt_overall_ff               0
q_pt_overall_ff               0
d_pt_career_school_wins       0
q_pt_career_school_wins       0
d_pt_career_school_losses     0
q_pt_career_school_losses     0
d_pt_career_overall_wins      0
q_pt_career_overall_wins      0
d_pt_career_overall_losses    0
q_pt_career_overall_losses    0
d_pt_team_season_wins         0
q_pt_team_season_wins         0
d_pt_team_season_losses       0
q_pt_team_season_losses       0
d_pt_coach_season_wins        0
q_pt_coach_season_wins        0
                             ..
d_oppftp

In [82]:
ncaa_tour2.head()

Unnamed: 0,d_team_seed,q_team_seed,season,diff_dist,d_pt_school_ncaa,q_pt_school_ncaa,d_pt_overall_ncaa,q_pt_overall_ncaa,d_pt_school_s16,q_pt_school_s16,...,d_oe,q_oe,d_adjoe,q_adjoe,d_de,q_de,d_adjde,q_adjde,outcome,game_id
0,15.0,16.0,2006.0,92.642751,1.0,2.0,-1.0,0.666667,-0.9,0.1,...,-16.2231,0.855828,-20.5565,0.822611,-0.3555,0.996262,4.4197,1.047816,0,2006-1437-1284
1,-7.0,0.416667,2010.0,-199.617812,0.0,1.0,9.0,5.5,0.0,1.0,...,-9.3264,0.917763,-8.2162,0.927961,-8.0621,0.917249,-15.4953,0.848137,0,2010-1165-1396
2,8.0,9.0,2008.0,1358.663148,-3.9,0.025,-17.0,0.055556,-1.9,0.05,...,-9.1885,0.920374,-10.8438,0.908717,1.7854,1.018813,2.3565,1.025273,0,2008-1314-1116
3,4.0,1.8,2010.0,-296.430744,-11.0,0.083333,-11.0,0.083333,-7.9,0.0125,...,-1.1237,0.989633,-3.1038,0.971581,-3.7411,0.960799,-2.7045,0.970588,0,2010-1277-1320
4,11.0,4.666667,2007.0,-3108.062656,-1.0,0.666667,-1.0,0.75,0.0,1.0,...,-12.6987,0.887268,-12.5453,0.891663,-1.4624,0.98505,-0.9315,0.990447,0,2007-1332-1275


In [83]:
corr = ncaa_tour2.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,d_team_seed,q_team_seed,season,diff_dist,d_pt_school_ncaa,q_pt_school_ncaa,d_pt_overall_ncaa,q_pt_overall_ncaa,d_pt_school_s16,q_pt_school_s16,d_pt_overall_s16,q_pt_overall_s16,d_pt_school_ff,q_pt_school_ff,d_pt_overall_ff,q_pt_overall_ff,d_pt_career_school_wins,q_pt_career_school_wins,d_pt_career_school_losses,q_pt_career_school_losses,d_pt_career_overall_wins,q_pt_career_overall_wins,d_pt_career_overall_losses,q_pt_career_overall_losses,d_pt_team_season_wins,q_pt_team_season_wins,d_pt_team_season_losses,q_pt_team_season_losses,d_pt_coach_season_wins,q_pt_coach_season_wins,d_pt_coach_season_losses,q_pt_coach_season_losses,d_ap_final,q_ap_final,d_ap_preseason,q_ap_preseason,d_coaches_before_final,q_coaches_before_final,d_coaches_preseason,q_coaches_preseason,d_fg2pct,q_fg2pct,d_fg3pct,q_fg3pct,d_ftpct,q_ftpct,d_blockpct,q_blockpct,d_oppfg2pct,q_oppfg2pct,d_oppfg3pct,q_oppfg3pct,d_oppftpct,q_oppftpct,d_oppblockpct,q_oppblockpct,d_f3grate,q_f3grate,d_oppf3grate,q_oppf3grate,d_arate,q_arate,d_opparate,q_opparate,d_stlrate,q_stlrate,d_oppstlrate,q_oppstlrate,d_tempo,q_tempo,d_adjtempo,q_adjtempo,d_oe,q_oe,d_adjoe,q_adjoe,d_de,q_de,d_adjde,q_adjde,outcome
d_team_seed,1.0,0.750827,-0.0102639,0.147625,-0.466099,-0.324449,-0.570004,-0.350473,-0.465117,-0.379245,-0.536636,-0.428145,-0.367801,-0.283735,-0.41346,-0.318106,-0.383522,-0.0161428,-0.0995274,-0.0546537,-0.521343,-0.0176463,-0.294152,-0.157097,-0.0178498,-0.0891854,0.338672,0.152696,-0.56119,-0.027952,0.599338,0.142275,0.904842,0.496114,0.655388,0.370522,0.865862,0.468028,0.653188,0.360394,-0.333813,-0.33387,-0.215395,-0.21679,-0.0787847,-0.0809301,-0.325276,-0.298472,0.404675,0.405444,0.228225,0.228163,0.0862568,0.0864019,0.160181,0.15515,0.110087,0.104642,0.0828854,0.070995,-0.158388,-0.161562,0.0989214,0.0955683,-0.063588,-0.0647128,0.175565,0.172715,-0.107895,-0.105214,-0.0865815,-0.0848782,-0.574258,-0.575077,-0.742591,-0.743682,0.420719,0.420924,0.652972,0.651846,-0.489536
q_team_seed,0.750827,1.0,-0.0127271,0.148847,-0.391467,-0.197294,-0.471758,-0.192594,-0.397235,-0.231702,-0.463289,-0.250251,-0.326578,-0.187756,-0.368334,-0.207409,-0.343749,-0.028739,-0.100808,-0.0416164,-0.440446,-0.0282021,-0.230886,-0.12841,0.0087741,-0.0543552,0.365085,0.273067,-0.572994,-0.0340916,0.583047,0.250795,0.702571,0.748892,0.538147,0.490454,0.671377,0.690947,0.538595,0.476191,-0.332506,-0.325288,-0.196771,-0.196577,-0.0724845,-0.0732009,-0.272777,-0.232349,0.328188,0.333556,0.183194,0.184981,0.103652,0.104251,0.168445,0.170826,0.109855,0.10535,0.112126,0.112166,-0.126288,-0.130215,0.120442,0.119986,-0.0748118,-0.0756312,0.148128,0.146255,-0.101236,-0.0954422,-0.0810082,-0.0763152,-0.52207,-0.510501,-0.622991,-0.60424,0.374687,0.381629,0.540049,0.553179,-0.395772
season,-0.0102639,-0.0127271,1.0,-0.0618053,0.0184608,0.0555327,0.0215902,0.0312046,0.0179208,0.0272652,0.0238786,-0.0295799,0.0117411,0.0476044,0.00651213,0.0266607,0.00619118,0.0592341,-0.0192785,-0.0183151,0.0103697,0.0575154,-0.00979871,-0.0153536,-0.0248147,0.0125391,0.000538182,0.0532406,-0.00692049,0.0621717,0.0348789,0.0490526,0.0127925,-0.0254435,-0.0319482,-0.0105339,0.0135182,-0.0484866,-0.029941,0.0114823,0.022545,0.0248237,0.0228845,0.0259008,0.0105465,0.00926979,-0.036857,-0.0552606,-0.0112556,-0.0115187,0.0287849,0.0285133,-0.0196507,-0.0200997,0.0326209,0.0128651,-0.0110538,-0.03306,0.0488788,0.0443672,-0.0115654,-0.00430447,0.0151036,0.0246093,-0.0563672,-0.0515066,-0.0600079,-0.0627473,0.0232182,0.0171327,0.0303759,0.0264254,0.0206996,0.0232841,0.0248833,0.0252126,0.0134139,0.0155159,0.00970575,0.0103634,-0.0455023
diff_dist,0.147625,0.148847,-0.0618053,1.0,-0.0658715,-0.0459246,-0.0880066,-0.00387552,-0.0657048,-0.0414207,-0.100115,-0.054028,-0.0635983,-0.0500488,-0.0730021,-0.0488355,-0.0513614,-0.0251277,0.00646291,-0.0149287,-0.0579508,-0.0218775,0.0348869,0.0229931,-0.012276,-0.0460097,0.0656995,0.0589697,-0.14056,-0.0248331,0.131645,0.0611502,0.15265,0.12162,0.11513,0.102594,0.150682,0.128083,0.111542,0.102148,-0.0778641,-0.0773571,-0.0167163,-0.0181151,0.026515,0.0256351,-0.0481519,-0.0495698,0.104195,0.101279,0.0165068,0.0164461,0.0321483,0.0325985,-0.00580582,-0.00744838,-0.0119886,-0.00861516,0.00821054,0.00152232,-0.0461334,-0.0476173,0.0222542,0.0215175,-0.0287828,-0.0337429,-0.0154109,-0.0123303,0.0286051,0.0275242,0.0314512,0.0303056,-0.0923638,-0.0923598,-0.104038,-0.103739,0.105984,0.106526,0.142995,0.14371,-0.117382
d_pt_school_ncaa,-0.466099,-0.391467,0.0184608,-0.0658715,1.0,0.540436,0.84122,0.375955,0.946964,0.644973,0.827832,0.53752,0.795779,0.585885,0.720345,0.535611,0.96525,0.032708,0.71321,0.345195,0.772924,0.0241419,0.499268,0.207026,-0.0226419,0.0390625,-0.193464,-0.0412872,0.314453,0.0181947,-0.287677,-0.035773,-0.455157,-0.253246,-0.468855,-0.322936,-0.438294,-0.248491,-0.483405,-0.326329,0.217325,0.215613,0.12026,0.120351,0.026247,0.0256826,0.258521,0.230142,-0.208061,-0.204561,-0.188524,-0.19113,-0.0397694,-0.0433601,-0.18233,-0.178213,-0.0953486,-0.0957848,-0.0666688,-0.0810341,0.0623239,0.0644042,0.0495448,0.0284766,0.0894766,0.0748032,-0.053289,-0.0522975,0.128358,0.127787,0.107664,0.107203,0.366565,0.364679,0.411331,0.409369,-0.171488,-0.170456,-0.275657,-0.272927,0.263574
q_pt_school_ncaa,-0.324449,-0.197294,0.0555327,-0.0459246,0.540436,1.0,0.488911,0.670307,0.496029,0.709848,0.459505,0.549652,0.434483,0.5538,0.412264,0.536568,0.520724,0.0320489,0.401683,0.372803,0.460118,0.0217868,0.327711,0.220656,-0.0626455,0.0115057,-0.133401,-0.0388597,0.186433,0.0120873,-0.174967,-0.0351133,-0.30966,-0.137155,-0.339899,-0.134904,-0.271595,-0.119404,-0.343243,-0.133059,0.143737,0.147293,0.0994514,0.101054,0.0340089,0.0332031,0.127531,0.101866,-0.14419,-0.141859,-0.0594667,-0.0574173,-0.0514679,-0.0523352,-0.114085,-0.114566,-0.00807997,-0.0124161,-0.0833994,-0.0852473,0.0247075,0.0254491,-0.0128466,-0.019406,0.0283365,0.0274132,-0.116536,-0.113615,0.0579774,0.0562848,0.0492911,0.0480869,0.264906,0.271064,0.303736,0.310878,-0.119358,-0.118886,-0.185129,-0.181373,0.156069
d_pt_overall_ncaa,-0.570004,-0.471758,0.0215902,-0.0880066,0.84122,0.488911,1.0,0.475984,0.810941,0.576786,0.926759,0.618459,0.681588,0.514496,0.776767,0.58106,0.795141,0.0352515,0.526466,0.281437,0.950341,0.0328333,0.664066,0.286847,-0.0314064,0.0239756,-0.219253,-0.0817493,0.340672,0.0228915,-0.299606,-0.0694401,-0.53681,-0.334017,-0.54404,-0.393406,-0.516658,-0.320014,-0.564223,-0.400046,0.154583,0.154629,0.0923652,0.0917423,0.0140651,0.0164941,0.346245,0.311264,-0.268323,-0.264541,-0.144714,-0.14749,-0.0682895,-0.0702091,-0.174781,-0.167365,-0.212559,-0.217497,-0.0352002,-0.0396765,0.063355,0.0661502,-0.00183053,-0.0142175,0.106407,0.0910819,-0.0449146,-0.0433091,0.227516,0.225702,0.19613,0.194887,0.38132,0.37868,0.463148,0.460959,-0.225559,-0.224573,-0.351505,-0.349557,0.316551
q_pt_overall_ncaa,-0.350473,-0.192594,0.0312046,-0.00387552,0.375955,0.670307,0.475984,1.0,0.336419,0.446744,0.392045,0.609529,0.289766,0.338523,0.317831,0.39244,0.349151,0.0409696,0.253192,0.266824,0.45951,0.0461015,0.3801,0.392066,-0.0779716,0.0056918,-0.124784,-0.0339499,0.15226,0.0281657,-0.137109,-0.029955,-0.304652,-0.132326,-0.302369,-0.12637,-0.277127,-0.115743,-0.297063,-0.124093,0.101802,0.102097,0.0912103,0.0950155,0.0232789,0.026109,0.120836,0.110314,-0.130084,-0.126353,-0.0461729,-0.0444941,-0.0197958,-0.0210557,-0.0786923,-0.0813599,-0.0238226,-0.0279646,-0.0365465,-0.0316075,0.0836651,0.0830111,-0.0112403,-0.0136514,0.0326903,0.023422,-0.101019,-0.0990446,0.0891365,0.0877508,0.0739045,0.0737083,0.236674,0.237901,0.292469,0.298733,-0.120887,-0.119007,-0.181597,-0.177327,0.138602
d_pt_school_s16,-0.465117,-0.397235,0.0179208,-0.0657048,0.946964,0.496029,0.810941,0.336419,1.0,0.676631,0.883577,0.56077,0.898536,0.662145,0.805725,0.595388,0.918291,0.0248363,0.631139,0.279085,0.758803,0.0169736,0.469836,0.168352,-0.000708703,0.0413741,-0.162303,-0.0491282,0.284036,0.0119188,-0.262385,-0.0447714,-0.448608,-0.261344,-0.493475,-0.375787,-0.435931,-0.256305,-0.505793,-0.376909,0.177932,0.179013,0.112266,0.111871,0.0136019,0.0138643,0.280115,0.251821,-0.201971,-0.199077,-0.191781,-0.195331,0.00277755,0.000429941,-0.132513,-0.129342,-0.126463,-0.126196,-0.0816215,-0.0979035,0.0424146,0.0447246,0.0247256,0.00882237,0.0835795,0.0717377,-0.0419102,-0.0376159,0.156066,0.155684,0.138058,0.137891,0.340122,0.339199,0.402702,0.401786,-0.160718,-0.159636,-0.284536,-0.282481,0.269984
q_pt_school_s16,-0.379245,-0.231702,0.0272652,-0.0414207,0.644973,0.709848,0.576786,0.446744,0.676631,1.0,0.612694,0.788622,0.615242,0.818259,0.56144,0.749207,0.618429,0.0122013,0.421333,0.268666,0.538997,0.00232334,0.34058,0.150614,0.00105699,0.0234839,-0.121462,-0.0442205,0.215196,-0.00523975,-0.210865,-0.041144,-0.370166,-0.160544,-0.395875,-0.155291,-0.335021,-0.147036,-0.393922,-0.152807,0.114464,0.118215,0.0824521,0.0804207,0.0180732,0.0176319,0.208284,0.192371,-0.19013,-0.187613,-0.127103,-0.127961,0.0133272,0.0124427,-0.100294,-0.0976264,-0.0926323,-0.0929358,-0.0969699,-0.094308,0.0204781,0.0198168,-0.0192481,-0.0264337,0.0667293,0.0580363,-0.0865264,-0.0823307,0.124721,0.123928,0.108816,0.107845,0.266612,0.271536,0.32673,0.333155,-0.165703,-0.165341,-0.258016,-0.253358,0.192849


In [84]:
corr.to_csv('corr2.csv')

In [85]:
# Delete all variables with correlation higher than 0.9
del ncaa_tour2['d_ap_final']

In [86]:
del ncaa_tour2['d_pt_school_s16']
del ncaa_tour2['d_pt_career_school_wins']
del ncaa_tour2['d_pt_overall_s16']
del ncaa_tour2['d_pt_career_overall_wins']
del ncaa_tour2['d_pt_overall_ff']
del ncaa_tour2['q_pt_coach_season_wins']
del ncaa_tour2['d_coaches_preseason']
del ncaa_tour2['q_coaches_preseason']
del ncaa_tour2['q_fg2pct']
del ncaa_tour2['q_fg3pct']
del ncaa_tour2['q_ftpct']
del ncaa_tour2['q_blockpct']
del ncaa_tour2['q_oppfg3pct']
del ncaa_tour2['q_oppfg2pct']
del ncaa_tour2['q_oppftpct']
del ncaa_tour2['q_oppblockpct']
del ncaa_tour2['q_f3grate']
del ncaa_tour2['q_oppf3grate']
del ncaa_tour2['q_arate']
del ncaa_tour2['q_opparate']
del ncaa_tour2['q_stlrate']
del ncaa_tour2['q_oppstlrate']
del ncaa_tour2['d_tempo']
del ncaa_tour2['q_tempo']
del ncaa_tour2['q_adjtempo']
del ncaa_tour2['q_oe']
del ncaa_tour2['q_adjoe']
del ncaa_tour2['q_de']
del ncaa_tour2['q_adjde']

In [87]:
ncaa_tour2.to_csv('correlated_removed.csv')

In [88]:
ncaa_tour2.columns

Index(['d_team_seed', 'q_team_seed', 'season', 'diff_dist', 'd_pt_school_ncaa',
       'q_pt_school_ncaa', 'd_pt_overall_ncaa', 'q_pt_overall_ncaa',
       'q_pt_school_s16', 'q_pt_overall_s16', 'd_pt_school_ff',
       'q_pt_school_ff', 'q_pt_overall_ff', 'q_pt_career_school_wins',
       'd_pt_career_school_losses', 'q_pt_career_school_losses',
       'q_pt_career_overall_wins', 'd_pt_career_overall_losses',
       'q_pt_career_overall_losses', 'd_pt_team_season_wins',
       'q_pt_team_season_wins', 'd_pt_team_season_losses',
       'q_pt_team_season_losses', 'd_pt_coach_season_wins',
       'd_pt_coach_season_losses', 'q_pt_coach_season_losses', 'q_ap_final',
       'd_ap_preseason', 'q_ap_preseason', 'd_coaches_before_final',
       'q_coaches_before_final', 'd_fg2pct', 'd_fg3pct', 'd_ftpct',
       'd_blockpct', 'd_oppfg2pct', 'd_oppfg3pct', 'd_oppftpct',
       'd_oppblockpct', 'd_f3grate', 'd_oppf3grate', 'd_arate', 'd_opparate',
       'd_stlrate', 'd_oppstlrate', 'd_adjtempo'

In [170]:
rf_train1 = ncaa_tour2[['d_team_seed', 'q_team_seed', 'diff_dist', 'd_pt_school_ncaa',
       'q_pt_school_ncaa', 'd_pt_overall_ncaa', 'q_pt_overall_ncaa',
       'q_pt_school_s16', 'q_pt_overall_s16', 'd_pt_school_ff',
       'q_pt_school_ff', 'q_pt_overall_ff', 'q_pt_career_school_wins',
       'd_pt_career_school_losses', 'q_pt_career_school_losses',
       'q_pt_career_overall_wins', 'd_pt_career_overall_losses',
       'q_pt_career_overall_losses', 'd_pt_team_season_wins',
       'q_pt_team_season_wins', 'd_pt_team_season_losses',
       'q_pt_team_season_losses', 'd_pt_coach_season_wins',
       'd_pt_coach_season_losses', 'q_pt_coach_season_losses', 'q_ap_final',
       'd_ap_preseason', 'q_ap_preseason', 'd_coaches_before_final',
       'q_coaches_before_final', 'd_fg2pct', 'd_fg3pct', 'd_ftpct',
       'd_blockpct', 'd_oppfg2pct', 'd_oppfg3pct', 'd_oppftpct',
       'd_oppblockpct', 'd_f3grate', 'd_oppf3grate', 'd_arate', 'd_opparate',
       'd_stlrate', 'd_oppstlrate', 'd_adjtempo', 'd_oe', 'd_adjoe', 'd_de',
       'd_adjde']]

In [171]:
rf_train1.head()

Unnamed: 0,d_team_seed,q_team_seed,diff_dist,d_pt_school_ncaa,q_pt_school_ncaa,d_pt_overall_ncaa,q_pt_overall_ncaa,q_pt_school_s16,q_pt_overall_s16,d_pt_school_ff,...,d_oppf3grate,d_arate,d_opparate,d_stlrate,d_oppstlrate,d_adjtempo,d_oe,d_adjoe,d_de,d_adjde
0,15.0,16.0,92.642751,1.0,2.0,-1.0,0.666667,0.1,0.1,0.0,...,5.9113,13.9719,-0.9213,-0.0048,0.0339,-1.1163,-16.2231,-20.5565,-0.3555,4.4197
1,-7.0,0.416667,-199.617812,0.0,1.0,9.0,5.5,1.0,1.0,0.0,...,0.7215,1.4794,-4.9937,-0.0334,-0.012,-4.5776,-9.3264,-8.2162,-8.0621,-15.4953
2,8.0,9.0,1358.663148,-3.9,0.025,-17.0,0.055556,0.05,0.009091,-0.9,...,0.7602,2.033,2.5341,-0.0084,-0.0074,-6.02,-9.1885,-10.8438,1.7854,2.3565
3,4.0,1.8,-296.430744,-11.0,0.083333,-11.0,0.083333,0.0125,0.0125,-4.9,...,-1.0024,-13.0878,-10.5639,-0.0029,-0.0203,-7.2159,-1.1237,-3.1038,-3.7411,-2.7045
4,11.0,4.666667,-3108.062656,-1.0,0.666667,-1.0,0.75,1.0,1.0,0.0,...,-2.7551,5.2012,-2.9457,-0.0068,0.0323,-7.7949,-12.6987,-12.5453,-1.4624,-0.9315


In [91]:
##This part need gurther grid search!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [172]:
sel = SelectFromModel(RandomForestClassifier(n_estimators = 1000 ))

In [173]:
sel.fit(rf_train1,ncaa_tour2['outcome'])

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [174]:
selected_feat = rf_train1.columns[(sel.get_support())]
len(selected_feat)

12

In [175]:
print(selected_feat)

Index(['d_team_seed', 'q_team_seed', 'q_pt_overall_s16', 'q_ap_final',
       'd_ap_preseason', 'q_ap_preseason', 'd_coaches_before_final',
       'q_coaches_before_final', 'd_oppfg3pct', 'd_oe', 'd_adjoe', 'd_adjde'],
      dtype='object')


In [98]:
ncaa_selected = ncaa_tour2[selected_feat]
ncaa_selected['game_id'] = ncaa_tour2['game_id']
ncaa_selected['season'] = ncaa_tour2['season']
ncaa_selected['outcome'] = ncaa_tour2['outcome']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [99]:
ncaa_selected.head()

Unnamed: 0,d_team_seed,q_team_seed,q_pt_overall_s16,q_ap_final,d_ap_preseason,q_ap_preseason,d_coaches_before_final,q_coaches_before_final,d_oppfg3pct,d_oe,d_adjoe,d_adjde,game_id,season,outcome
0,15.0,16.0,0.1,22.5,40.0,9.0,41.0,11.25,-1.5249,-16.2231,-20.5565,4.4197,2006-1437-1284,2006.0,0
1,-7.0,0.416667,1.0,0.377778,0.0,1.0,-32.0,0.288889,-5.7315,-9.3264,-8.2162,-15.4953,2010-1165-1396,2010.0,0
2,8.0,9.0,0.009091,45.0,18.0,19.0,44.0,45.0,0.6558,-9.1885,-10.8438,2.3565,2008-1314-1116,2008.0,0
3,4.0,1.8,0.0125,4.090909,43.0,22.5,12.0,2.0,0.4313,-1.1237,-3.1038,-2.7045,2010-1277-1320,2010.0,0
4,11.0,4.666667,1.0,2.8125,0.0,1.0,33.0,3.75,0.6466,-12.6987,-12.5453,-0.9315,2007-1332-1275,2007.0,0


In [115]:
ncaa_selected_train = ncaa_selected[ncaa_selected['season'] != 2018].reset_index(drop=True)
ncaa_selected_testing = ncaa_selected[ncaa_selected['season'] == 2018].reset_index(drop=True)
input_train = ncaa_selected_train[selected_feat]
input_test = ncaa_selected_testing[selected_feat]
input_all = ncaa_selected[selected_feat]

In [105]:
# Logistic Regression
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [141]:
log_model = LogisticRegression(random_state=0, solver='liblinear',multi_class='ovr')

In [142]:
log_model.fit(input_train, ncaa_selected_train['outcome'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [144]:
ncaa_selected_testing['log_predict'] = log_model.predict(input_test)

In [146]:
log_model.predict_proba(input_test)

array([[0.35368747, 0.64631253],
       [0.66625662, 0.33374338],
       [0.76383568, 0.23616432],
       [0.6585151 , 0.3414849 ],
       [0.95574435, 0.04425565],
       [0.88886371, 0.11113629],
       [0.19718642, 0.80281358],
       [0.73614461, 0.26385539],
       [0.38086498, 0.61913502],
       [0.50124335, 0.49875665],
       [0.4656934 , 0.5343066 ],
       [0.76155661, 0.23844339],
       [0.76761818, 0.23238182],
       [0.68583668, 0.31416332],
       [0.44317435, 0.55682565],
       [0.32979961, 0.67020039],
       [0.34013682, 0.65986318],
       [0.08044006, 0.91955994],
       [0.71213922, 0.28786078],
       [0.53076894, 0.46923106],
       [0.72859237, 0.27140763],
       [0.77899576, 0.22100424],
       [0.80082616, 0.19917384],
       [0.65537395, 0.34462605],
       [0.55297463, 0.44702537],
       [0.35102354, 0.64897646],
       [0.87717959, 0.12282041],
       [0.06563181, 0.93436819],
       [0.68914645, 0.31085355],
       [0.17633216, 0.82366784],
       [0.

In [153]:
accuracy_score(ncaa_selected_testing['outcome'], log_model.predict(input_test))

0.7611940298507462

In [152]:
log_loss(ncaa_selected_testing['outcome'], log_model.predict_proba(input_test))

0.5689301356507891

In [120]:
#svm svc model
from sklearn.svm import SVC

In [155]:
svm_model = SVC(probability = True)
svm_model.fit(input_train, ncaa_selected_train['outcome'])



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [156]:
ncaa_selected_testing['predict'] = svm_model.predict(input_test)

In [157]:
#How to get probability here?
svm_model.predict_proba(input_test)

array([[0.49345646, 0.50654354],
       [0.49345646, 0.50654354],
       [0.49345645, 0.50654355],
       [0.49457258, 0.50542742],
       [0.49360659, 0.50639341],
       [0.5112084 , 0.4887916 ],
       [0.49345646, 0.50654354],
       [0.49364593, 0.50635407],
       [0.52801337, 0.47198663],
       [0.48919489, 0.51080511],
       [0.49024681, 0.50975319],
       [0.49345635, 0.50654365],
       [0.49345685, 0.50654315],
       [0.49498329, 0.50501671],
       [0.49345647, 0.50654353],
       [0.4933228 , 0.5066772 ],
       [0.48441079, 0.51558921],
       [0.49345646, 0.50654354],
       [0.49341646, 0.50658354],
       [0.47642719, 0.52357281],
       [0.49323938, 0.50676062],
       [0.49350675, 0.50649325],
       [0.5       , 0.5       ],
       [0.5       , 0.5       ],
       [0.56451431, 0.43548569],
       [0.48364444, 0.51635556],
       [0.49345968, 0.50654032],
       [0.49322033, 0.50677967],
       [0.49389155, 0.50610845],
       [0.49345678, 0.50654322],
       [0.

In [158]:
accuracy_score(ncaa_selected_testing['outcome'], svm_model.predict(input_test))

0.44776119402985076

In [159]:
log_loss(ncaa_selected_testing['outcome'],svm_model.predict_proba(input_test))

0.6984975195921507

In [131]:
##Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [132]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)

In [134]:
rf_model.fit(input_train, ncaa_selected_train['outcome'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [137]:
ncaa_selected_testing['predict'] = rf_model.predict(input_test)

In [139]:
rf_model.predict_proba(input_test)

array([[0.31531192, 0.68468808],
       [0.60341992, 0.39658008],
       [0.59923633, 0.40076367],
       [0.68131153, 0.31868847],
       [0.82873996, 0.17126004],
       [0.79137462, 0.20862538],
       [0.21130681, 0.78869319],
       [0.76269132, 0.23730868],
       [0.53897722, 0.46102278],
       [0.49608152, 0.50391848],
       [0.67449217, 0.32550783],
       [0.55407799, 0.44592201],
       [0.76378164, 0.23621836],
       [0.73838765, 0.26161235],
       [0.41279707, 0.58720293],
       [0.48958267, 0.51041733],
       [0.41951193, 0.58048807],
       [0.18739264, 0.81260736],
       [0.50883019, 0.49116981],
       [0.53168875, 0.46831125],
       [0.68986823, 0.31013177],
       [0.77975669, 0.22024331],
       [0.65143109, 0.34856891],
       [0.5947115 , 0.4052885 ],
       [0.53196652, 0.46803348],
       [0.49794256, 0.50205744],
       [0.78896346, 0.21103654],
       [0.16877407, 0.83122593],
       [0.76652942, 0.23347058],
       [0.25504547, 0.74495453],
       [0.

In [160]:
accuracy_score(ncaa_selected_testing['outcome'], rf_model.predict(input_test))

0.7761194029850746

In [162]:
log_loss(ncaa_selected_testing['outcome'],rf_model.predict_proba(input_test))

0.5337522280269728

In [182]:
test_2019 = pd.read_csv('NCAA_Tourney_2019.csv')

In [183]:
test_2019_selected = pd.DataFrame()

In [184]:
test_2019.columns

Index(['game_id', 'team1_id', 'team2_id', 'season', 'team1_seed', 'team2_seed',
       'strongseed', 'weakseed', 'host_lat', 'host_long',
       'team1_pt_school_ncaa', 'team1_pt_overall_ncaa', 'team1_pt_school_s16',
       'team1_pt_overall_s16', 'team1_pt_school_ff', 'team1_pt_overall_ff',
       'team1_pt_career_school_wins', 'team1_pt_career_school_losses',
       'team1_pt_career_overall_losses', 'team1_pt_team_season_wins',
       'team1_pt_team_season_losses', 'team1_pt_coach_season_wins',
       'team1_pt_coach_season_losses', 'team1_pt_career_overall_wins',
       'team2_pt_school_ncaa', 'team2_pt_overall_ncaa', 'team2_pt_school_s16',
       'team2_pt_overall_s16', 'team2_pt_school_ff', 'team2_pt_overall_ff',
       'team2_pt_career_school_wins', 'team2_pt_career_school_losses',
       'team2_pt_career_overall_losses', 'team2_pt_team_season_wins',
       'team2_pt_team_season_losses', 'team2_pt_coach_season_wins',
       'team2_pt_coach_season_losses', 'team2_pt_career_overall

In [185]:
for i in list(test_2019.columns[1:]):
    test_2019[i].fillna(45.0,inplace =True)
    test_2019[i] = test_2019[i].astype(float).replace(0,0.1)

'd_team_seed', 'q_team_seed', 'q_pt_overall_s16', 'q_ap_final',
       'd_ap_preseason', 'q_ap_preseason', 'd_coaches_before_final',
       'q_coaches_before_final', 'd_oppfg3pct', 'd_oe', 'd_adjoe', 'd_adjde'

In [187]:
test_2019_selected['d_team_seed'] = test_2019['team1_seed']-test_2019['team2_seed']
test_2019_selected['q_team_seed'] = test_2019['team1_seed']/test_2019['team2_seed']
test_2019_selected['q_pt_overall_s16'] = test_2019['team1_pt_overall_s16']/test_2019['team2_pt_overall_s16']
test_2019_selected['q_ap_final'] = test_2019['team1_ap_final'] / test_2019['team2_ap_final']
test_2019_selected['d_ap_preseason'] = test_2019['team1_ap_preseason'] - test_2019['team2_ap_preseason']
test_2019_selected['q_ap_preseason'] = test_2019['team1_ap_preseason'] / test_2019['team2_ap_preseason']
test_2019_selected['d_coaches_before_final'] = test_2019['team1_coaches_before_final'] - test_2019['team2_coaches_before_final']
test_2019_selected['q_coaches_before_final'] = test_2019['team1_coaches_before_final'] / test_2019['team2_coaches_before_final']

test_2019_selected['d_oppfg3pct'] = test_2019['team1_oppfg3pct'] - test_2019['team2_oppfg3pct']
test_2019_selected['d_adjoe'] = test_2019['team1_adjoe'] - test_2019['team2_adjoe']
test_2019_selected['d_oe'] = test_2019['team1_oe'] - test_2019['team2_oe']
test_2019_selected['d_adjde'] = test_2019['team1_adjde'] - test_2019['team2_adjde']
test_2019_selected['game_id'] = test_2019['game_id']

Unnamed: 0,d_team_seed,q_team_seed,q_pt_overall_s16,q_ap_final,d_ap_preseason,q_ap_preseason,d_coaches_before_final,q_coaches_before_final,d_oppfg3pct,d_adjoe,d_oe,d_adjde
0,-1.0,0.500000,1.846154,0.833333,-6.0,0.400000,-2.0,0.714286,-2.432645,-1.4770,-1.273,-2.3289
1,-2.0,0.333333,240.000000,0.555556,-19.0,0.173913,-4.0,0.555556,-4.753991,1.5840,0.710,-9.3759
2,-3.0,0.250000,8.000000,0.312500,-11.0,0.266667,-10.0,0.333333,-3.827753,1.6530,-0.820,-5.8385
3,-4.0,0.200000,4.800000,0.111111,-14.0,0.222222,-40.0,0.111111,-5.936236,2.7080,1.252,-8.5879
4,-5.0,0.166667,12.000000,0.238095,-41.0,0.088889,-16.0,0.238095,-2.629050,6.2580,5.850,-5.7211
5,-6.0,0.142857,6.000000,0.111111,-41.0,0.088889,-40.0,0.111111,-2.361440,6.7010,5.941,-4.2534
6,-7.0,0.125000,240.000000,0.111111,-41.0,0.088889,-40.0,0.111111,1.722151,15.7770,10.404,-0.4961
7,-8.0,0.111111,24.000000,0.111111,-41.0,0.088889,-40.0,0.111111,-1.963862,9.7540,6.115,-7.6140
8,-9.0,0.100000,240.000000,0.111111,-41.0,0.088889,-40.0,0.111111,-4.528888,9.4010,9.886,-8.0886
9,-10.0,0.090909,240.000000,0.111111,-41.0,0.088889,-40.0,0.111111,-5.341530,5.2520,-3.209,-13.6411


In [212]:
test_2019_selected['prediction'] = log_model.predict(test_2019_selected[test_2019_selected.columns[:-2]])

In [213]:
test_2019_selected['probability'] = log_model.predict_proba(test_2019_selected[test_2019_selected.columns[:-2]])[:,1]

In [214]:
test_2019_selected.to_csv('2019_prediction.csv')