# Jonathan Halverson
# Monday, March 27, 2017
# Part 13: Simple predictive models including Experience

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')
from scipy.stats import binom

In [None]:
iofile = 'data/fightmetric_cards/fightmetric_fights_CLEAN_3-6-2017.csv'
fights = pd.read_csv(iofile, header=0, parse_dates=['Date'])
fights.head(3)

We remove no contests immediately to avoid inf (infinity) in WinRatio:

In [None]:
# Drop no contests. The .copy() makes the result an independent frame, so the
# column assignments made on it later do not raise SettingWithCopyWarning.
fights = fights[fights.Outcome != 'no contest'].copy()

In [None]:
def _prior_record(earlier, fighter):
    """Return (number of fights, number of wins) for *fighter* in *earlier*.

    *earlier* is a frame of bouts with 'Winner' and 'Loser' columns; a row
    counts as a win when the fighter appears in the 'Winner' column.
    """
    # NOTE(review): any non-'no contest' row counts here, so a drawn bout (if
    # the data has them) is credited as a win to whoever is listed under
    # 'Winner' -- confirm this is intended.
    as_winner = earlier.Winner == fighter
    as_loser = earlier.Loser == fighter
    return int((as_winner | as_loser).sum()), int(as_winner.sum())


# For every bout, record how many UFC fights and wins each fighter had
# strictly before the date of that bout.
num = []
num_wins = []
num_L = []
num_wins_L = []
counted = fights.Outcome != 'no contest'
for index, row in fights.iterrows():
    # The same filter is used for both the fight count and the win count, so
    # a win can never be counted without its fight (which would push the
    # ratio above 1, or to inf when the fight count is zero).
    earlier = fights[counted & (fights.Date < row['Date'])]

    n_fights, n_wins = _prior_record(earlier, row['Winner'])
    num.append(n_fights)
    num_wins.append(n_wins)

    n_fights, n_wins = _prior_record(earlier, row['Loser'])
    num_L.append(n_fights)
    num_wins_L.append(n_wins)

fights['Num'] = num
fights['Num_L'] = num_L
fights['Num_Wins'] = num_wins
fights['Num_Wins_L'] = num_wins_L
# 0 / 0 yields NaN for fighters with no earlier bouts.
fights['WinRatio'] = fights.Num_Wins / fights.Num
fights['WinRatio_L'] = fights.Num_Wins_L / fights.Num_L
len(num), len(num_L), max(num), max(num_L), min(num), min(num_L)

In [None]:
# Show every bout involving this fighter, whether listed as winner or loser.
x = 'Gray Maynard'
fights[(fights.Winner == x) | (fights.Loser == x)]

In [None]:
# Sanity check: a win ratio above 1 would mean more wins than fights. Check
# both fighters and show the names and ratios for both sides of the bout.
fights[(fights.WinRatio > 1) | (fights.WinRatio_L > 1)][['Winner', 'WinRatio', 'Loser', 'WinRatio_L']]

In [None]:
fights.describe()

### Note that win ratio can be NaN

In [None]:
fights.WinRatio.loc[10:20]

In [None]:
iofile = 'data/ufc_name_education.csv'
ufc = pd.read_csv(iofile, header=0)
ufc.head(3)

In [None]:
# Attach per-fighter attributes to every bout, once for the winner and once
# for the loser. Loser-side columns get the '_L' suffix.
iofile = 'data/fightmetric_fighters_with_corrections_from_UFC_Wikipedia_CLEAN.csv'
fighters = pd.read_csv(iofile, header=0, parse_dates=['Dob'])
cols = ['Name', 'Height', 'Reach', 'LegReach', 'Stance', 'Dob']
# Winner attributes keep their plain column names.
df = fights.merge(fighters[cols], how='left', left_on='Winner', right_on='Name')
# Loser attributes collide with the winner's columns and so become *_L.
df = df.merge(fighters[cols], how='left', left_on='Loser', right_on='Name', suffixes=('', '_L'))
# The join keys must be dropped here; the next two merges add 'Name' and
# 'Name_L' again.
df = df.drop(['Name', 'Name_L'], axis=1)
# Same two-step pattern for the education table (joined on its 'Name' column).
df = df.merge(ufc, left_on='Winner', right_on='Name', how='left')
df = df.merge(ufc, left_on='Loser', right_on='Name', how='left', suffixes=('', '_L'))
# Fighters without a match in the education table are given 0.0.
df.Education = df.Education.fillna(0.0)
df.Education_L = df.Education_L.fillna(0.0)
df = df.drop(['Name', 'Name_L'], axis=1)
df.head(3)

In [None]:
iofile = 'data/fightmetric_career_stats.csv'
cstats = pd.read_csv(iofile, header=0)

In [None]:
# Attach career statistics for the winner (plain column names) and then for
# the loser ('_L' suffix), and discard the two join-key columns afterwards.
with_winner_stats = df.merge(cstats, how='left', left_on='Winner', right_on='Name')
with_both_stats = with_winner_stats.merge(
    cstats, how='left', left_on='Loser', right_on='Name', suffixes=('', '_L'))
df = with_both_stats.drop(['Name', 'Name_L'], axis=1)

In [None]:
df.info()

Filter fights to defeats since January 1, 2005:

In [None]:
# Keep bouts dated after 1 January 2005 whose outcome is 'def.'. The copy
# lets fs be modified independently of df.
cutoff = pd.to_datetime('2005-01-01')
after_cutoff = df.Date > cutoff
is_defeat = df.Outcome == 'def.'
fs = df[after_cutoff & is_defeat].copy()
len(fs)

In [None]:
fs.info()

### Younger wins

In [None]:
# Rule: the younger fighter wins. Use bouts where both birth dates are known
# and differ.
dob_known = fs.Dob.notnull() & fs.Dob_L.notnull()
yw = fs[dob_known & (fs.Dob != fs.Dob_L)]
# A later date of birth means the winner was the younger fighter.
wins = len(yw[yw.Dob > yw.Dob_L])
total = len(yw)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Longer reach wins

In [None]:
# Rule: the fighter with the longer reach wins. Use bouts where both reaches
# are known and differ.
reach_known = fs.Reach.notnull() & fs.Reach_L.notnull()
lr = fs[reach_known & (fs.Reach != fs.Reach_L)]
wins = len(lr[lr.Reach > lr.Reach_L])
total = len(lr)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

What if we don't exclude nulls?

In [None]:
# Same rule as above, but without removing missing reaches first.
reach_differs = fs.Reach != fs.Reach_L
lr = fs[reach_differs]
wins = len(lr[lr.Reach > lr.Reach_L])
total = len(lr)
excluded = len(fs) - total
wins, total, wins / float(total), excluded

In [None]:
71.0 > np.nan

In [None]:
71.0 == np.nan

In [None]:
lr[['Reach', 'Reach_L']].info()

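We see that rows with a null reach (e.g. in Reach_L) pass the `!=` filter, because NaN != x is True, but fail the `>` comparison, because x > NaN is False. They therefore increase the total without adding wins, which gave a win ratio less than 1/2.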

### Taller wins

In [None]:
# Rule: the taller fighter wins. Use bouts where both heights are known and
# differ.
height_known = fs.Height.notnull() & fs.Height_L.notnull()
ht = fs[height_known & (fs.Height != fs.Height_L)]
wins = len(ht[ht.Height > ht.Height_L])
total = len(ht)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Stance

In [None]:
# Rule: the southpaw wins. Use only bouts that pair one Orthodox fighter
# with one Southpaw fighter.
stance_known = fs.Stance.notnull() & fs.Stance_L.notnull()
orthodox_won = (fs.Stance == 'Orthodox') & (fs.Stance_L == 'Southpaw')
southpaw_won = (fs.Stance == 'Southpaw') & (fs.Stance_L == 'Orthodox')
st = fs[stance_known & (orthodox_won | southpaw_won)]
wins = len(st[st.Stance == 'Southpaw'])
total = len(st)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Education

In [None]:
# Rule: the fighter whose Education value is 1 wins. Use bouts where the two
# Education values are present and differ.
education_known = fs.Education.notnull() & fs.Education_L.notnull()
ed = fs[education_known & (fs.Education != fs.Education_L)]
wins = len(ed[ed.Education == 1])
total = len(ed)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Leg reach

In [None]:
# Rule: the fighter with the longer leg reach wins. Use bouts where both leg
# reaches are known and differ.
leg_known = fs.LegReach.notnull() & fs.LegReach_L.notnull()
lg = fs[leg_known & (fs.LegReach != fs.LegReach_L)]
wins = len(lg[lg.LegReach > lg.LegReach_L])
total = len(lg)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Predict winner has more UFC fights

In [None]:
# Rule: the fighter with more previous UFC fights wins. Use bouts where the
# two fight counts are present and differ.
counts_known = fs.Num.notnull() & fs.Num_L.notnull()
nf = fs[counts_known & (fs.Num != fs.Num_L)]
wins = len(nf[nf.Num > nf.Num_L])
total = len(nf)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Predict winner has higher win ratio (for 5 fights or more)

In [None]:
# Rule: the fighter with the higher pre-fight win ratio wins. Both fighters
# need more than 4 previous fights and the two ratios must differ.
ratio_known = fs.WinRatio.notnull() & fs.WinRatio_L.notnull()
experienced = (fs.Num > 4) & (fs.Num_L > 4)
wr = fs[ratio_known & (fs.WinRatio != fs.WinRatio_L) & experienced]
wins = len(wr[wr.WinRatio > wr.WinRatio_L])
total = len(wr)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

In [None]:
# How does the result change if we replace NaN with 0.5 and don't require a certain number of fights?
# Ans: (1571, 2728, 0.5758797653958945, 833, 2.331962646952389e-15)
#fs.WinRatio = fs.WinRatio.fillna(0.5)
#fs.WinRatio_L = fs.WinRatio_L.fillna(0.5)

### Higher win ratio and younger

In [None]:
# Rule: the fighter who is both younger and has the higher win ratio wins.
# Valid bouts: ratios and birth dates known and different, and more than 4
# previous fights on each side.
ratio_ok = fs.WinRatio.notnull() & fs.WinRatio_L.notnull() & (fs.WinRatio != fs.WinRatio_L)
dob_ok = fs.Dob.notnull() & fs.Dob_L.notnull() & (fs.Dob != fs.Dob_L)
experienced = (fs.Num > 4) & (fs.Num_L > 4)
vd = fs[ratio_ok & dob_ok & experienced]

# The rule only applies when one fighter holds both advantages.
winner_has_both = (vd.WinRatio > vd.WinRatio_L) & (vd.Dob > vd.Dob_L)
loser_has_both = (vd.WinRatio < vd.WinRatio_L) & (vd.Dob < vd.Dob_L)
sp = vd[winner_has_both | loser_has_both]

wins = len(sp[sp.WinRatio > sp.WinRatio_L])
total = len(sp)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Reach advantage of 4 inches or more

In [None]:
# Rule: the longer reach wins, restricted to bouts where the two reaches
# differ by at least 4.
vd = fs[fs.Reach.notnull() & fs.Reach_L.notnull()]
big_gap = (vd.Reach - vd.Reach_L).abs() >= 4

sp = vd[big_gap]
wins = len(sp[sp.Reach > sp.Reach_L])
total = len(sp)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

In [None]:
sp[['Winner', 'Reach', 'Loser', 'Reach_L']].head(10)

### 4 years or more younger

In [None]:
# Rule: the younger fighter wins, restricted to bouts where the two birth
# dates are at least 4 years apart.
vd = fs[pd.notnull(fs.Dob) & pd.notnull(fs.Dob_L)]
# Current pandas rejects division by np.timedelta64(1, 'Y'), so convert whole
# days to years explicitly. 365.2425 days is the length NumPy assigns to the
# 'Y' unit, which keeps the figures the same as before.
gap_years = (vd.Dob - vd.Dob_L).dt.days / 365.2425
msk1 = gap_years.abs() >= 4

sp = vd[msk1]
wins = sp[(sp.Dob > sp.Dob_L)].shape[0]
total = sp.shape[0]
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
wins, total, wins / float(total), fs.shape[0] - total, 2 * binom.cdf(p=0.5, k=min(wins, total - wins), n=total)

In [None]:
sp[['Winner', 'Dob', 'Loser', 'Dob_L']].head(20)

### Younger and greater reach

In [None]:
# Rule: the fighter who is both younger and longer-reached wins. Valid bouts
# have both birth dates and both reaches.
dob_known = fs.Dob.notnull() & fs.Dob_L.notnull()
reach_known = fs.Reach.notnull() & fs.Reach_L.notnull()
vd = fs[dob_known & reach_known]

# The rule only applies when one fighter holds both advantages.
winner_has_both = (vd.Reach > vd.Reach_L) & (vd.Dob > vd.Dob_L)
loser_has_both = (vd.Reach < vd.Reach_L) & (vd.Dob < vd.Dob_L)
sp = vd[winner_has_both | loser_has_both]

wins = len(sp[(sp.Dob > sp.Dob_L) & (sp.Reach > sp.Reach_L)])
total = len(sp)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Southpaw and younger

In [None]:
# Rule: a younger Southpaw beats an older Orthodox fighter. Valid bouts have
# both stances and both birth dates.
stance_known = fs.Stance.notnull() & fs.Stance_L.notnull()
dob_known = fs.Dob.notnull() & fs.Dob_L.notnull()
vd = fs[stance_known & dob_known]

# Either the younger southpaw won, or the younger southpaw lost.
favoured_won = (vd.Stance == 'Southpaw') & (vd.Stance_L == 'Orthodox') & (vd.Dob > vd.Dob_L)
favoured_lost = (vd.Stance == 'Orthodox') & (vd.Stance_L == 'Southpaw') & (vd.Dob < vd.Dob_L)
sp = vd[favoured_won | favoured_lost]

wins = len(sp[sp.Stance == 'Southpaw'])
total = len(sp)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

### Southpaw, younger and greater reach

In [None]:
# Rule: a Southpaw who is also younger and longer-reached beats an Orthodox
# opponent. Valid bouts have stances, birth dates and reaches for both sides.
stance_known = fs.Stance.notnull() & fs.Stance_L.notnull()
dob_known = fs.Dob.notnull() & fs.Dob_L.notnull()
reach_known = fs.Reach.notnull() & fs.Reach_L.notnull()
vd = fs[stance_known & dob_known & reach_known]

# Either the fighter holding all three advantages won, or that fighter lost.
favoured_won = ((vd.Stance == 'Southpaw') & (vd.Stance_L == 'Orthodox')
                & (vd.Reach > vd.Reach_L) & (vd.Dob > vd.Dob_L))
favoured_lost = ((vd.Stance == 'Orthodox') & (vd.Stance_L == 'Southpaw')
                 & (vd.Reach < vd.Reach_L) & (vd.Dob < vd.Dob_L))
sp = vd[favoured_won | favoured_lost]

wins = len(sp[sp.Stance == 'Southpaw'])
total = len(sp)
excluded = len(fs) - total
# Doubled lower-tail binomial probability (p = 0.5) of the rarer outcome.
p_value = 2 * binom.cdf(min(wins, total - wins), total, 0.5)
wins, total, wins / float(total), excluded, p_value

In [None]:
sp[['Winner', 'Dob', 'Reach', 'Stance', 'Loser', 'Dob_L', 'Reach_L', 'Stance_L']].head()

### Overall win ratio

In [None]:
# Build a per-fighter table (indexed by name) of total bouts, wins and losses
# over the filtered bouts in fs.
# Series.append was removed in pandas 2.0; pd.concat stacks the two name
# columns in every pandas version.
fs_fighters = pd.concat([fs.Winner, fs.Loser]).value_counts().to_frame()
wins = fs.Winner.value_counts().to_frame()
loses = fs.Loser.value_counts().to_frame()
# Name the columns before merging, so the result does not depend on what
# column label value_counts().to_frame() produces in a given pandas version.
fs_fighters.columns = ['Total']
wins.columns = ['Wins']
loses.columns = ['Losses']
fs_fighters = fs_fighters.merge(wins, left_index=True, right_index=True, how='left')
fs_fighters = fs_fighters.merge(loses, left_index=True, right_index=True, how='left')
# A fighter who never won (or never lost) has no row in that count table.
fs_fighters = fs_fighters.fillna(0.0)
fs_fighters['WinRatio'] = fs_fighters.Wins / (fs_fighters.Wins + fs_fighters.Losses)

In [None]:
# Attach each fighter's overall record: 'Total'/'WinRatio' for the winner and
# 'Total_L'/'WinRatio_L' for the loser.
# fs already has pre-fight 'WinRatio' and 'WinRatio_L' columns. If they are
# left in place, the merge suffixes rename the winner's overall ratio to
# 'WinRatio_y' and leave 'WinRatio' holding the loser's overall ratio, so
# drop them first.
overall = fs_fighters[['Total', 'WinRatio']]
ws = fs.drop(['WinRatio', 'WinRatio_L'], axis=1)
ws = ws.merge(overall, left_on='Winner', right_index=True, how='left')
ws = ws.merge(overall, left_on='Loser', right_index=True, how='left', suffixes=('', '_L'))
ws.head(3)

In [None]:
# Rule: the fighter with the higher overall win ratio wins, restricted to
# bouts where both fighters have more than 5 bouts in total.
# NOTE(review): unlike the other rule checks, equal ratios are not excluded
# here, so ties count against the rule -- confirm this is intended.
ws = ws[pd.notnull(ws.WinRatio) & pd.notnull(ws.WinRatio_L) & (ws.Total > 5) & (ws.Total_L > 5)]
wins = ws[ws.WinRatio > ws.WinRatio_L].shape[0]
total = ws.shape[0]
# The excluded count is measured against fs, as in the other rule checks;
# ws.shape[0] - total is always 0 after the filter above.
wins, total, wins / float(total), fs.shape[0] - total

# Part II: Cleaning, imputing and standardizing

In [None]:
fs.info()

In [None]:
# Age of each fighter on the day of the bout, in years.
# Current pandas rejects division by np.timedelta64(1, 'Y'), so convert whole
# days to years explicitly. 365.2425 days is the length NumPy assigns to the
# 'Y' unit. A missing birth date gives NaN.
days_per_year = 365.2425
fs['Age'] = (fs.Date - fs.Dob).dt.days / days_per_year
fs['Age_L'] = (fs.Date - fs.Dob_L).dt.days / days_per_year

In [None]:
fs.Stance.value_counts()

In [None]:
fs.Stance_L.value_counts()

In [None]:
# Keep bouts where both fighters have one of the three listed stances. The
# copy lets gs be modified independently of fs.
listed_stances = ['Southpaw', 'Orthodox', 'Switch']
winner_listed = fs.Stance.isin(listed_stances)
loser_listed = fs.Stance_L.isin(listed_stances)
gs = fs[winner_listed & loser_listed].copy()

Replace NaN's in win ratios:

In [None]:
# A fighter with no earlier bouts has an undefined (NaN) win ratio; use 0.5.
neutral_ratio = 0.5
gs.WinRatio_L = gs.WinRatio_L.fillna(neutral_ratio)
gs.WinRatio = gs.WinRatio.fillna(neutral_ratio)

In [None]:
# Encode stance as a number: Southpaw -> 1, Orthodox -> 0, and Switch is
# treated as Orthodox (0). One shared mapping is used for both columns; the
# old loser-side mapping had a 'Switch': 2 entry that could never apply,
# because Switch had already been replaced by Orthodox.
stance_code = {'Switch': 0, 'Southpaw': 1, 'Orthodox': 0}
gs.Stance = gs.Stance.replace(stance_code)
gs.Stance_L = gs.Stance_L.replace(stance_code)

In [None]:
#gs.LegReach = gs.LegReach.apply()

In [None]:
# Preview the per-fighter features for winner and loser ('_L'). The LegReach
# columns are left out. An earlier list that included them was overwritten
# before use, so that dead assignment has been removed.
cols = ['Reach', 'Height', 'Stance', 'Num', 'WinRatio', 'Education', 'Age', 'Reach_L', 'Height_L', 'Stance_L', 'Num_L', 'WinRatio_L', 'Education_L', 'Age_L']
gs[cols].head()

     slpm           4026 non-null float64
     str_acc        4026 non-null float64
     sapm           4026 non-null float64
     str_def        4026 non-null float64
     td_avg         4026 non-null float64
     td_acc         4026 non-null float64
     td_def         4026 non-null float64
     sub_avg        4026 non-null float64

In [None]:
gs.info()

In [None]:
# Each feature is the winner's value minus the loser's value, where the
# loser's column carries the '_L' suffix. Entries are
# (new feature name, base column name).
diff_features = [
    ('AgeDiff', 'Age'),
    ('ReachDiff', 'Reach'),
    ('StanceDiff', 'Stance'),
    ('EducationDiff', 'Education'),
    ('WinRatioDiff', 'WinRatio'),
    ('NumFightsDiff', 'Num'),
    ('SApMDiff', 'sapm'),
    ('SLpMDiff', 'slpm'),
    ('TDavgDiff', 'td_avg'),
    ('TDdefDiff', 'td_def'),
    ('SubavgDiff', 'sub_avg'),
]
for diff_name, base in diff_features:
    gs[diff_name] = gs[base] - gs[base + '_L']
# Keep only the difference features, in the order listed above.
gs = gs[[diff_name for diff_name, base in diff_features]]
gs.info()

In [None]:
gs.head()

In [None]:
gs.describe()

In [None]:
# Every row of gs is winner-minus-loser, so the winner is always "first".
# Flip the sign of about half the rows at random and put a label in front:
# 1 for a negated row, 0 for a row left unchanged.
from random import random as rng
shuffled = []
for _, diffs in gs.iterrows():
    flip = rng() < 0.5
    if not flip:
        shuffled.append([0] + list(diffs.values))
    else:
        shuffled.append([1] + list(-1 * diffs.values))

In [None]:
data = pd.DataFrame(shuffled)
data.describe()

In [None]:
# Use every column: the label in column 0 followed by all difference
# features. Earlier hand-picked subsets were overwritten before use and
# referred to column positions (e.g. 14) beyond this table's 12 columns, so
# those dead assignments have been removed.
cols = list(range(data.shape[1]))
# Drop rows with any missing feature.
data = data.loc[:, cols].dropna()
# Sampling all rows without replacement shuffles the row order.
data = data.sample(data.shape[0], replace=False, axis=0)
data.info()

In [None]:
# Scatter the first two feature columns (positions 1 and 2), with a
# different marker for each class label held in column 0.
label_zero = data[data.loc[:, 0] == 0]
label_one = data[data.loc[:, 0] == 1]
plt.scatter(label_zero.iloc[:, 1], label_zero.iloc[:, 2], marker='+')
plt.scatter(label_one.iloc[:, 1], label_one.iloc[:, 2], marker='_')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Decision tree: for 10 different 70/30 train/test splits, tune the tree by
# 10-fold stratified cross-validation on the standardized training set and
# record the accuracy on the held-out test set.
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values
stdsc = StandardScaler()
StatifiedCV = StratifiedKFold(n_splits=10, shuffle=True)
param_grid = dict(max_depth=list(range(1, 10)), criterion=['gini', 'entropy'], splitter=['best', 'random'])
test_accuracies = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    clf = DecisionTreeClassifier(random_state=0)
    grid = GridSearchCV(clf, param_grid, cv=StatifiedCV, scoring='accuracy', refit=True)
    # The scaler is fitted on the training split only and reused for the test split.
    grid.fit(stdsc.fit_transform(X_train), y_train)
    # predict test labels
    y_test_pred = grid.predict(stdsc.transform(X_test))
    # accuracy_score takes (y_true, y_pred).
    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
    test_accuracies.append(test_accuracy)
    # A single pre-formatted argument prints the same on Python 2 and 3.
    print('%s %s %s %s' % (i, grid.best_score_, grid.best_params_, test_accuracy))

In [None]:
np.array(test_accuracies).mean()

Write out the optimal tree to graph file:

In [None]:
# http://webgraphviz.com (paste contents of file here)
#from sklearn import tree
#fn = ['stance1', 'age1', 'stance2', 'age2']
#cn = ['fighter1 wins', 'fighter2 wins']
#clf.set_params(**grid.best_params_).fit(X_train, y_train)
#tree.export_graphviz(clf, out_file='tree.dot', feature_names=fn, class_names=cn)

In [None]:
# uncomment below to see log
#grid.cv_results_

### Evaluation of the classifier

In [None]:
confmat = metrics.confusion_matrix(y_test, y_test_pred)
confmat

In [None]:
# Draw the confusion matrix as a shaded grid with the count written in each
# cell. Rows are true labels and columns are predicted labels.
fig, ax = plt.subplots(figsize=(3, 3))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
n_rows, n_cols = confmat.shape
for i in range(n_rows):
    for j in range(n_cols):
        # matshow puts the column index on x and the row index on y.
        count = confmat[i, j]
        ax.text(x=j, y=i, s=count, va='center', ha='center')
plt.xlabel('Predicted label')
plt.ylabel('True label')

In [None]:
metrics.recall_score(y_test, y_test_pred)

In [None]:
metrics.precision_score(y_test, y_test_pred)

### ROC curve

In [None]:
# store the predicted probabilities for class 1
# The model was fitted on standardized features, so the test features must
# go through the same fitted scaler before scoring.
y_pred_prob = grid.predict_proba(stdsc.transform(X_test))[:, 1]

In [None]:
# ROC curve of the class-1 probabilities on the test set, drawn together
# with the diagonal from (0, 0) to (1, 1) for reference.
roc_points = metrics.roc_curve(y_test, y_pred_prob)
fpr, tpr, thresholds = roc_points
diagonal = [0, 1]
plt.plot(fpr, tpr)
plt.plot(diagonal, diagonal)
plt.xlabel('False positive rate (1 - specificity)')
plt.ylabel('True positive rate (sensitivity)')

In [None]:
metrics.roc_auc_score(y_test, y_pred_prob)

# Other models

### Random Forest

from sklearn.ensemble import RandomForestClassifier

# Random forest: for 10 different 70/30 train/test splits, tune by 10-fold
# stratified cross-validation on the standardized training set and record
# the accuracy on the held-out test set.
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values
stdsc = StandardScaler()
StatifiedCV = StratifiedKFold(n_splits=10, shuffle=True)
param_grid = dict(criterion=['gini', 'entropy'], bootstrap=[True, False])
test_accuracies = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    clf = RandomForestClassifier(n_estimators=100)
    grid = GridSearchCV(clf, param_grid, cv=StatifiedCV, scoring='accuracy', refit=True)
    # The scaler is fitted on the training split only and reused for the test split.
    grid.fit(stdsc.fit_transform(X_train), y_train)
    # predict test labels
    y_test_pred = grid.predict(stdsc.transform(X_test))
    # accuracy_score takes (y_true, y_pred).
    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
    test_accuracies.append(test_accuracy)
    # A single pre-formatted argument prints the same on Python 2 and 3.
    print('%s %s %s %s' % (i, grid.best_score_, grid.best_params_, test_accuracy))

In [None]:
np.array(test_accuracies).mean()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression: for 10 different 70/30 train/test splits, tune C and
# the penalty by 10-fold stratified cross-validation on the standardized
# training set and record the accuracy on the held-out test set.
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values
stdsc = StandardScaler()
StatifiedCV = StratifiedKFold(n_splits=10, shuffle=True)
param_grid = dict(C=np.logspace(-3, 1, base=10, num=20), penalty=['l1', 'l2'])
test_accuracies = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    # The grid includes the 'l1' penalty, which the current default solver
    # does not support. liblinear handles both 'l1' and 'l2'.
    clf = LogisticRegression(solver='liblinear')
    grid = GridSearchCV(clf, param_grid, cv=StatifiedCV, scoring='accuracy', refit=True)
    # The scaler is fitted on the training split only and reused for the test split.
    grid.fit(stdsc.fit_transform(X_train), y_train)
    # predict test labels
    y_test_pred = grid.predict(stdsc.transform(X_test))
    # accuracy_score takes (y_true, y_pred).
    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
    test_accuracies.append(test_accuracy)
    # A single pre-formatted argument prints the same on Python 2 and 3.
    print('%s %s %s %s' % (i, grid.best_score_, grid.best_params_, test_accuracy))

In [None]:
np.array(test_accuracies).mean()

### AdaBoost

from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over entropy decision trees: for 10 different 70/30 train/test
# splits, tune by 10-fold stratified cross-validation on the standardized
# training set and record the accuracy on the held-out test set.
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values
stdsc = StandardScaler()
StatifiedCV = StratifiedKFold(n_splits=10, shuffle=True)
# NOTE(review): newer scikit-learn releases rename the 'base_estimator'
# parameter to 'estimator' -- confirm against the installed version.
param_grid = dict(learning_rate=np.logspace(-4, 0, base=10, num=5),
                  n_estimators=[25, 50, 100],
                  base_estimator__max_depth=[1, 2, 3])
test_accuracies = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    tree = DecisionTreeClassifier(criterion='entropy')
    clf = AdaBoostClassifier(tree)
    grid = GridSearchCV(clf, param_grid, cv=StatifiedCV, scoring='accuracy', refit=True)
    # The scaler is fitted on the training split only and reused for the test split.
    grid.fit(stdsc.fit_transform(X_train), y_train)
    # predict test labels
    y_test_pred = grid.predict(stdsc.transform(X_test))
    # accuracy_score takes (y_true, y_pred).
    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
    test_accuracies.append(test_accuracy)
    # A single pre-formatted argument prints the same on Python 2 and 3.
    print('%s %s %s %s' % (i, grid.best_score_, grid.best_params_, test_accuracy))

In [None]:
np.array(test_accuracies).mean()

### Support Vector Classifier

from sklearn.svm import SVC

# Linear support vector classifier: for 10 different 70/30 train/test
# splits, tune C and shrinking by 10-fold stratified cross-validation on the
# standardized training set and record the accuracy on the held-out test set.
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values
stdsc = StandardScaler()
StatifiedCV = StratifiedKFold(n_splits=10, shuffle=True)
param_grid = [dict(kernel=['linear'],
                   C=np.logspace(-3, 3, base=10, num=15),
                   shrinking=[True, False])]
test_accuracies = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    clf = SVC(probability=True)
    grid = GridSearchCV(clf, param_grid, cv=StatifiedCV, scoring='accuracy', refit=True)
    # The scaler is fitted on the training split only and reused for the test split.
    grid.fit(stdsc.fit_transform(X_train), y_train)
    # predict test labels
    y_test_pred = grid.predict(stdsc.transform(X_test))
    # accuracy_score takes (y_true, y_pred).
    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
    test_accuracies.append(test_accuracy)
    # A single pre-formatted argument prints the same on Python 2 and 3.
    print('%s %s %s %s' % (i, grid.best_score_, grid.best_params_, test_accuracy))

In [None]:
np.array(test_accuracies).mean()

### Bagged Logistic Regression

In [None]:
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

# Bagged logistic regression: for 10 different 70/30 train/test splits, tune
# by 10-fold stratified cross-validation on the standardized training set
# and record the accuracy on the held-out test set.
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values
stdsc = StandardScaler()
StatifiedCV = StratifiedKFold(n_splits=10, shuffle=True)
# NOTE(review): newer scikit-learn releases rename the 'base_estimator'
# parameter to 'estimator' -- confirm against the installed version.
param_grid = dict(n_estimators=[25, 50],
                  bootstrap_features=[True, False],
                  base_estimator__C=np.logspace(-3, 3, base=10, num=15),
                  base_estimator__penalty=['l1', 'l2'])
test_accuracies = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    # The grid includes the 'l1' penalty, which the current default solver
    # does not support. liblinear handles both 'l1' and 'l2'.
    lr = LogisticRegression(solver='liblinear')
    clf = BaggingClassifier(base_estimator=lr)
    grid = GridSearchCV(clf, param_grid, cv=StatifiedCV, scoring='accuracy', refit=True)
    # The scaler is fitted on the training split only and reused for the test split.
    grid.fit(stdsc.fit_transform(X_train), y_train)
    # predict test labels
    y_test_pred = grid.predict(stdsc.transform(X_test))
    # accuracy_score takes (y_true, y_pred).
    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
    test_accuracies.append(test_accuracy)
    # A single pre-formatted argument prints the same on Python 2 and 3.
    print('%s %s %s %s' % (i, grid.best_score_, grid.best_params_, test_accuracy))

In [None]:
np.array(test_accuracies).mean()