# Import Libraries

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats

## Data Processing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Data Modeling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

## Machine Learning Algos
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb

##
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, multilabel_confusion_matrix, classification_report

# Import data

In [2]:
df = pd.read_csv("pitches.csv.zip")

# Data Cleaning

In [3]:
df.head()

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,...,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
0,0.416,2.963,92.9,84.1,2305.052,159.235,-25.0,3.2,23.7,7.665,...,3,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.191,2.347,92.8,84.1,2689.935,151.402,-40.7,3.4,23.7,12.043,...,4,0.0,2015000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
2,-0.518,3.284,94.1,85.2,2647.972,145.125,-43.7,3.7,23.7,14.368,...,5,0.0,2015000000.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0
3,-0.641,1.221,91.0,84.0,1289.59,169.751,-1.3,5.0,23.8,2.104,...,6,0.0,2015000000.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0
4,-1.821,2.083,75.4,69.6,1374.569,280.671,18.4,12.0,23.8,-10.28,...,7,0.0,2015000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0


In [4]:
df.shape

(2867154, 40)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2867154 entries, 0 to 2867153
Data columns (total 40 columns):
px                 float64
pz                 float64
start_speed        float64
end_speed          float64
spin_rate          float64
spin_dir           float64
break_angle        float64
break_length       float64
break_y            float64
ax                 float64
ay                 float64
az                 float64
sz_bot             float64
sz_top             float64
type_confidence    float64
vx0                float64
vy0                float64
vz0                float64
x                  float64
x0                 float64
y                  float64
y0                 float64
z0                 float64
pfx_x              float64
pfx_z              float64
nasty              float64
zone               float64
code               object
type               object
pitch_type         object
event_num          int64
b_score            float64
ab_id              float64
b

In [6]:
df.describe()

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,...,event_num,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b
count,2852965.0,2852965.0,2853040.0,2853040.0,2852965.0,2852965.0,2852965.0,2852965.0,2852965.0,2852965.0,...,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0,2867154.0
mean,0.006572502,2.254962,88.38124,81.36274,1731.173,180.2308,5.848084,6.601459,23.81685,-2.308512,...,312.9711,2.256608,2016606000.0,0.8824259,0.8810873,0.9826033,2.894472,0.3018314,0.1860451,0.09595543
std,0.892749,0.9463968,6.01554,5.364057,682.7521,67.42859,24.43621,133.1708,0.0644654,10.74066,...,192.8482,2.550955,1117238.0,0.9665457,0.8251786,0.8174947,1.726595,0.4590526,0.3891431,0.2945302
min,-10.54333,-5.183664,33.9,32.4,1.214,-0.002,-90.0,0.1,23.3,-59.29009,...,3.0,0.0,2015000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,-0.5908243,1.65,84.3,77.9,1238.745,137.154,-11.9,4.3,23.8,-11.29,...,149.0,0.0,2016004000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.013,2.259,89.7,82.5,1866.322,195.885,6.8,6.0,23.8,-3.142,...,302.0,1.0,2017004000.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0
75%,0.609,2.863,93.0,85.4,2247.564,225.234,27.0,8.0,23.9,5.995827,...,464.0,3.0,2018002000.0,2.0,2.0,2.0,4.0,1.0,0.0,0.0
max,12.95291,14.88624,105.0,96.9,6539.259,360.001,269.4,224889.3,36.4,40.978,...,1336.0,25.0,2018186000.0,4.0,2.0,2.0,21.0,1.0,1.0,1.0


In [7]:
df.isnull().sum()

px                 14189
pz                 14189
start_speed        14114
end_speed          14114
spin_rate          14189
spin_dir           14189
break_angle        14189
break_length       14189
break_y            14189
ax                 14189
ay                 14189
az                 14189
sz_bot              2083
sz_top              2083
type_confidence    14189
vx0                14189
vy0                14189
vz0                14189
x                      0
x0                 14189
y                      0
y0                 14189
z0                 14189
pfx_x              14142
pfx_z              14142
nasty              14189
zone               14189
code                5719
type                   0
pitch_type         14189
event_num              0
b_score                0
ab_id                  0
b_count                0
s_count                0
outs                   0
pitch_num              0
on_1b                  0
on_2b                  0
on_3b                  0


In [8]:
df = df.dropna(how='any', axis=0)

In [9]:
df.shape

(2847250, 40)

In [10]:
df['pitch_type'].value_counts()

FF    1012917
SL     449804
FT     337244
CH     292123
SI     242047
CU     233913
FC     149376
KC      66410
FS      43533
KN      11260
IN       6187
EP        812
FO        810
PO        626
SC        113
UN         57
FA          9
AB          9
Name: pitch_type, dtype: int64

In [11]:
df['year'] = df.ab_id.astype(str)

In [12]:
new_df = df[df['year'].str.startswith('2018')]

In [13]:
new_df.head()

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,...,b_score,ab_id,b_count,s_count,outs,pitch_num,on_1b,on_2b,on_3b,year
2142710,0.239615,1.848781,93.7,84.5,2632.27,233.718,43.7,5.8,23.7,-19.911162,...,0.0,2018000000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2018000001.0
2142711,-0.804047,0.925201,94.0,85.8,2561.181,236.246,42.7,6.1,23.8,-20.043108,...,0.0,2018000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,2018000001.0
2142712,0.598637,1.616677,94.7,86.5,2673.854,237.996,44.4,6.0,23.8,-21.510485,...,0.0,2018000000.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,2018000001.0
2142713,1.453223,1.466314,82.3,76.7,959.909,38.861,-8.2,10.8,23.9,4.974062,...,0.0,2018000000.0,1.0,2.0,0.0,4.0,0.0,0.0,0.0,2018000001.0
2142714,-0.427914,2.490041,95.4,87.1,2408.148,241.08,40.9,6.1,23.8,-20.172978,...,0.0,2018000000.0,1.0,2.0,0.0,5.0,0.0,0.0,0.0,2018000001.0


In [14]:
new_df.shape

(718322, 41)

In [15]:
new_df['pitch_type'].value_counts()

FF    254086
SL    122796
FT     81617
CH     74606
CU     60360
SI     57970
FC     39061
KC     16327
FS     10365
KN       719
EP       204
PO        89
FO        84
SC        37
AB         1
Name: pitch_type, dtype: int64

In [16]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='FF'].index)

In [17]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='FT'].index)

In [18]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='SI'].index)

In [19]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='FC'].index)

In [20]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='KN'].index)

In [21]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='EP'].index)

In [22]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='PO'].index)

In [23]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='FO'].index)

In [24]:
new_df = new_df.drop(new_df.loc[new_df['pitch_type']=='SC'].index)

In [25]:
final_df = new_df.drop(new_df.loc[new_df['pitch_type']=='AB'].index)

In [26]:
final_df['pitch_type'].value_counts()

SL    122796
CH     74606
CU     60360
KC     16327
FS     10365
Name: pitch_type, dtype: int64

In [27]:
final_df.drop(['code', 'type', 'event_num', 'b_score', 'b_count', 's_count', 
              'outs', 'pitch_num', 'on_1b', 'on_2b', 'on_3b'], axis=1, inplace=True)

In [28]:
final_df.head()

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,...,y,y0,z0,pfx_x,pfx_z,nasty,zone,pitch_type,ab_id,year
2142713,1.453223,1.466314,82.3,76.7,959.909,38.861,-8.2,10.8,23.9,4.974062,...,199.22,50.0,5.768441,3.397188,-4.215796,22.0,14.0,SL,2018000000.0,2018000001.0
2142728,0.747412,0.729438,91.0,84.9,180.473,311.036,0.6,7.8,23.9,-1.241472,...,219.14,50.0,6.052319,-0.692269,-0.60251,32.0,14.0,SL,2018000000.0,2018000006.0
2142729,0.605145,1.997796,90.5,83.8,575.016,348.328,0.2,8.7,23.9,-1.055874,...,184.92,50.0,6.148478,-0.596179,-2.885365,36.0,9.0,SL,2018000000.0,2018000006.0
2142730,1.291387,0.060647,82.8,75.7,2048.993,9.642,-4.4,14.3,23.8,2.842411,...,235.81,50.0,6.269245,2.001122,-11.776095,29.0,14.0,CU,2018000000.0,2018000006.0
2142732,1.046822,1.451463,90.9,83.6,157.794,140.496,-3.7,7.4,23.8,0.913876,...,199.59,50.0,6.123433,0.51733,0.627479,82.0,14.0,SL,2018000000.0,2018000007.0


In [29]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 284454 entries, 2142713 to 2867149
Data columns (total 30 columns):
px                 284454 non-null float64
pz                 284454 non-null float64
start_speed        284454 non-null float64
end_speed          284454 non-null float64
spin_rate          284454 non-null float64
spin_dir           284454 non-null float64
break_angle        284454 non-null float64
break_length       284454 non-null float64
break_y            284454 non-null float64
ax                 284454 non-null float64
ay                 284454 non-null float64
az                 284454 non-null float64
sz_bot             284454 non-null float64
sz_top             284454 non-null float64
type_confidence    284454 non-null float64
vx0                284454 non-null float64
vy0                284454 non-null float64
vz0                284454 non-null float64
x                  284454 non-null float64
x0                 284454 non-null float64
y                  284

In [30]:
def bin_pitches(single_pitch):
    
    offspeed = ['CH', 'FS']
    breaking = ['SL', 'CU', 'KC']
    
    if single_pitch in offspeed:
        return 0
    else: 
        return 1
    

In [31]:
final_df['bin_pitches'] = final_df.pitch_type.apply(bin_pitches)

In [32]:
final_df['bin_pitches'].value_counts()

1    199483
0     84971
Name: bin_pitches, dtype: int64

In [33]:
final_df[['pitch_type', 'bin_pitches']].drop_duplicates(keep='first')

Unnamed: 0,pitch_type,bin_pitches
2142713,SL,1
2142730,CU,1
2142753,CH,0
2142905,FS,0
2143373,KC,1


In [34]:
final_df.shape

(284454, 31)

In [35]:
final_df.duplicated().sum()

0

In [36]:
final_df.drop(['pitch_type', 'ab_id', 'year'], axis=1, inplace=True)

In [37]:
features = final_df.drop(['bin_pitches'], axis=1)

In [38]:
features[(np.abs(stats.zscore(final_df)) < 3).all(axis=1)]

Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,...,vz0,x,x0,y,y0,z0,pfx_x,pfx_z,nasty,zone
2142713,1.453223,1.466314,82.3,76.7,959.909,38.861,-8.2,10.8,23.9,4.974062,...,-2.125677,61.60,-0.732282,199.22,50.0,5.768441,3.397188,-4.215796,22.0,14.0
2142728,0.747412,0.729438,91.0,84.9,180.473,311.036,0.6,7.8,23.9,-1.241472,...,-7.636481,88.51,-1.269673,219.14,50.0,6.052319,-0.692269,-0.602510,32.0,14.0
2142729,0.605145,1.997796,90.5,83.8,575.016,348.328,0.2,8.7,23.9,-1.055874,...,-3.683753,93.93,-1.385409,184.92,50.0,6.148478,-0.596179,-2.885365,36.0,9.0
2142730,1.291387,0.060647,82.8,75.7,2048.993,9.642,-4.4,14.3,23.8,2.842411,...,-4.120138,67.73,-1.179056,235.81,50.0,6.269245,2.001122,-11.776095,29.0,14.0
2142732,1.046822,1.451463,90.9,83.6,157.794,140.496,-3.7,7.4,23.8,0.913876,...,-6.226013,77.10,-1.430667,199.59,50.0,6.123433,0.517330,0.627479,82.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2867132,-0.212890,1.587859,80.0,74.0,1209.602,327.322,7.9,12.2,23.9,-5.233581,...,-1.584788,125.12,1.490938,195.93,50.0,6.118648,-3.837512,-5.982271,36.0,13.0
2867134,-1.054564,0.142382,84.7,78.0,588.915,239.158,11.0,8.3,23.9,-4.283495,...,-5.315316,157.20,3.340541,234.94,50.0,4.848164,-2.800499,1.672213,36.0,13.0
2867139,0.676805,3.487355,83.0,77.0,447.145,225.040,7.0,7.9,23.9,-2.635429,...,2.278726,91.20,3.529359,144.63,50.0,5.094116,-1.748581,1.746085,49.0,12.0
2867141,-0.107080,0.886212,84.3,78.4,436.328,198.885,4.6,7.7,23.9,-1.194240,...,-3.970365,121.07,3.237666,214.77,50.0,4.933055,-0.772864,2.259083,42.0,13.0


# EDA

In [None]:
# distribution of bin_pitches, how varies w/ features

In [None]:
# sns.pairplot

# Train_Test Split

In [39]:
X = features
y = final_df['bin_pitches']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

In [40]:
X_train.shape, y_train.shape, X_test.shape

((199117, 27), (199117,), (85337, 27))

In [43]:
X_train = X_train.select_dtypes(include='float')

### Reduce memory usage

In [44]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [45]:
reduce_mem_usage(X_train)

Mem. usage decreased to 11.77 Mb (72.3% reduction)


Unnamed: 0,px,pz,start_speed,end_speed,spin_rate,spin_dir,break_angle,break_length,break_y,ax,...,vz0,x,x0,y,y0,z0,pfx_x,pfx_z,nasty,zone
2485809,0.291992,1.803711,85.6250,78.6250,1954.00,123.1875,-28.093750,6.898438,23.796875,14.023438,...,-4.242188,105.8750,2.099609,190.125,50.0,5.425781,8.906250,5.832031,51.0,9.0
2347463,0.104614,1.081055,82.8125,76.8750,298.50,346.5000,2.000000,9.601562,23.906250,-0.575684,...,-2.951172,113.0000,1.924805,209.625,50.0,5.359375,-0.388916,-1.629883,42.0,14.0
2424822,-0.287842,1.731445,83.3750,77.6875,972.50,263.5000,13.101562,8.703125,23.906250,-8.093750,...,-2.968750,128.0000,-1.472656,192.000,50.0,5.648438,-5.343750,0.613281,30.0,7.0
2771527,-0.228882,2.207031,90.5000,84.1875,1531.00,176.2500,-1.200195,4.398438,23.906250,0.924316,...,-6.554688,125.7500,0.977539,179.125,50.0,6.027344,0.514160,7.722656,51.0,5.0
2460694,-0.372559,1.829102,82.8750,75.8750,1878.00,235.3750,26.000000,7.500000,23.796875,-12.835938,...,-5.824219,131.2500,-1.413086,189.375,50.0,6.343750,-8.742188,6.027344,37.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2230560,0.165161,2.306641,88.6875,80.6250,2306.00,229.1250,34.187500,5.898438,23.796875,-15.453125,...,-5.671875,110.6875,-1.997070,176.500,50.0,6.011719,-9.289062,8.046875,14.0,5.0
2361115,0.621094,0.986328,85.1250,78.1250,144.00,162.0000,-2.800781,8.500000,23.796875,0.379883,...,-6.371094,93.3125,-2.091797,212.250,50.0,6.218750,0.246826,0.757812,28.0,14.0
2390036,-0.189331,3.890625,81.3125,75.1875,211.75,107.6250,-3.300781,8.796875,23.906250,1.648438,...,2.218750,124.1875,-0.780762,133.625,50.0,5.808594,1.144531,0.363525,45.0,11.0
2663605,0.122192,2.552734,90.0000,83.3125,713.50,152.7500,-7.699219,6.199219,23.796875,2.955078,...,-1.743164,112.3125,-1.546875,169.875,50.0,5.183594,1.682617,3.263672,16.0,2.0


# Vanilla Models

### Logistic Regression

In [46]:
%%time

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_hat_train_log = logreg.predict(X_train)



CPU times: user 10.7 s, sys: 253 ms, total: 11 s
Wall time: 11 s


In [47]:
log_acc = accuracy_score(y_train, y_hat_train_log)
log_pre = precision_score(y_train, y_hat_train_log)
log_rec = recall_score(y_train, y_hat_train_log)
log_f1 = f1_score(y_train, y_hat_train_log)

print(f'Accuracy: {log_acc}')
print(f'Precision: {log_pre}')
print(f'Recall: {log_rec}')
print(f'F1: {log_f1}')

Accuracy: 0.9405676059804035
Precision: 0.9524823799145178
Recall: 0.9633624391640424
F1: 0.9578915156777067


In [52]:
cv_5_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=5))
# cv_10_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=10))
# cv_15_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=15))
print('CV5: ', cv_5_results)
# print('CV10: ', cv_10_results)
# print('CV15: ', cv_15_results)



CV5:  0.9411401406174107


### Gaussian Naive Bayes

In [53]:
%%time

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
y_hat_train_gnb = gaussian.predict(X_train)

  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(x, axis, dtype, out, keepdims)


CPU times: user 677 ms, sys: 76.6 ms, total: 753 ms
Wall time: 766 ms


In [54]:
gnb_acc = accuracy_score(y_train, y_hat_train_gnb)
gnb_pre = precision_score(y_train, y_hat_train_gnb)
gnb_rec = recall_score(y_train, y_hat_train_gnb)
gnb_f1 = f1_score(y_train, y_hat_train_gnb)

print(f'Accuracy: {gnb_acc}')
print(f'Precision: {gnb_pre}')
print(f'Recall: {gnb_rec}')
print(f'F1: {gnb_f1}')

Accuracy: 0.29830200334476714
Precision: 0.0
Recall: 0.0
F1: 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [69]:
cv_5_results = np.mean(cross_val_score(gaussian, X_train, y_train, cv=5))
# cv_10_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=10))
# cv_15_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=15))
print('CV5: ', cv_5_results)
# print('CV10: ', cv_10_results)
# print('CV15: ', cv_15_results)

  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(x, axis, dtype, out, keepdims)


CV5:  0.29830200323857736


### Random Forest

In [55]:
%%time

random_forest = RandomForestClassifier(n_estimators=5)
random_forest.fit(X_train, y_train)
y_hat_train_rf = random_forest.predict(X_train)

CPU times: user 5.47 s, sys: 91.2 ms, total: 5.56 s
Wall time: 5.62 s


In [56]:
rf_acc = accuracy_score(y_train, y_hat_train_rf)
rf_pre = precision_score(y_train, y_hat_train_rf)
rf_rec = recall_score(y_train, y_hat_train_rf)
rf_f1 = f1_score(y_train, y_hat_train_rf)

print(f'Accuracy: {rf_acc}')
print(f'Precision: {rf_pre}')
print(f'Recall: {rf_rec}')
print(f'F1: {rf_f1}')

Accuracy: 0.9986289468001225
Precision: 0.9988266857449473
Recall: 0.9992198683080447
F1: 0.9990232383405785


In [60]:
cv_5_results = np.mean(cross_val_score(random_forest, X_train, y_train, cv=5))
# cv_10_results = np.mean(cross_val_score(random_forest, X_train, y_train, cv=10))
# cv_15_results = np.mean(cross_val_score(random_forest, X_train, y_train, cv=15))
print('CV5: ', cv_5_results)
# print('CV10: ', cv_10_results)
# print('CV15: ', cv_15_results)

CV5:  0.9902117821793336


### AdaBoosting

In [61]:
%%time

ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
y_hat_train_ada = ada.predict(X_train)

CPU times: user 31.1 s, sys: 1.42 s, total: 32.5 s
Wall time: 32.7 s


In [62]:
ada_acc = accuracy_score(y_train, y_hat_train_ada)
ada_pre = precision_score(y_train, y_hat_train_ada)
ada_rec = recall_score(y_train, y_hat_train_ada)
ada_f1 = f1_score(y_train, y_hat_train_ada)

print(f'Accuracy: {ada_acc}')
print(f'Precision: {ada_pre}')
print(f'Recall: {ada_rec}')
print(f'F1: {ada_f1}')

Accuracy: 0.9453386702290613
Precision: 0.9604772184654096
Recall: 0.9616733466933868
F1: 0.961074910412211


In [66]:
cv_5_results = np.mean(cross_val_score(ada, X_train, y_train, cv=5))
# cv_10_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=10))
# cv_15_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=15))
print('CV5: ', cv_5_results)
# print('CV10: ', cv_10_results)
# print('CV15: ', cv_15_results)

CV5:  0.9447761869323182


### XG Boost

In [64]:
%time

xgb = xgb.XGBClassifier()
xgb.fit(X_train, y_train)
y_hat_train_xgb = xgb.predict(X_train)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 10 µs


In [65]:
xgb_acc = accuracy_score(y_train, y_hat_train_xgb)
xgb_pre = precision_score(y_train, y_hat_train_xgb)
xgb_rec = recall_score(y_train, y_hat_train_xgb)
xgb_f1 = f1_score(y_train, y_hat_train_xgb)

print(f'Accuracy: {xgb_acc}')
print(f'Precision: {xgb_pre}')
print(f'Recall: {xgb_rec}')
print(f'F1: {xgb_f1}')

Accuracy: 0.9895639247276726
Precision: 0.9902688532064342
Recall: 0.9949040939020899
F1: 0.9925810620720187


In [67]:
cv_5_results = np.mean(cross_val_score(xgb, X_train, y_train, cv=5))
# cv_10_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=10))
# cv_15_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=15))
print('CV5: ', cv_5_results)
# print('CV10: ', cv_10_results)
# print('CV15: ', cv_15_results)

CV5:  0.9889160610429959


### K-Nearest Neighbors

In [72]:
%time

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_hat_train_knn = knn.predict(X_train)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


In [73]:
knn_acc = accuracy_score(y_train, y_hat_train_knn)
knn_pre = precision_score(y_train, y_hat_train_knn)
knn_rec = recall_score(y_train, y_hat_train_knn)
knn_f1 = f1_score(y_train, y_hat_train_knn)

print(f'Accuracy: {knn_acc}')
print(f'Precision: {knn_pre}')
print(f'Recall: {knn_rec}')
print(f'F1: {knn_f1}')

Accuracy: 0.9719662309094653
Precision: 0.978790691033695
Recall: 0.981312625250501
F1: 0.9800500357398141


In [74]:
cv_5_results = np.mean(cross_val_score(knn, X_train, y_train, cv=5))
# cv_10_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=10))
# cv_15_results = np.mean(cross_val_score(logreg, X_train, y_train, cv=15))
print('CV5: ', cv_5_results)
# print('CV10: ', cv_10_results)
# print('CV15: ', cv_15_results)

CV5:  0.9595212884914501


### Compare Vanilla Models

In [77]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Naive Bayes', 'Random Forest', 
              'AdaBoost', 'XG Boost', 'KNN'],
    'Score': [log_f1, gnb_f1, rf_f1, ada_f1, 
             xgb_f1, knn_f1]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
2,Random Forest,0.999023
4,XG Boost,0.992581
5,KNN,0.98005
3,AdaBoost,0.961075
0,Logistic Regression,0.957892
1,Naive Bayes,0.0


## Hyperparameter Tuning

### Validation Set