In [7]:
import numpy as np
import pandas as pd

In [2]:
# Read one CSV per year from the local data/ directory.
data_2016 = pd.read_csv('data/data_2016.csv')
data_2017 = pd.read_csv('data/data_2017.csv')
data_2018 = pd.read_csv('data/data_2018.csv')

In [5]:
# List the columns of the 2018 table; the feature lists defined next are drawn from these names.
data_2018.columns

Index(['date', 'h_team', 'h_sp_name', 'h_sp_obp', 'h_sp_slg', 'h_sp_gb/fb',
       'h_sp_ld', 'h_sp_k/bb', 'v_team', 'v_sp_name', 'v_sp_obp', 'v_sp_slg',
       'v_sp_gb/fb', 'v_sp_ld', 'v_sp_k/bb', 'h_obp', 'h_slg', 'h_k_rate',
       'h_bb_rate', 'v_obp', 'v_slg', 'v_k_rate', 'v_bb_rate', 'home_win'],
      dtype='object')

In [9]:
# Home-side feature columns: the 'h_sp_*' stats (presumably starting pitcher —
# confirm against the data dictionary) followed by the home team rates.
hcols = [
    'h_sp_obp', 'h_sp_slg', 'h_sp_gb/fb', 'h_sp_ld', 'h_sp_k/bb',
    'h_obp', 'h_slg', 'h_k_rate', 'h_bb_rate',
]
# Visitor-side columns mirror the home ones; only the leading 'h' becomes 'v'.
vcols = ['v' + name[1:] for name in hcols]
# Target column, kept as a one-element list.
ycol = ['home_win']

## Difference Dataset

In [112]:
# Home-minus-visitor difference for every paired feature; the 'h_' prefix of
# the home column name is replaced by 'd_' in the output column name.
diff_data_2018 = pd.DataFrame({
    'd_' + h_name[2:]: data_2018[h_name] - data_2018[v_name]
    for h_name, v_name in zip(hcols, vcols)
})
# Carry the target over unchanged and persist the result without the index.
diff_data_2018[ycol] = data_2018[ycol]
diff_data_2018.to_csv(path_or_buf='data/diff_2018.csv', index=False)

In [113]:
# Home-minus-visitor difference for every paired feature; the 'h_' prefix of
# the home column name is replaced by 'd_' in the output column name.
diff_data_2017 = pd.DataFrame({
    'd_' + h_name[2:]: data_2017[h_name] - data_2017[v_name]
    for h_name, v_name in zip(hcols, vcols)
})
# Carry the target over unchanged and persist the result without the index.
diff_data_2017[ycol] = data_2017[ycol]
diff_data_2017.to_csv(path_or_buf='data/diff_2017.csv', index=False)

In [114]:
# Home-minus-visitor difference for every paired feature; the 'h_' prefix of
# the home column name is replaced by 'd_' in the output column name.
diff_data_2016 = pd.DataFrame({
    'd_' + h_name[2:]: data_2016[h_name] - data_2016[v_name]
    for h_name, v_name in zip(hcols, vcols)
})
# Carry the target over unchanged and persist the result without the index.
diff_data_2016[ycol] = data_2016[ycol]
diff_data_2016.to_csv(path_or_buf='data/diff_2016.csv', index=False)

## Perceptron with maxiters

In [149]:
# 2018 design matrix: the home feature columns followed by the visitor ones.
X = data_2018.loc[:, hcols + vcols].to_numpy()

In [150]:
# 2018 labels; ycol is a list, so the selection is a frame and the array is 2-D (n, 1).
y = data_2018.loc[:, ycol].to_numpy()

In [151]:
# Perceptron convention (+1/-1): recode label 0 as -1; every other label value is kept as-is.
y = np.where(y==0, -1, y)

In [177]:
def perceptron(X, y, maxiters=10000):
    """Fit a bias-free perceptron using random single-sample updates.

    Starting from ``w = 0``, one currently misclassified sample (margin
    ``y_i * <x_i, w> <= 0``) is drawn uniformly at random and ``y_i * x_i``
    is added to ``w``. This repeats until every margin is strictly positive
    or ``maxiters`` updates have been made.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : numpy.ndarray with n_samples entries
        Labels in {+1, -1}; any shape that reshapes to ``(n_samples,)``
        (e.g. ``(n,)`` or ``(n, 1)``).
    maxiters : int, default 10000
        Maximum number of weight updates.

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        The weight vector after the last update. If the iteration cap was
        hit, it is not guaranteed to classify every sample correctly.
    """
    # Flatten the labels once instead of on every iteration.
    labels = y.reshape(X.shape[0])
    w = np.zeros(shape=X.shape[1])
    margin = np.matmul(X, w) * labels
    iters = 0
    # Strict `<` so that at most `maxiters` updates are performed; np.all
    # checks the stopping condition without a Python-level loop.
    while iters < maxiters and not np.all(margin > 0):
        ms = np.random.choice(np.where(margin <= 0)[0], size=1)[0]
        w = w + labels[ms] * X[ms, :]
        margin = np.matmul(X, w) * labels
        iters += 1
    return w

In [178]:
# Fit the perceptron on X, y with the default iteration cap (maxiters=10000).
w = perceptron(X, y)

In [179]:
# Signed margin y_i * (x_i . w) per sample; a positive value means the sample is classified correctly.
margin = (X @ w) * y.reshape(X.shape[0])

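With 5000 iterations, the training error is about 50%. (The next cell reports the training accuracy, i.e. the fraction of samples with a positive margin.)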

In [180]:
# Training accuracy: fraction of samples with a strictly positive margin.
# np.mean divides by the actual sample count rather than a hard-coded 1819.
np.mean(margin > 0)

0.5398570643210555

### Try diff data

In [181]:
# Features are the leading difference columns, one per entry of hcols; tying
# the slice to len(hcols) avoids a hard-coded column count.
X = diff_data_2018.iloc[:, :len(hcols)].to_numpy()
y = diff_data_2018['home_win'].to_numpy()
# Perceptron convention (+1/-1): recode label 0 as -1.
y = np.where(y==0, -1, y)

In [183]:
# Fit on the difference features and report the training accuracy.
w = perceptron(X, y, maxiters=10000)
margin = np.multiply(np.matmul(X, w), y.reshape(X.shape[0]))
# Fraction of samples with a positive margin; np.mean divides by the actual
# sample count rather than a hard-coded 1819.
np.mean(margin > 0)

0.5393073117097307

No improvement.

# Formal modeling

## Perceptron on all data, Train 80%, Test 20%.

In [224]:
# Stack the three seasons. ignore_index=True gives the combined frame a fresh
# 0..n-1 index instead of repeating each season's own row labels.
all_data = pd.concat([data_2016, data_2017, data_2018], ignore_index=True)

In [236]:
n = all_data.shape[0]
# Random 80% of the row positions, drawn without replacement, for training.
train = list(np.random.choice(np.arange(n), size = round(n*0.80), replace = False))
# Remaining positions in ascending order. Membership is tested against a set,
# which keeps this linear instead of scanning the train list for every row.
train_lookup = {int(i) for i in train}
test = [i for i in range(n) if i not in train_lookup]

In [237]:
# Rows of the combined data at the sampled training positions.
train_set = all_data.iloc[train]

In [238]:
# Rows of the combined data at the held-out test positions.
test_set = all_data.iloc[test]

In [239]:
X_train = train_set[hcols + vcols].to_numpy()
y_train = train_set['home_win'].to_numpy()
# Perceptron convention (+1/-1): recode label 0 as -1.
y_train = np.where(y_train==0, -1, y_train)
w = perceptron(X_train, y_train, maxiters=10000)
margin_train = np.multiply(np.matmul(X_train, w), y_train.reshape(X_train.shape[0]))
# Training accuracy (fraction of samples with a positive margin), not the
# error rate; the error rate is 1 minus this value.
np.mean(margin_train > 0)

0.5458006718924971

In [240]:
X_test = test_set[hcols + vcols].to_numpy()
y_test = test_set['home_win'].to_numpy()
# Perceptron convention (+1/-1): recode label 0 as -1.
y_test = np.where(y_test==0, -1, y_test)
# Margins on the held-out rows, using the weights w fitted on the training rows.
margin_test = np.multiply(np.matmul(X_test, w), y_test.reshape(X_test.shape[0]))
# Test accuracy (fraction of samples with a positive margin), not the error
# rate; the error rate is 1 minus this value.
np.mean(margin_test > 0)

0.5053763440860215

### Cross-Validation: Perceptron

In [244]:
# Repeated random-split validation: each repetition draws a NEW 80/20 split,
# fits the perceptron on the 80% and scores both parts. Reusing one fixed split
# for every repetition would only measure the perceptron's own randomness.
n_obs = all_data.shape[0]
n_fit = round(n_obs*0.80)
feature_cols = hcols + vcols

train_rates = []
test_rates = []
for _ in range(100):
    # Fresh shuffle of the row positions; the first n_fit go to training.
    order = np.random.permutation(n_obs)
    train_set = all_data.iloc[order[:n_fit], :]
    test_set = all_data.iloc[order[n_fit:], :]

    X_train = train_set[feature_cols].to_numpy()
    y_train = train_set['home_win'].to_numpy()
    y_train = np.where(y_train==0, -1, y_train) # perceptron context: +1/-1
    w = perceptron(X_train, y_train, maxiters=10000)

    margin_train = np.multiply(np.matmul(X_train, w), y_train.reshape(X_train.shape[0]))
    # train accuracy: fraction of training rows with a positive margin
    train_rates.append(np.mean(margin_train > 0))

    X_test = test_set[feature_cols].to_numpy()
    y_test = test_set['home_win'].to_numpy()
    y_test = np.where(y_test==0, -1, y_test) # perceptron context: +1/-1

    margin_test = np.multiply(np.matmul(X_test, w), y_test.reshape(X_test.shape[0]))
    # test accuracy: fraction of held-out rows with a positive margin
    test_rates.append(np.mean(margin_test > 0))

# Mean accuracy over the repetitions (train first, then test).
print(np.mean(train_rates))
print(np.mean(test_rates))

0.5061388577827547
0.5070698924731183
