In [1]:
import pandas as pd, numpy as np

In [2]:
# count how many unique games the 30 handicappers bet on
# bet could be any from ML, Spread, or Total
Games = set()
for i in range(1,31): 
    df = pd.read_pickle('HC' + str(i) + '.gz', compression='gzip')
    for each in df.GameKey.unique():
        Games.add(str(each))
df = []
len(Games)

38583

In [3]:
# count how many unique games with at least one ML bet
MLGames = set()
for i in range(1,31): 
    df = pd.read_pickle('HC' + str(i) + '.gz', compression='gzip')
    for each in df[df['Bet']=='MONEY LINE'].GameKey.unique():
        MLGames.add(str(each))
df = []
len(MLGames)

15675

In [4]:
# merge hadicappers' history bets into one dataframe
names = [
    'Smart',
    'Bartley',
    'Aronson',
    'Burns',
    'Barone', # not in Ji's original list
    'Bitler',
    'Power',
    'Ross',
    'Diamond',
    'Trapp',
    'Sports',
    'Eddie',
    'Schule',
    'DAmico',
    'Duffy',
    'Thomas',
    'Hunter',
    'Compeau', # not in Ji's original list
    'Syndicate',
    'Lundin',
    'Simulator',
    'Wilson',
    'Monohan',
    'Vinceletti',
    'Rickenbach',
    'Higgs',
    'Nover',
    'Brown',
    'Karpinski',
    'Rogers'
    ]

df = pd.DataFrame()
for i in range(1,31): 
    newdf = pd.read_pickle('HC' + str(i) + '.gz', compression='gzip')
    
    # if you want the last name of the HC, switch to use names array
    #newdf['Handicapper'] = names[i-1]
    newdf['Handicapper'] = 'HC'+str(i)
    df = df.append(newdf, ignore_index=True)

In [5]:
# count how many unique games with at least one ML bet
len(df[df['Bet']=='MONEY LINE'].GameKey.unique())

15675

In [6]:
# find ML games where handicapper betted more than once
bet2ormore = df[df['Bet']=='MONEY LINE'].groupby(['GameKey', 'Handicapper'])['MLBet'].size().ge(2).to_frame()
print(len(bet2ormore[bet2ormore['MLBet']==True]))
bet2ormore[bet2ormore['MLBet']==True].tail()

174


Unnamed: 0_level_0,Unnamed: 1_level_0,MLBet
GameKey,Handicapper,Unnamed: 2_level_1
f8c44734476443539691ee36eb3499a0,HC29,True
f8e227443dea559f77fbec46d8bb7dbb,HC1,True
f8e63b42704e8d8c8399cc8992a5ce6a,HC13,True
fafe521faee1b20d4b321f774d6d61f7,HC24,True
fefccb38e93fbb587bda3daf23c38d01,HC15,True


In [7]:
# here's an example
df[(df['GameKey']=='f8c44734476443539691ee36eb3499a0') & (df['Bet']=='MONEY LINE') & (df['Handicapper']=='HC29')]

Unnamed: 0,League,GameKey,GameTime,Home,Away,Bet,On,Juice,Type,Result,MLBet,MLWinner,Handicapper
126913,MLB,f8c44734476443539691ee36eb3499a0,2017-06-29 15:40:00,ARIZONA,ST. LOUIS,MONEY LINE,ARIZONA,-120,Free,Loss,Home,Away,HC29
126914,MLB,f8c44734476443539691ee36eb3499a0,2017-06-29 15:40:00,ARIZONA,ST. LOUIS,MONEY LINE,ARIZONA,-120,Free,Loss,Home,Away,HC29


In [8]:
# it looks like there's duplicates in the data, so we will take the first instance
ml_bets_all_hcs = df[df['Bet']=='MONEY LINE'].pivot_table(values=['MLBet'], 
                                                          columns='Handicapper', 
                                                          index=['GameKey'], aggfunc='first')

# drop multiindex level 0
ml_bets_all_hcs.columns = ml_bets_all_hcs.columns.droplevel(0)

ml_bets_all_hcs.head()

Handicapper,HC1,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,...,HC28,HC29,HC3,HC30,HC4,HC5,HC6,HC7,HC8,HC9
GameKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000114585a97f69db2c960000728ae62,,,,,,,,,,Away,...,,,,,,,Home,,,
0001a65e2bb316fd67a35d4afaf844cd,Away,,,,,,Away,,Away,,...,,,,,,,,,,Home
00098000b6b4724326e69e64d74e4e08,,,,,Home,,,,,,...,,,,,,,,,,
0011c58643e0ee75496b3d37db68c264,,,,,,,,,,,...,,,Home,,,,,,,Home
0019a51d563d00747068c03b6e0b2402,,,Away,,,,,,,,...,,,,Home,,,,,,


In [9]:
ml_juice_all_hcs = df[df['Bet']=='MONEY LINE'].pivot_table(values=['Juice'], 
                                                           columns='MLBet', 
                                                           index=['GameKey'], aggfunc='max')

ml_juice_all_hcs.columns = ml_juice_all_hcs.columns.droplevel(0)
ml_juice_all_hcs.columns.name = 'Juice'
ml_juice_all_hcs.columns = ['Juice_Away', 'Juice_Home']
ml_juice_all_hcs.head()

Unnamed: 0_level_0,Juice_Away,Juice_Home
GameKey,Unnamed: 1_level_1,Unnamed: 2_level_1
000114585a97f69db2c960000728ae62,127.0,-127
0001a65e2bb316fd67a35d4afaf844cd,120.0,-118
00098000b6b4724326e69e64d74e4e08,,-140
0011c58643e0ee75496b3d37db68c264,,-170
0019a51d563d00747068c03b6e0b2402,104.0,-115


In [10]:
ml_game_info = df[df['Bet']=='MONEY LINE'].pivot_table(values=['GameTime', 'League','Home', 'Away', 'MLWinner'], 
                                                       index=['GameKey'], aggfunc='first')
ml_game_info.head()

Unnamed: 0_level_0,Away,GameTime,Home,League,MLWinner
GameKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000114585a97f69db2c960000728ae62,LOS ANGELES,2018-02-13 19:00:00,CAROLINA,NHL,Home
0001a65e2bb316fd67a35d4afaf844cd,ATLANTA,2013-08-06 19:05:00,WASHINGTON,MLB,Away
00098000b6b4724326e69e64d74e4e08,WINNIPEG,2017-03-16 19:00:00,NY ISLANDERS,NHL,Away
0011c58643e0ee75496b3d37db68c264,CALGARY,2015-01-19 22:30:00,LOS ANGELES,NHL,Away
0019a51d563d00747068c03b6e0b2402,MINNESOTA,2014-11-28 20:30:00,DALLAS,NHL,Away


In [11]:
columns = ['League', 'GameTime', 'Away', 'Juice_Away', 'Home', 'Juice_Home', 'MLWinner',
           'HC1', 'HC2', 'HC3', 'HC4', 'HC5', 'HC6', 'HC7', 'HC8', 'HC9', 'HC10',
           'HC11', 'HC12', 'HC13', 'HC14', 'HC15', 'HC16', 'HC17', 'HC18', 'HC19', 'HC20', 
           'HC21', 'HC22', 'HC23', 'HC24', 'HC25', 'HC26', 'HC27', 'HC28', 'HC29', 'HC30']
ml_bets = ml_game_info.join(ml_juice_all_hcs).join(ml_bets_all_hcs)[columns].sort_values(by=['GameTime'])

In [12]:
# if juice is empty, put in 100 
ml_bets.Juice_Away.fillna(100, inplace=True)
ml_bets.Juice_Home.fillna(100, inplace=True)

# conver juice to int64
ml_bets.Juice_Away = pd.to_numeric(ml_bets.Juice_Away)
ml_bets.Juice_Home = pd.to_numeric(ml_bets.Juice_Home)

ml_bets.dtypes

League                object
GameTime      datetime64[ns]
Away                  object
Juice_Away             int64
Home                  object
Juice_Home             int64
MLWinner              object
HC1                   object
HC2                   object
HC3                   object
HC4                   object
HC5                   object
HC6                   object
HC7                   object
HC8                   object
HC9                   object
HC10                  object
HC11                  object
HC12                  object
HC13                  object
HC14                  object
HC15                  object
HC16                  object
HC17                  object
HC18                  object
HC19                  object
HC20                  object
HC21                  object
HC22                  object
HC23                  object
HC24                  object
HC25                  object
HC26                  object
HC27                  object
HC28          

In [13]:
ml_bets.head()

Unnamed: 0_level_0,League,GameTime,Away,Juice_Away,Home,Juice_Home,MLWinner,HC1,HC2,HC3,...,HC21,HC22,HC23,HC24,HC25,HC26,HC27,HC28,HC29,HC30
GameKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abb0845ce6ae8c905388a368f863ceb9,MLB,2013-07-04 19:07:00,DETROIT,-132,TORONTO,100,Away,,,,...,,,,,,,,,,
792d793e8f67cc7b6f214e6ba9397c8c,MLB,2013-07-04 20:10:00,LA DODGERS,100,COLORADO,-134,Home,,,,...,,Home,,,,,,,,
999cde91df73f5504e26ac27c5b39f68,MLB,2013-07-04 21:05:00,ST. LOUIS,-136,LA ANGELS,100,Away,,,,...,,Away,,,,,,,,
2838086990c7462fbecb4c061464f032,MLB,2013-07-05 22:05:00,BOSTON,100,LA ANGELS,-147,Home,,,,...,,,,,,,,,,
9bd97fab2ee67233384743321b6de760,MLB,2013-07-11 19:10:00,CINCINNATI,105,ATLANTA,100,Away,,,,...,,,,,,,,,,


In [14]:
print(ml_bets.GameTime.min(), ml_bets.GameTime.max())

2013-07-04 19:07:00 2018-04-11 19:10:00


In [15]:
for each in ml_bets.League.unique():
    print(ml_bets[ml_bets['League']==each]['League'].groupby([ml_bets.GameTime.dt.year, ml_bets.League]).size(), "\n")


GameTime  League
2013      MLB        657
2014      MLB       1952
2015      MLB       2086
2016      MLB       2151
2017      MLB       2241
2018      MLB        165
Name: League, dtype: int64 

GameTime  League
2013      NFL        39
2014      NFL        81
2015      NFL        80
2016      NFL       102
2017      NFL        92
2018      NFL         4
Name: League, dtype: int64 

GameTime  League
2013      NCAAF     23
2014      NCAAF     70
2015      NCAAF     58
2016      NCAAF     61
2017      NCAAF     65
2018      NCAAF      4
Name: League, dtype: int64 

GameTime  League
2013      NHL        169
2014      NHL        640
2015      NHL        850
2016      NHL        966
2017      NHL       1069
2018      NHL        534
Name: League, dtype: int64 

GameTime  League
2013      NBA         6
2014      NBA       104
2015      NBA       129
2016      NBA       174
2017      NBA       169
2018      NBA       109
Name: League, dtype: int64 

GameTime  League
2013      NCAAB       7
201

In [16]:
mlb_ml_bets = ml_bets[(ml_bets['League']=='MLB') & (ml_bets['MLWinner'] !='')]

mlb_ml_bets.shape

(9173, 37)

In [17]:
juice_bins = [-9999, -300, -200, -190, -180, -170, -160, -150, -140, -130, -120, -110, -100, 
             100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 300, 9999]

make_dummies = ['Binned_Juice_Away', 'Binned_Juice_Home', 'Away', 'Home', 
           'HC1', 'HC2', 'HC3', 'HC4', 'HC5', 'HC6', 'HC7', 'HC8', 'HC9', 'HC10',
           'HC11', 'HC12', 'HC13', 'HC14', 'HC15', 'HC16', 'HC17', 'HC18', 'HC19', 'HC20', 
           'HC21', 'HC22', 'HC23', 'HC24', 'HC25', 'HC26', 'HC27', 'HC28', 'HC29', 'HC30']

mlb_ml_bets['Binned_Juice_Away'] = pd.cut(mlb_ml_bets.Juice_Away, bins=juice_bins, labels=False)
mlb_ml_bets['Binned_Juice_Home'] = pd.cut(mlb_ml_bets.Juice_Home, bins=juice_bins, labels=False)

binarized_mlb_ml_bets = pd.get_dummies(mlb_ml_bets, columns=make_dummies)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [18]:
binarized_mlb_ml_bets.shape

(9173, 171)

In [19]:
features = binarized_mlb_ml_bets.columns.difference(['League', 'GameTime','Juice_Away', 'Juice_Home', 'MLWinner'])

target = ['MLWinner']

# Split into train and test data.
train_data = binarized_mlb_ml_bets[(binarized_mlb_ml_bets.GameTime < '2017-07-01')][features]
train_labels = binarized_mlb_ml_bets[(binarized_mlb_ml_bets.GameTime < '2017-07-01')][target]

test_data = binarized_mlb_ml_bets[(binarized_mlb_ml_bets.GameTime >= '2017-07-01')][features]
test_labels = binarized_mlb_ml_bets[(binarized_mlb_ml_bets.GameTime >= '2017-07-01')][target]

In [20]:
print (train_data.shape, test_data.shape)

(7945, 166) (1228, 166)


In [21]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score



In [22]:
clf=BernoulliNB(alpha=10)
clf.fit(train_data,train_labels)
print('train: ', accuracy_score(clf.predict(train_data),train_labels))
print('test: ', accuracy_score(clf.predict(test_data),test_labels))

test_predicted_labels = clf.predict(test_data)
test_predicted_probs = clf.predict_proba(test_data)
print("\nConfusion matrix:\n%s" % confusion_matrix(test_labels, test_predicted_labels))
print("\nClassification report for classifier %s:\n%s\n"
% (clf, classification_report(test_labels, test_predicted_labels))) 

  y = column_or_1d(y, warn=True)


train:  0.562492133417
test:  0.582247557003

Confusion matrix:
[[308 259]
 [254 407]]

Classification report for classifier BernoulliNB(alpha=10, binarize=0.0, class_prior=None, fit_prior=True):
             precision    recall  f1-score   support

       Away       0.55      0.54      0.55       567
       Home       0.61      0.62      0.61       661

avg / total       0.58      0.58      0.58      1228




In [23]:
badvalues = ['']
ix = np.isin(test_predicted_labels, badvalues)
np.where(ix)

(array([], dtype=int64),)