In [7]:
import requests, bs4, pandas as pd, numpy as np
import hashlib # use this to hash game key
import collections 

In [8]:
# Web scraping
# Try an example on MIT SIMULATOR, https://www.sportschatexperts.com/index/capperhistory/capper_id/68/
def get_html(url):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    The request is best-effort: if the response carries an HTTP error status,
    the problem is printed and the (error) body is still parsed and returned.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``bs4.BeautifulSoup`` document built with the "html5lib" parser.
    """
    response = requests.get(url)
    try:
        # Raises for 4xx/5xx responses; reported below but not propagated.
        response.raise_for_status()
    except Exception as exc:
        print('There was a problem: %s' % exc)
    return bs4.BeautifulSoup(response.text, "html5lib")

def get_next_url(html):
    """Return the target of the pagination link whose text is exactly "Next".

    Args:
        html: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The link's ``href`` value. ``'#'`` is returned when the page has no
        "Next" anchor or the anchor has no ``href``; ``'#'`` is the value the
        pagination loop in this notebook already treats as "no further page",
        so a page without the button ends the crawl instead of raising
        ``TypeError`` on ``None['href']``.
    """
    next_link = html.find('a', string='Next')
    if next_link is None:
        return '#'
    return next_link.get('href', '#')

def merge_table(url, stop_page=-1, data=None):
    """Walk a paginated history table and collect its rows into ``data``.

    Starting at ``url``, each page is fetched with ``get_html``, the rows of the
    leaderboard table are extracted, and the crawl follows the link returned by
    ``get_next_url`` until that link is ``'#'`` or ``stop_page`` pages have been
    read.

    Args:
        url: Address of the first page.
        stop_page: Stop once this many pages have been processed. The check is
            ``cur_page >= stop_page`` after each page, so the default of -1
            (like any value <= 1) stops after the first page.
        data: Optional list to append rows to (mutated in place). A fresh list
            is created when omitted, so separate calls never share rows.

    Returns:
        The list of rows (the same object as ``data`` when one was passed).
        Each row is a list of stripped cell texts.
    """
    if data is None:
        # A literal [] default would be created once and shared by every call.
        data = []

    # Indices of the <td> cells worth keeping out of the 10 in a valid row.
    keep_columns = (1, 2, 3, 4, 5, 6, 7, 8)

    cur_page = 0
    while True:
        cur_page += 1
        if cur_page % 50 == 0:
            # Progress message every 50th page.
            print("\tProcess %s ..." % url)
        html = get_html(url)
        table = html.find('section', class_='leaderboards diffpadding').div.table
        table_body = table.find('tbody')
        rows = table_body.find_all('tr', class_='leaderboards_row2 fix-border-bottom ph-first-row')
        for row in rows:
            cols = row.find_all('td')
            # Only rows with exactly 10 cells are extracted.
            if len(cols) == 10:
                texts = [cols[i].text.strip() for i in keep_columns]
                # Empty cells are dropped, so a row with a blank cell ends up
                # with fewer than 8 values.
                data.append([text for text in texts if text])
        url = get_next_url(html)
        if url == '#' or cur_page >= stop_page:
            break
    return data

In [9]:
def _juice_from_line(line_selected):
    """Extract the juice text from one raw 'Line Selected' value.

    The second ': '-separated field is used. Entries containing "Money Line"
    keep that whole field; entries containing "Total" keep its first
    '/'-separated part; every other entry keeps its second '/'-separated part.
    """
    detail = line_selected.split(': ')[1]
    if "Money Line" in line_selected:
        return detail.strip()
    if "Total" in line_selected:
        return detail.split('/')[0].strip()
    return detail.split('/')[1].strip()


def _add_bet_features(df):
    """Add the derived columns to a scraped history frame and return it.

    ``df`` must have the columns 'League', 'Game', 'Date', 'Play',
    'Line Selected' and 'Result'. The frame is modified in place.
    """
    # Hashed game key: MD5 of League + Game + Date joined without a separator.
    df["GameKey"] = df[['League', 'Game', 'Date']].apply(
        lambda x: hashlib.md5(''.join(x).encode('utf-8')).hexdigest(), axis=1)

    # NOTE(review): assumes the home team is always right of 'vs.' and the
    # away team left of it -- still to be confirmed against the site.
    df["Home"] = df["Game"].apply(lambda x: x.split('vs.')[1].strip().upper())
    df["Away"] = df["Game"].apply(lambda x: x.split('vs.')[0].strip().upper())

    # Type of bet: the text before the first ':' of the selected line.
    df["Bet"] = df["Line Selected"].apply(lambda x: x.split(':')[0].strip().upper())

    # Team the bet is on: the text following 'Play on '.
    df["On"] = df["Play"].apply(lambda x: x.split('Play on ')[1].strip().upper())

    # Game time parsed value by value into datetime format.
    df["GameTime"] = df["Date"].apply(pd.to_datetime)

    df["Juice"] = df["Line Selected"].apply(_juice_from_line)

    # Masks shared by the two money-line columns below.
    is_money_line = df['Bet'] == 'MONEY LINE'
    on_home = (df['On'] == df['Home']) & is_money_line
    on_away = (df['On'] == df['Away']) & is_money_line
    won = df['Result'] == 'Win'
    lost = df['Result'] == 'Loss'

    # MLWinner: for money-line bets, the side bet on when the result is 'Win'
    # and the opposite side when it is 'Loss'; '' for everything else.
    df['MLWinner'] = np.select(
        [on_home & won, on_away & won, on_home & lost, on_away & lost],
        ['Home', 'Away', 'Away', 'Home'],
        default='')

    # MLBet: which side a money-line bet was placed on; '' for other bets.
    df['MLBet'] = np.select([on_home, on_away], ['Home', 'Away'], default='')
    return df


def crawl_history():
    """Scrape every configured handicapper's history and pickle it.

    For each (name, link) pair the paginated table is collected with
    ``merge_table`` (up to 9999 pages), turned into a DataFrame, enriched by
    ``_add_bet_features`` and written as a gzip-compressed pickle named
    '<name>.gz' (relative path). Nothing is returned.
    """
    names = [
        'Sports',
        'Eddie',
        'Schule',
        'DAmico',
        'Duffy',
        'Thomas',
        'Hunter',
        'Compeau',  # not in Ji's original list
        'Syndicate',
        'Lundin',
    ]

    links = [
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/45/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/38/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/52/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/35/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/37/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/53/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/41/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/56/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/58/",
        "https://www.sportschatexperts.com/index/capperhistory/capper_id/61/",
    ]

    # Names given to the 8 cells kept from each scraped table row.
    scraped_columns = ['League', 'Game', 'Date', 'Play',
                       'Line Selected', 'Type', 'Score', 'Result']

    columns_to_keep = ['League', 'GameKey', 'GameTime',
                       'Home', 'Away',
                       'Bet', 'On',
                       'Juice', 'Type', 'Result',
                       'MLBet', 'MLWinner']

    max_pages = 9999

    for name, link in zip(names, links):
        data = []
        # Rows are appended to `data` in place.
        merge_table(link, max_pages, data)
        df = pd.DataFrame(data, columns=scraped_columns)
        df = _add_bet_features(df)
        df[columns_to_keep].to_pickle(name + '.gz', compression="gzip")

# Run the full crawl: this fetches every handicapper page listed inside
# crawl_history() over the network and writes one '<name>.gz' pickle per
# handicapper using a relative path.
crawl_history()

In [10]:
# Example: load one of the gzip-compressed pickles written by crawl_history()
# ('Sports' is the first name in its list) and display the DataFrame.
# NOTE: only unpickle files you produced yourself; loading a pickle can run
# arbitrary code.
Sports_df = pd.read_pickle('Sports.gz', compression='gzip')
Sports_df

Unnamed: 0,League,GameKey,GameTime,Home,Away,Bet,On,Juice,Type,Result,MLBet,MLWinner
0,MLB,097454a0ea30e73053b8a3c2afd66e86,2018-04-11 14:15:00,KANSAS CITY,SEATTLE,MONEY LINE,SEATTLE,-127,Premium,Win,Away,Away
1,MLB,9655fa740ca7f529fa40c04e2a5154f3,2018-04-11 14:10:00,CHI WHITE SOX,TAMPA BAY,MONEY LINE,TAMPA BAY,101,Premium,Loss,Away,Home
2,MLB,5d795c1dcd0c8a50e57be08a6f233537,2018-04-11 13:05:00,WASHINGTON,ATLANTA,MONEY LINE,ATLANTA,112,Premium,Win,Away,Away
3,NBA,128632d43e6da7c86aa28319d9124493,2018-04-10 20:00:00,WASHINGTON,BOSTON,POINT SPREAD,WASHINGTON,-110,Premium,Win,,
4,MLB,1a6b00f4aaee52bca4bc64eace4e0286,2018-04-10 19:05:00,PHILADELPHIA,CINCINNATI,MONEY LINE,CINCINNATI,183,Premium,Loss,Away,Home
5,MLB,8cf116fbdfeff04ff1ad92eddda911c6,2018-04-10 14:10:00,CHI WHITE SOX,TAMPA BAY,MONEY LINE,TAMPA BAY,-104,Premium,Win,Away,Away
6,MLB,08c723b135840a9e9bb8af12760e11d1,2018-04-09 20:15:00,KANSAS CITY,SEATTLE,MONEY LINE,SEATTLE,-103,Premium,Loss,Away,Home
7,NBA,ac9cbde3f715d7038eb9bffc827bfe8d,2018-04-09 20:00:00,MILWAUKEE,ORLANDO,POINT SPREAD,ORLANDO,-110,Premium,Loss,,
8,NBA,5411b5e90340c0e88843dbdd1013c1f1,2018-04-09 19:30:00,BROOKLYN,CHICAGO,POINT SPREAD,BROOKLYN,-107,Premium,Win,,
9,MLB,3e9c8fea8ea1b2f2654a67da4e97e5c3,2018-04-09 19:05:00,BALTIMORE,TORONTO,MONEY LINE,BALTIMORE,-112,Premium,Loss,Home,Away


In [None]:
# additional features to make
"""
num_win_bets_specific_sport,
num_win_bets_all_sports,
num_loss_specific_sport,
num_loss_all_sports,
win_percentage_specific_sport,
win_percentage_all_sports,
avg_juice_on_bets_specific_sport, 
avg_juice_on_bets_all_sports,
current_win_streak_specific_sport, 
current_win_streak_all_sports,
current_loss_streak_specific_sport, 
current_loss_streak_all_sports,
longest_win_streak_specific_sport,
longest_win_streak_all_sports,
longest_loss_streak_specific_sport,
longest_loss_streak_all_sports
"""

"""
our very basic model can be a NB model: 
gamekey, MLWinner, HC1_MLBet, ... HC30_MLBet
including the juice for each HC's bet
"""

Goals
1) Identify which handicappers (HC) are good.
2) Determine whether we can predict who is going to win.