In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
pd.set_option('display.max_columns', 500)
from zipfile import ZipFile
from datetime import timedelta
import re
import os

In [2]:
# Load all time series data: read every CSV member of the archive and stack
# them into one frame. Context managers close the archive and each member
# handle as soon as it has been parsed, instead of leaving them open for the
# life of the kernel.
with ZipFile('../../data/bestfightodds_data/straight_bets.zip') as zip_file:
    dfs = []
    for text_file in zip_file.infolist():
        if not text_file.filename.endswith('.csv'):
            continue  # skip directory entries and any non-CSV members
        with zip_file.open(text_file.filename) as csv_handle:
            dfs.append(pd.read_csv(csv_handle))
df = pd.concat(dfs, sort = False)
# 'dates' is parsed as epoch milliseconds.
df['Date'] = pd.to_datetime(df['dates'], unit = 'ms')
del dfs  # the per-file frames are no longer needed once concatenated

# load closing odds data
closing = pd.read_csv("../../data/bestfightodds_data/moneyline_data_at_close.csv")
# align the key name with the time-series side of the comparison
closing.rename({'url':'fight_odds_url'}, inplace = True, axis = 1)
closing['Card_Date'] = pd.to_datetime(closing['Card_Date'])
# filter to straight ('Ordinary') bets only.
# NOTE(review): this comment previously also said "on cards after 2008", but
# no Card_Date filter is applied here -- confirm whether one is intended.
closing = closing[(closing['bet_type'] == 'Ordinary')]

In [3]:
# Reduce the time series to the final quote for each combination of
# fighter1 / fighter2 / url / Bet / betsite, then pivot so that every betsite
# becomes its own odds column.

byval = ['fighter1', 'fighter2', 'url', 'Bet', 'betsite', 'Date']
keepcol = ['fighter1', 'fighter2', 'url',  'Bet']
quote_keys = byval[0:-1]  # grouping keys: everything except 'Date'

out = (
    df.sort_values(byval)          # 'Date' is the last sort key, so the newest row is last in each group
      .groupby(quote_keys)
      .nth(-1)
      .reset_index()
      [keepcol + ['odds', 'betsite']]
      .pivot_table(index = keepcol,
                   columns = ['betsite'],
                   values = ['odds'],
                   aggfunc = 'first')
      .reset_index()
)

# Flatten the two-level column index: use the betsite label where there is
# one, otherwise fall back to the first-level name.
out.columns = [top if sub == '' else sub for (top, sub) in out.columns]

# Normalise awkward bookmaker labels and align the key name with `closing`.
out = out.rename(columns = {
    'William\xa0H.': 'William_H',
    'SportsInt.': 'SportsInt',
    'url': 'fight_odds_url',
})


In [4]:
# Check that the closing-odds file agrees with the last quote reconstructed
# from the time series, bookmaker by bookmaker.

# Row count before / after the inner merge (printed bare, in that order).
print(closing.shape[0])
cmp = pd.merge(closing, out,
               how = 'inner',
               on = ['fight_odds_url', 'fighter1', 'fighter2', 'Bet'], suffixes = ('_close', '_ts'))
print(cmp.shape[0])
betting_sites= ['5Dimes','BetDSI','BookMaker','SportBet','Bet365','Bovada', 
                'Sportsbook','William_H','Pinnacle','SportsInt','BetOnline','Intertops']
for col in betting_sites:
    close_col = col + '_close'
    ts_col = col + '_ts'
    # Treat -100 and +100 as the same quote before comparing
    # (presumably both encode even money -- TODO confirm).
    cmp[close_col] = np.where(cmp[close_col] == -100.0, 100.0, cmp[close_col])
    cmp[ts_col] = np.where(cmp[ts_col] == -100.0, 100.0, cmp[ts_col])
    # A site matches when both sources give the same value or both are missing.
    cmp[col+"_cmp"] = (cmp[close_col] == cmp[ts_col]) | (cmp[close_col].isna() & cmp[ts_col].isna())

# Row-level flag: every bookmaker agrees. Vectorised `all(axis=1)` replaces the
# row-wise `apply(lambda ...)`, which ran one Python call per row.
cmp['all_good'] = cmp[[x+"_cmp" for x in betting_sites]].all(axis = 1)
# Fight-level flag: every row sharing a fight_odds_url agrees.
cmp['url_all_good'] = cmp.groupby('fight_odds_url')['all_good'].transform('all')
# One representative row per URL in each bucket.
good = cmp[cmp['url_all_good']].drop_duplicates(['fight_odds_url'])
bad = cmp[~(cmp['url_all_good'])].drop_duplicates(['fight_odds_url'])
# Number of URLs with at least one mismatch, then number fully consistent.
print(bad.shape[0])
print(good.shape[0])

9730
9730
0
431
