In [28]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

odds_data = {}

for year in range(2010, 2022):
  odds_data[year] = pd.read_excel(f"/content/mlb odds {year}.xlsx")

# Odds

In [29]:
def clean_odds(df, INCLUDE_OU=True):

  if df.shape[1] ==23:
    new_cols = { 'Unnamed: 20': 'Open OU odds', 'Unnamed: 22': 'Close OU odds'}
  else:
    new_cols = { 'Unnamed: 18': 'Open OU odds', 'Unnamed: 20': 'Close OU odds'}

  team_mapper = {'NYY': 'NYY', 'BOS': 'BOS', 'PHI': 'PHI', 'WAS': 'WAS', 'MIA': 'MIA', 'NYM': 'NYM', 'STL': 'STL', 'CIN': 'CIN', 'LOS': 'LAD', 'PIT': 'PIT', 'COL': 'COL', 'MIL': 'MIL', 'CUB': 'CUB', 'ATL': 'ATL', 'SDG': 'SDG', 'ARI': 'ARI', 'SFO': 'SFO', 'HOU': 'HOU', 'CLE': 'CLE', 'CWS': 'CWS', 'TOR': 'TOR', 'TEX': 'TEX', 'DET': 'DET', 'KAN': 'KAN', 'MIN': 'MIN', 'LAA': 'LAA', 'SEA': 'SEA', 'OAK': 'OAK', 'BAL': 'BAL', 'TAM': 'TAM', 'LAD': 'LAD', 'CHC': 'CUB'}
  df['Team'] = df['Team'].map(team_mapper)

  df = df.rename(columns = new_cols)
  try:
    cols = ['Date','VH','Team', 'Pitcher','Final','Open OU', 'Open OU odds', 'Close OU', 'Close OU odds']
    df = df[cols]
  except:
    cols = ['Date','VH','Team', 'Pitcher','Final','OpenOU', 'Open OU odds', 'CloseOU', 'Close OU odds']
    df = df[cols]
    df = df.rename(columns={'OpenOU': 'Open OU','CloseOU':'Close OU'})

    
  visit = df[df.VH=='V'].reset_index().drop(columns='index')
  home = df[df.VH=='H'].reset_index().drop(columns='index')

  new_home_cols = dict(zip(home.columns, [f"Home {i}"for i in home.columns]))
  new_visit_cols = dict(zip(visit.columns, [f"Visit {i}"for i in visit.columns]))
  home = home.rename(columns=new_home_cols)
  visit = visit.rename(columns=new_visit_cols)
  cols_at_end = ['Home Final','Visit Final']

  combined_df = pd.concat([home,visit], axis=1).drop(columns=['Home VH','Visit VH','Visit Date'])
  combined_df = combined_df[[col for col in combined_df if col not in cols_at_end] + cols_at_end]


  combined_df = combined_df.rename(columns={'Home Date':'Date'})
  combined_df[['Home Close OU','Home Final','Visit Final']] = combined_df[['Home Close OU','Home Final','Visit Final']].apply(pd.to_numeric, errors='coerce')
  combined_df['Total Runs'] = combined_df['Home Final'] + combined_df['Visit Final']
  

  if INCLUDE_OU:
    combined_df['Over'] = (combined_df['Total Runs'] >= combined_df['Home Close OU']).astype('int32')
  else:
    combined_df['Over'] = (combined_df['Total Runs'] > combined_df['Home Close OU']).astype('int32')

  base = combined_df[['Date','Home Team','Visit Team','Home Close OU','Home Final','Visit Final','Total Runs','Over']]
  base['Date'] = base['Date'].astype('int32')

  return base

In [30]:
odds_data_clean = {}
for year, df in odds_data.items():

  odds_data_clean[year] = clean_odds(df).dropna()

In [75]:
odds_2010 = odds_data_clean[2010]


team_name = 'LAD'
window = 10

def make_time_serires(clean_odds, team_name, window):

  home = clean_odds[clean_odds['Home Team']==team_name][['Date','Home Team','Visit Team','Home Final', 'Visit Final']].rename(columns={'Home Final':'Runs scored', 'Visit Final':'Runs allowed'})
  away = clean_odds[clean_odds['Visit Team']==team_name][['Date','Home Team','Visit Team','Visit Final','Home Final']].rename(columns={'Visit Final':'Runs scored', 'Home Final': 'Runs allowed'})
  team_odds = home.append(away).sort_values('Date')

  for i in range(1,window+1):
    team_odds[f'Runs scored {i} games before'] = team_odds['Runs scored'].shift(i) 

  team_odds[f'{window} game run scored average'] = team_odds['Runs scored 1 games before'].rolling(window = window).mean() 

  for i in range(1,window+1):
    team_odds[f'Runs allowed {i} games before'] = team_odds['Runs allowed'].shift(i)

  team_odds[f'{window} game run allowed average'] = team_odds['Runs scored 1 games before'].rolling(window = window).mean() 
  team_odds = team_odds.dropna()

  home_runs = team_odds[team_odds['Home Team'] == team_name]
  visit_runs = team_odds[team_odds['Visit Team'] == team_name]

  return home_runs, visit_runs


In [76]:
home_runs, visit_runs = make_time_serires(odds_2010, team_name, 10)

In [77]:
home_runs

Unnamed: 0,Date,Home Team,Visit Team,Runs scored,Runs allowed,Runs scored 1 games before,Runs scored 2 games before,Runs scored 3 games before,Runs scored 4 games before,Runs scored 5 games before,Runs scored 6 games before,Runs scored 7 games before,Runs scored 8 games before,Runs scored 9 games before,Runs scored 10 games before,10 game run scored average,Runs allowed 1 games before,Runs allowed 2 games before,Runs allowed 3 games before,Runs allowed 4 games before,Runs allowed 5 games before,Runs allowed 6 games before,Runs allowed 7 games before,Runs allowed 8 games before,Runs allowed 9 games before,Runs allowed 10 games before,10 game run allowed average
158,417,LAD,SFO,0,9,10.0,6.0,7.0,9.0,5.0,6.0,7.0,10.0,3.0,5.0,6.8,8.0,5.0,9.0,5.0,6.0,7.0,3.0,2.0,4.0,11.0,6.8
176,418,LAD,SFO,2,1,0.0,10.0,6.0,7.0,9.0,5.0,6.0,7.0,10.0,3.0,6.3,9.0,8.0,5.0,9.0,5.0,6.0,7.0,3.0,2.0,4.0,6.3
324,429,LAD,PIT,0,2,3.0,5.0,0.0,0.0,4.0,1.0,5.0,14.0,9.0,2.0,4.3,7.0,10.0,4.0,1.0,3.0,5.0,8.0,6.0,11.0,1.0,4.3
336,430,LAD,PIT,6,2,0.0,3.0,5.0,0.0,0.0,4.0,1.0,5.0,14.0,9.0,4.1,2.0,7.0,10.0,4.0,1.0,3.0,5.0,8.0,6.0,11.0,4.1
352,501,LAD,PIT,5,1,6.0,0.0,3.0,5.0,0.0,0.0,4.0,1.0,5.0,14.0,3.8,2.0,2.0,7.0,10.0,4.0,1.0,3.0,5.0,8.0,6.0,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267,922,LAD,SDG,1,3,0.0,7.0,2.0,5.0,2.0,1.0,1.0,4.0,6.0,4.0,3.2,6.0,6.0,12.0,7.0,10.0,2.0,0.0,7.0,3.0,2.0,3.2
2280,923,LAD,SDG,3,1,1.0,0.0,7.0,2.0,5.0,2.0,1.0,1.0,4.0,6.0,2.9,3.0,6.0,6.0,12.0,7.0,10.0,2.0,0.0,7.0,3.0,2.9
2390,1001,LAD,ARI,5,7,7.0,9.0,3.0,4.0,2.0,3.0,3.0,1.0,0.0,7.0,3.9,6.0,7.0,1.0,5.0,5.0,1.0,1.0,3.0,6.0,6.0,3.9
2406,1002,LAD,ARI,3,2,5.0,7.0,9.0,3.0,4.0,2.0,3.0,3.0,1.0,0.0,3.7,7.0,6.0,7.0,1.0,5.0,5.0,1.0,1.0,3.0,6.0,3.7


In [78]:

def merge_time_series(clean_odds, window):

  teams = clean_odds['Home Team'].unique()

  home_main_df = pd.DataFrame()
  visit_main_df = pd.DataFrame()

  for team in teams:
    home_runs, visit_runs = make_time_serires(clean_odds,team, window)

    home_main_df = home_main_df.append(home_runs)
    visit_main_df = visit_main_df.append(visit_runs)

  merged = pd.merge(visit_main_df, home_main_df, on=['Date','Home Team', 'Visit Team',]).sort_values('Date')

  mapper_visit = dict(zip(merged.columns[3:15].tolist() , [f'Visit {i[:-2]}' for i in merged.columns[3:15]]))
  mapper_home = dict(zip(merged.columns[15:].tolist() , [f'Home {i[:-2]}' for i in merged.columns[15:]]))
  merged = merged.rename(columns=mapper_visit)
  merged = merged.rename(columns=mapper_home)
  merged = merged.reset_index().drop(columns='index')

  merged = merged.dropna()
  merged = merged[merged['Date']<1000]
  return merged

In [79]:
runs_series_2010 = merge_time_series(odds_2010, 10)

In [84]:
runs_series = {}
for year in range(2010,2022):
  runs_series[year] = merge_time_series(odds_data_clean[year], 10)

# Clean Data 

In [34]:
clean_data = {}
for year in range(2010,2022):
  clean_data[year] = pd.read_csv(f'https://raw.githubusercontent.com/jacobh310/over_under/master/data_cleaning/clean_data/ops_clean_data_{year}.csv')

In [100]:
merged = {}

for year in range(2010,2022):

  temp = pd.merge(clean_data[year],runs_series[year], on=['Date','Home Team','Visit Team'])
  temp.to_csv(f'/content/clean_data/series_clean_data_{year}.csv', index=False)
  merged[year] = temp


In [101]:
!zip -r /content/series.zip /content/clean_data


  adding: content/clean_data/ (stored 0%)
  adding: content/clean_data/series_clean_data_2015.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2010.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2020.csv (deflated 81%)
  adding: content/clean_data/series_clean_data_2018.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2016.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2013.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2017.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2011.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2014.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2012.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2019.csv (deflated 80%)
  adding: content/clean_data/series_clean_data_2021.csv (deflated 80%)


In [104]:
merged[2010][:15]

Unnamed: 0,Date,Home Team,Visit Team,Home Close OU,Total Runs,Over,Visit OPS 1,Visit OPS 2,Visit OPS 3,Visit OPS 4,Visit OPS 5,Visit OPS 6,Visit OPS 7,Visit OPS 8,Visit OPS 9,Visit ERA,Home OPS 1,Home OPS 2,Home OPS 3,Home OPS 4,Home OPS 5,Home OPS 6,Home OPS 7,Home OPS 8,Home OPS 9,Home ERA,Visit Runs scored,Visit Runs allowed,Visit Runs scored 1 games before,Visit Runs scored 2 games before,Visit Runs scored 3 games before,Visit Runs scored 4 games before,Visit Runs scored 5 games before,Visit Runs scored 6 games before,Visit Runs scored 7 games before,Visit Runs scored 8 games before,Visit Runs scored 9 games before,Visit Runs scored 10 games before,Home 10 game run scored average,Home Runs allowed 1 games before,Home Runs allowed 2 games before,Home Runs allowed 3 games before,Home Runs allowed 4 games before,Home Runs allowed 5 games before,Home Runs allowed 6 games before,Home Runs allowed 7 games before,Home Runs allowed 8 games before,Home Runs allowed 9 games before,Home Runs allowed 10 games before,Home 10 game run allowed average,Home Runs scored,Home Runs allowed,Home Runs scored 1 games before,Home Runs scored 2 games before,Home Runs scored 3 games before,Home Runs scored 4 games before,Home Runs scored 5 games before,Home Runs scored 6 games before,Home Runs scored 7 games before,Home Runs scored 8 games before,Home Runs scored 9 games before,Home Runs scored 10 games before,Home 10 game run scored average.1,Home Runs allowed 1 games before.1,Home Runs allowed 2 games before.1,Home Runs allowed 3 games before.1,Home Runs allowed 4 games before.1,Home Runs allowed 5 games before.1,Home Runs allowed 6 games before.1,Home Runs allowed 7 games before.1,Home Runs allowed 8 games before.1,Home Runs allowed 9 games before.1,Home Runs allowed 10 games before.1,Home 10 game run allowed average.1
0,416,TOR,LAA,8.5,12,1,0.676,0.754,0.943,1.0,0.756,0.736,0.398,0.229,0.5,13.5,0.721,1.145,0.866,1.294,0.287,0.717,0.65,0.579,0.631,1.35,7,5,2.0,5.0,5.0,4.0,4.0,4.0,1.0,2.0,3.0,6.0,3.6,6.0,3.0,7.0,9.0,3.0,10.0,10.0,4.0,5.0,3.0,3.6,5,7,7.0,1.0,4.0,7.0,5.0,3.0,7.0,3.0,7.0,4.0,4.8,3.0,11.0,2.0,8.0,2.0,0.0,6.0,1.0,4.0,5.0,4.8
1,416,OAK,BAL,7.5,6,0,1.105,0.754,0.75,0.689,0.811,0.9,0.889,0.576,0.498,4.91,0.602,0.946,0.723,0.663,0.626,0.554,1.138,0.54,0.827,2.65,2,4,2.0,1.0,6.0,1.0,2.0,0.0,6.0,5.0,3.0,3.0,2.9,6.0,9.0,8.0,5.0,5.0,3.0,7.0,4.0,4.0,4.0,2.9,4,2,6.0,2.0,0.0,4.0,9.0,3.0,10.0,6.0,6.0,1.0,4.7,2.0,4.0,3.0,0.0,4.0,4.0,4.0,2.0,5.0,1.0,4.7
2,417,BOS,TAM,9.0,11,1,0.618,0.946,0.776,0.881,0.973,0.792,0.66,1.0,0.459,4.09,0.619,0.5,1.194,0.909,0.525,0.606,0.849,2.0,0.273,3.86,6,5,1.0,9.0,8.0,5.0,3.0,0.0,9.0,4.0,4.0,4.0,4.7,1.0,1.0,6.0,1.0,7.0,10.0,3.0,5.0,3.0,3.0,4.7,5,6,1.0,0.0,6.0,2.0,8.0,8.0,3.0,1.0,4.0,9.0,4.2,1.0,8.0,3.0,5.0,6.0,3.0,4.0,3.0,6.0,7.0,4.2
3,417,SDG,ARI,8.0,5,0,0.669,0.883,0.814,0.645,1.191,0.989,0.713,0.825,1.166,3.5,0.563,0.697,1.128,0.933,0.877,0.811,0.583,0.641,0.833,3.6,0,5,3.0,5.0,9.0,5.0,15.0,3.0,9.0,5.0,3.0,6.0,6.3,6.0,6.0,7.0,9.0,6.0,6.0,1.0,3.0,6.0,3.0,6.3,5,0,6.0,2.0,1.0,17.0,2.0,5.0,0.0,3.0,6.0,3.0,4.5,3.0,6.0,6.0,2.0,4.0,4.0,7.0,5.0,3.0,6.0,4.5
4,417,MIN,KAN,9.5,11,1,0.876,0.962,0.785,1.195,0.79,0.652,0.862,0.873,0.0,3.57,0.685,0.64,0.976,0.983,0.902,0.894,0.891,0.698,0.938,3.38,5,6,3.0,7.0,5.0,10.0,6.0,3.0,4.0,3.0,3.0,4.0,4.8,10.0,3.0,6.0,5.0,8.0,8.0,3.0,7.0,2.0,8.0,4.8,6,5,10.0,8.0,3.0,5.0,4.0,2.0,4.0,10.0,4.0,5.0,5.5,3.0,0.0,6.0,2.0,5.0,1.0,3.0,1.0,2.0,3.0,5.5
5,417,OAK,BAL,7.5,7,0,0.691,0.828,0.627,0.727,0.6,0.817,0.617,0.987,0.465,2.89,0.594,0.861,0.705,0.65,0.548,0.428,0.834,0.956,0.867,2.7,3,4,2.0,2.0,1.0,6.0,1.0,2.0,0.0,6.0,5.0,3.0,2.8,4.0,6.0,9.0,8.0,5.0,5.0,3.0,7.0,4.0,4.0,2.8,4,3,4.0,6.0,2.0,0.0,4.0,9.0,3.0,10.0,6.0,6.0,5.0,2.0,2.0,4.0,3.0,0.0,4.0,4.0,4.0,2.0,5.0,5.0
6,417,TOR,LAA,10.0,9,0,0.691,0.718,0.973,0.989,0.803,0.7,0.596,0.711,0.848,2.84,0.658,1.09,0.945,1.386,0.262,0.637,0.875,0.551,0.529,4.05,6,3,7.0,2.0,5.0,5.0,4.0,4.0,4.0,1.0,2.0,3.0,3.7,5.0,6.0,3.0,7.0,9.0,3.0,10.0,10.0,4.0,5.0,3.7,3,6,5.0,7.0,1.0,4.0,7.0,5.0,3.0,7.0,3.0,7.0,4.9,7.0,3.0,11.0,2.0,8.0,2.0,0.0,6.0,1.0,4.0,4.9
7,417,SEA,DET,7.5,6,0,0.861,0.59,1.036,1.105,0.8,0.876,0.455,0.701,0.606,9.0,0.589,0.747,0.97,0.637,0.624,0.628,0.691,0.803,0.422,3.1,2,4,3.0,3.0,6.0,5.0,9.0,4.0,5.0,7.0,2.0,8.0,5.2,11.0,7.0,5.0,10.0,8.0,2.0,2.0,3.0,3.0,4.0,5.2,4,2,11.0,4.0,3.0,0.0,2.0,4.0,2.0,2.0,5.0,1.0,3.4,3.0,2.0,0.0,4.0,9.0,3.0,6.0,6.0,6.0,1.0,3.4
8,417,PIT,CIN,8.5,9,1,0.622,0.595,0.685,0.842,0.444,0.422,1.214,1.301,1.2,2.63,0.845,0.714,0.618,0.841,0.596,0.403,0.697,0.0,0.743,2.37,4,5,3.0,2.0,3.0,10.0,6.0,3.0,3.0,5.0,2.0,3.0,4.0,4.0,10.0,5.0,8.0,5.0,1.0,4.0,4.0,1.0,6.0,4.0,5,4,4.0,0.0,6.0,3.0,6.0,6.0,1.0,2.0,4.0,11.0,4.3,3.0,6.0,5.0,9.0,15.0,3.0,9.0,10.0,3.0,5.0,4.3
9,417,STL,NYM,7.5,3,0,0.553,0.61,1.156,0.706,0.673,1.392,0.684,0.606,0.5,3.75,0.917,0.282,1.214,0.613,0.705,0.827,0.311,0.287,0.0,3.5,2,1,3.0,5.0,5.0,3.0,2.0,3.0,8.0,1.0,6.0,7.0,4.3,4.0,0.0,6.0,11.0,5.0,4.0,2.0,3.0,7.0,1.0,4.3,1,2,4.0,1.0,2.0,5.0,7.0,7.0,5.0,1.0,6.0,11.0,4.9,3.0,5.0,1.0,0.0,8.0,1.0,4.0,2.0,3.0,6.0,4.9
