In [28]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Define Necessary Functions

In [29]:
def clean_odds(df, INCLUDE_OU=True):
  """Collapse a raw odds sheet (two rows per game) into one row per game.

  Rows with ``VH == 'V'`` are treated as visitor rows and rows with
  ``VH == 'H'`` as home rows; rows with any other ``VH`` value are dropped.
  Visitor and home rows are paired by position (first 'V' with first 'H',
  and so on), not by a game identifier.

  Parameters
  ----------
  df : pandas.DataFrame
    Raw odds sheet. Must contain ``Date``, ``VH``, ``Team``, ``Pitcher``,
    ``Final``, the over/under lines (``Open OU``/``Close OU`` or
    ``OpenOU``/``CloseOU``) and the unnamed over/under price columns.
    The frame is not modified.
  INCLUDE_OU : bool, default True
    If True, a total equal to the closing line counts as an over (``>=``);
    if False, only totals strictly above the line count (``>``).

  Returns
  -------
  pandas.DataFrame
    Columns ``Date`` (int32), ``Home Team``, ``Visit Team``,
    ``Home Close OU``, ``Total Runs`` and ``Over`` (int32, 0 or 1).
    Team codes missing from the internal mapping come back as NaN.

  Raises
  ------
  KeyError
    If the required columns are missing under both spellings.
  """
  # The over/under price columns are unnamed in the sheet; which unnamed
  # columns hold them depends on the sheet width.
  if df.shape[1] == 23:
    new_cols = {'Unnamed: 20': 'Open OU odds', 'Unnamed: 22': 'Close OU odds'}
  else:
    new_cols = {'Unnamed: 18': 'Open OU odds', 'Unnamed: 20': 'Close OU odds'}

  ## renames some teams to make consistent
  team_mapper = {'NYY': 'NYY', 'BOS': 'BOS', 'PHI': 'PHI', 'WAS': 'WAS', 'MIA': 'MIA', 'NYM': 'NYM', 'STL': 'STL', 'CIN': 'CIN', 'LOS': 'LAD', 'PIT': 'PIT', 'COL': 'COL', 'MIL': 'MIL', 'CUB': 'CUB', 'ATL': 'ATL', 'SDG': 'SDG', 'ARI': 'ARI', 'SFO': 'SFO', 'HOU': 'HOU', 'CLE': 'CLE', 'CWS': 'CWS', 'TOR': 'TOR', 'TEX': 'TEX', 'DET': 'DET', 'KAN': 'KAN', 'MIN': 'MIN', 'LAA': 'LAA', 'SEA': 'SEA', 'OAK': 'OAK', 'BAL': 'BAL', 'TAM': 'TAM', 'LAD': 'LAD', 'CHC': 'CUB'}

  cols = ['Date', 'VH', 'Team', 'Pitcher', 'Final', 'Open OU', 'Open OU odds', 'Close OU', 'Close OU odds']
  df = df.rename(columns=new_cols)
  if not set(cols).issubset(df.columns):
    # Fall back to the spelling without a space; an explicit check replaces
    # the former bare ``except`` so unrelated errors are no longer swallowed.
    df = df.rename(columns={'OpenOU': 'Open OU', 'CloseOU': 'Close OU'})
  # .copy() so the team mapping below never writes into the caller's frame.
  df = df[cols].copy()
  df['Team'] = df['Team'].map(team_mapper)

  visit = df[df['VH'] == 'V'].reset_index(drop=True)
  home = df[df['VH'] == 'H'].reset_index(drop=True)
  home = home.rename(columns={col: f"Home {col}" for col in home.columns})
  visit = visit.rename(columns={col: f"Visit {col}" for col in visit.columns})

  # Side-by-side join on the positional index built above.
  combined_df = pd.concat([home, visit], axis=1).drop(columns=['Home VH', 'Visit VH', 'Visit Date'])
  cols_at_end = ['Home Final', 'Visit Final']
  combined_df = combined_df[[col for col in combined_df if col not in cols_at_end] + cols_at_end]
  combined_df = combined_df.rename(columns={'Home Date': 'Date'})

  # Non-numeric entries in these columns become NaN.
  numeric_cols = ['Home Close OU', 'Home Final', 'Visit Final']
  combined_df[numeric_cols] = combined_df[numeric_cols].apply(pd.to_numeric, errors='coerce')
  combined_df['Total Runs'] = combined_df['Home Final'] + combined_df['Visit Final']

  if INCLUDE_OU:
    combined_df['Over'] = (combined_df['Total Runs'] >= combined_df['Home Close OU']).astype('int32')
  else:
    combined_df['Over'] = (combined_df['Total Runs'] > combined_df['Home Close OU']).astype('int32')

  # Copy the column subset before assigning so the write hits a real frame.
  base = combined_df[['Date', 'Home Team', 'Visit Team', 'Home Close OU', 'Total Runs', 'Over']].copy()
  base['Date'] = base['Date'].astype('int32')

  return base


In [30]:
def clean_stats(team, is_home, df):
  """Return one team's home (or away) games with its stats lagged one row.

  The layout of ``df`` is read by position: the first 10 columns are taken
  as the home-side stats, columns 10 up to the last three as the visit-side
  stats, and the frame must also contain ``Date``, ``Visit Team`` and
  ``Home Team``.
  NOTE(review): the visit-side stat columns are presumably named
  ``'Visit <stat>'`` (the first 6 characters are stripped to get the
  generic stat name) and the last three columns are presumably
  ``Date``, ``Visit Team``, ``Home Team`` -- confirm against the stats CSV.

  Parameters
  ----------
  team : str
    Team code to select.
  is_home : bool
    True selects rows where ``Home Team == team`` and keeps the home-side
    stats; False selects rows where ``Visit Team == team`` and keeps the
    visit-side stats.
  df : pandas.DataFrame
    Per-game stats frame. It is not modified.

  Returns
  -------
  pandas.DataFrame
    The selected games sorted by ``Date``, restricted to ``Date < 1000``,
    with the first 10 columns shifted down one row so each game carries the
    values from the previous row of this subset. Rows containing any NaN
    (including the first row after the shift) are dropped.
  """
  # Generic stat names with the side prefix removed.
  stat_names = [col[6:] for col in df.columns[10:-3]]

  # Only the requested side is built; the other one was previously computed
  # and thrown away.
  if is_home:
    side_cols = df.columns[:10]
    games = df[df['Home Team'] == team][side_cols.tolist() + ['Date', 'Visit Team', 'Home Team']]
  else:
    side_cols = df.columns[10:-3]
    games = df[df['Visit Team'] == team][df.columns[10:]]
  games = games.rename(columns=dict(zip(side_cols, stat_names))).reset_index(drop=True)

  # Everything except the last two columns is coerced to numbers;
  # unparseable entries become NaN and are dropped at the end.
  numeric_cols = games.columns[:-2]
  games[numeric_cols] = games[numeric_cols].apply(pd.to_numeric, errors='coerce')

  # Stable sort: rows sharing a Date keep their input order, so the lag below
  # cannot pick up a same-date row that appeared later in the file.
  games = games.sort_values('Date', kind='mergesort').reset_index(drop=True)

  # NOTE(review): Date looks MMDD-encoded, so this presumably drops October
  # onward -- confirm. .copy() so the assignment below hits a real frame.
  games = games[games['Date'] < 1000].copy()

  # shifts the stats one row to prevent data leakage
  lagged_cols = games.columns[:10]
  games[lagged_cols] = games[lagged_cols].shift(1)

  return games.dropna().reset_index(drop=True)


In [31]:
def merge_df(base, df):
  """Inner-join ``base`` with ``df`` on ``Date`` and ``Home Team``.

  After the join the last column of the result is discarded and the
  ``Visit Team_x`` column (the copy that came from ``base`` when both frames
  carry ``Visit Team``) is renamed back to ``Visit Team``.
  NOTE(review): this relies on the duplicated ``Visit Team`` being the last
  non-key column of ``df`` -- confirm for any new caller.
  """
  joined = base.merge(df, on=['Date', 'Home Team'])
  kept_columns = joined.columns[:-1]
  return joined[kept_columns].rename(columns={'Visit Team_x': 'Visit Team'})

# Import the odds (Excel) and stats (CSV) data

In [32]:
# Load the raw 2018 odds workbook and the 2018 per-game stats, then reduce
# the odds sheet to one row per game.
odds_df = pd.read_excel("/content/yearly_odds/mlb odds 2018.xlsx")
stats_df = pd.read_csv("/content/yearly_stats/2018_stats.csv")

odds_clean = clean_odds(odds_df, INCLUDE_OU=True)

In [33]:
def merge_odds_stats(odds, stats):
  """Join cleaned per-game odds with lagged home and visit team stats.

  For every team found in ``odds['Home Team']`` the team's home games and
  away games are extracted from ``stats`` with ``clean_stats`` and joined to
  ``odds`` with ``merge_df``. The stacked away-side and home-side results are
  then merged on the game columns, so each output row carries the odds
  columns plus ``Visit <stat>`` and ``Home <stat>`` columns.

  Parameters
  ----------
  odds : pandas.DataFrame
    Per-game odds with columns ``Date``, ``Home Team``, ``Visit Team``,
    ``Home Close OU``, ``Total Runs`` and ``Over``.
  stats : pandas.DataFrame
    Per-game stats with ``Home Team`` and ``Visit Team`` columns in the
    stats source's team codes. It is not modified.

  Returns
  -------
  pandas.DataFrame
    One row per distinct (``Date``, ``Home Team``, ``Visit Team``), sorted
    by ``Date`` with a fresh 0..n-1 index.
  """
  ## corrects the teams names to keep consistent
  team_mapper = {'SF': 'SF', 'BOS': 'BOS', 'WSH': 'WSH', 'CIN': 'CIN', 'NYM': 'NYM', 'PIT': 'PIT', 'CHW': 'CHW', 'TEX': 'TEX', 'MIL': 'MIL', 'KC': 'KC', 'ATL': 'ATL', 'ARI': 'ARI', 'HOU': 'HOU', 'LAA': 'LAA', 'OAK': 'OAK', 'TB': 'TB', 'DET': 'DET', 'BAL': 'BAL', 'COL': 'COL', 'FLA': 'MIA', 'CHC': 'CHC', 'CLE': 'CLE', 'PHI': 'PHI', 'MIN': 'MIN', 'STL': 'STL', 'SD': 'SD', 'SEA': 'SEA', 'TOR': 'TOR', 'NYY': 'NYY', 'LAD': 'LAD', 'MIA': 'MIA'}

  ## corrects the names of the teams to match the team names from the odds data
  mapper = {'SF': 'SFO', 'CHC': 'CUB', 'WSH': 'WAS', 'KC': 'KAN', 'CHW': 'CWS', 'SD': 'SDG', 'TB': 'TAM', 'CIN': 'CIN', 'SEA': 'SEA', 'NYM': 'NYM', 'HOU': 'HOU', 'BAL': 'BAL', 'MIN': 'MIN', 'PIT': 'PIT', 'TEX': 'TEX', 'STL': 'STL', 'MIA': 'MIA', 'LAD': 'LAD', 'ARI': 'ARI', 'CLE': 'CLE', 'PHI': 'PHI', 'MIL': 'MIL', 'COL': 'COL', 'OAK': 'OAK', 'NYY': 'NYY', 'TOR': 'TOR', 'LAA': 'LAA', 'DET': 'DET', 'BOS': 'BOS', 'ATL': 'ATL'}

  # Work on a copy: mapping in place rewrote the caller's frame, and a second
  # call on the same frame then mapped the already-converted codes to NaN.
  stats = stats.copy()
  for side in ('Home Team', 'Visit Team'):
    stats[side] = stats[side].map(team_mapper).map(mapper)

  teams = odds['Home Team'].unique()
  columns = ['Date', 'Home Team', 'Visit Team', 'Home Close OU', 'Total Runs', 'Over', 'AVG 1', 'AVG 2', 'AVG 3', 'AVG 4', 'AVG 5', 'AVG 6', 'AVG 7', 'AVG 8', 'AVG 9', 'ERA']

  def _stack(frames):
    # One concat instead of DataFrame.append in a loop (append was removed in
    # pandas 2.0 and re-copies the accumulated frame on every iteration).
    if not frames:
      return pd.DataFrame(columns=columns)
    stacked = pd.concat(frames)
    # Keep the expected columns first, then any extras, matching the layout
    # that appending onto an empty frame with these columns produced.
    ordered = columns + [col for col in stacked.columns if col not in columns]
    return stacked.reindex(columns=ordered)

  home_frames = []
  visit_frames = []
  for team in teams:
    home_frames.append(merge_df(odds, clean_stats(team, True, stats)))
    visit_frames.append(merge_df(odds, clean_stats(team, False, stats)))

  home_main_df = _stack(home_frames)
  visit_main_df = _stack(visit_frames)

  merged = pd.merge(visit_main_df, home_main_df, on=['Date','Home Team', 'Visit Team','Home Close OU','Total Runs','Over']).sort_values('Date')
  merged = merged.drop_duplicates(subset=['Date','Home Team','Visit Team'])

  # The away-side frame is the left operand of the merge, so its stat columns
  # get the ``_x`` suffix and the home-side ones get ``_y``.
  stat_cols = visit_main_df.columns[6:]
  stat_mapper_visit = {f'{col}_x': f'Visit {col}' for col in stat_cols}
  stat_mapper_home = {f'{col}_y': f'Home {col}' for col in stat_cols}

  merged = merged.rename(columns=stat_mapper_visit)
  merged = merged.rename(columns=stat_mapper_home)
  merged = merged.reset_index(drop=True)

  return merged

In [34]:
# Join the cleaned 2018 odds with the 2018 team stats.
merged = merge_odds_stats(odds_clean, stats_df)

In [35]:
# Display the merged 2018 frame as the cell output.
merged

Unnamed: 0,Date,Home Team,Visit Team,Home Close OU,Total Runs,Over,Visit AVG 1,Visit AVG 2,Visit AVG 3,Visit AVG 4,Visit AVG 5,Visit AVG 6,Visit AVG 7,Visit AVG 8,Visit AVG 9,Visit ERA,Home AVG 1,Home AVG 2,Home AVG 3,Home AVG 4,Home AVG 5,Home AVG 6,Home AVG 7,Home AVG 8,Home AVG 9,Home ERA
0,330,SDG,MIL,8.0,14,1,0.600,0.250,0.000,0.200,0.400,0.000,0.000,0.200,0.500,0.00,0.250,0.000,0.000,0.400,0.333,0.000,0.500,0.000,0.000,1.29
1,330,TAM,BOS,7.0,1,0,0.250,0.000,0.000,0.000,0.750,0.250,0.500,0.000,0.333,0.00,0.250,0.000,0.000,0.000,0.000,0.333,0.500,0.000,0.000,6.00
2,330,TOR,NYY,8.5,6,0,0.200,0.500,0.600,0.200,0.500,0.000,0.250,0.250,0.000,0.00,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.333,0.000,3.86
3,330,MIA,CUB,8.0,3,0,0.200,0.250,0.250,0.200,0.333,0.667,0.333,0.000,0.000,8.10,0.000,0.500,0.333,0.000,0.667,0.333,0.250,0.000,0.000,11.25
4,330,LAD,SFO,7.5,1,0,0.250,0.500,0.250,0.000,0.000,0.500,0.250,0.250,0.000,0.00,0.000,0.000,0.000,0.000,0.000,0.333,0.667,0.000,1.000,1.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2335,930,SEA,TEX,9.0,4,0,0.254,0.255,0.258,0.271,0.258,0.194,0.261,0.225,0.213,4.30,0.285,0.304,0.303,0.257,0.262,0.220,0.217,0.201,0.268,3.76
2336,930,MIN,CWS,9.0,9,1,0.234,0.243,0.237,0.236,0.275,0.230,0.214,0.240,0.091,4.18,0.282,0.291,0.356,0.275,0.230,0.269,0.257,0.222,0.276,3.62
2337,930,CUB,STL,8.5,15,1,0.258,0.303,0.238,0.280,0.261,0.262,0.264,0.276,0.143,2.83,0.299,0.309,0.278,0.289,0.272,0.274,0.233,0.233,0.130,2.36
2338,930,SFO,LAD,7.5,15,1,0.248,0.312,0.260,0.269,0.261,0.268,0.243,0.251,0.239,2.73,0.229,0.254,0.243,0.244,0.255,0.300,0.236,0.220,0.094,2.81


# Clean and Merge all the Data Sets

In [None]:
# Clean and merge each listed season and write one CSV per year.
# The output folder is created up front so the export also works on a fresh
# runtime where /content/clean_data does not exist yet.
os.makedirs('/content/clean_data', exist_ok=True)

for year in range(2020,2022):
  print(year)
  odds_df = pd.read_excel(f"/content/yearly_odds/mlb odds {year}.xlsx")
  stats_df = pd.read_csv(f"/content/yearly_stats/{year}_stats.csv")
  odds_clean = clean_odds(odds_df, True)
  merged = merge_odds_stats(odds_clean, stats_df)
  merged.to_csv(f'/content/clean_data/clean_data_{year}.csv', index=False)
  



2020
2021


In [None]:
# Inspect the raw 2020 odds sheet's column layout. The sheet is loaded here
# because no other cell defines `odds_df_2020`, so the bare reference raised
# NameError on a fresh kernel.
odds_df_2020 = pd.read_excel("/content/yearly_odds/mlb odds 2020.xlsx")
odds_df_2020.columns

Index(['Date', 'Rot', 'VH', 'Team', 'Pitcher', '1st', '2nd', '3rd', '4th',
       '5th', '6th', '7th', '8th', '9th', 'Final', 'Open', 'Close', 'RunLine',
       'Unnamed: 18', 'OpenOU', 'Unnamed: 20', 'CloseOU', 'Unnamed: 22'],
      dtype='object')

In [None]:
!zip -r /content/files.zip /content/clean_data


  adding: content/clean_data/ (stored 0%)
  adding: content/clean_data/clean_data_2021.csv (deflated 78%)
  adding: content/clean_data/clean_data_2020.csv (deflated 78%)
