In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Define Necessary Functions

In [None]:
def clean_odds(df, INCLUDE_OU):
  """Reshape a raw season odds sheet into one row per game with an over/under label.

  The sheet holds one row per team per game, flagged by ``VH`` ('V' = visitor,
  'H' = home). Visitor and home rows are paired purely by position (the i-th
  'V' row with the i-th 'H' row), so the sheet must list them in matching order.

  Args:
    df: Raw odds DataFrame. Must contain 'Date', 'VH', 'Team', 'Pitcher',
      'Final', 'Open OU', 'Close OU' and the two unnamed odds columns renamed
      below. The caller's frame is left untouched.
    INCLUDE_OU: If truthy, a total equal to the closing line counts as an
      over (``>=``); otherwise only totals strictly above the line do (``>``).

  Returns:
    DataFrame with columns 'Date' (int32), 'Home Team', 'Visit Team',
    'Home Close OU', 'Total Runs' and 'Over' (int32, 1 or 0).
  """
  # The unnamed odds columns sit at different labels depending on the sheet
  # width: 23-column sheets use one layout, every other width the second one.
  if df.shape[1] == 23:
    new_cols = {'Unnamed: 20': 'Open OU odds', 'Unnamed: 22': 'Close OU odds'}
  else:
    new_cols = {'Unnamed: 18': 'Open OU odds', 'Unnamed: 20': 'Close OU odds'}

  # Translates the sheet's team codes to one code per club; codes that are not
  # listed become NaN (Series.map semantics).
  team_mapper = {'NYY': 'NYY', 'BOS': 'BOS', 'PHI': 'PHI', 'WAS': 'WAS', 'MIA': 'MIA', 'NYM': 'NYM', 'STL': 'STL', 'CIN': 'CIN', 'LOS': 'LAD', 'PIT': 'PIT', 'COL': 'COL', 'MIL': 'MIL', 'CUB': 'CUB', 'ATL': 'ATL', 'SDG': 'SDG', 'ARI': 'ARI', 'SFO': 'SFO', 'HOU': 'HOU', 'CLE': 'CLE', 'CWS': 'CWS', 'TOR': 'TOR', 'TEX': 'TEX', 'DET': 'DET', 'KAN': 'KAN', 'MIN': 'MIN', 'LAA': 'LAA', 'SEA': 'SEA', 'OAK': 'OAK', 'BAL': 'BAL', 'TAM': 'TAM', 'LAD': 'LAD', 'CHC': 'CUB'}
  cols = ['Date', 'VH', 'Team', 'Pitcher', 'Final', 'Open OU', 'Open OU odds', 'Close OU', 'Close OU odds']

  # rename/selection/assign all return new frames, so the input is never
  # written to (the previous version overwrote df['Team'] in place).
  odds = df.rename(columns=new_cols)[cols]
  odds = odds.assign(Team=odds['Team'].map(team_mapper))

  # Split by side and renumber from 0 so the axis=1 concat lines the i-th home
  # row up with the i-th visitor row.
  home = odds[odds['VH'] == 'H'].reset_index(drop=True).add_prefix('Home ')
  visit = odds[odds['VH'] == 'V'].reset_index(drop=True).add_prefix('Visit ')
  combined_df = pd.concat([home, visit], axis=1).rename(columns={'Home Date': 'Date'})

  # Non-numeric entries in scores or the closing line are coerced to NaN.
  numeric_cols = ['Home Close OU', 'Home Final', 'Visit Final']
  combined_df[numeric_cols] = combined_df[numeric_cols].apply(pd.to_numeric, errors='coerce')
  combined_df['Total Runs'] = combined_df['Home Final'] + combined_df['Visit Final']

  # NOTE: comparisons against NaN are False, so games with a missing score or
  # line are labelled 0 rather than dropped.
  if INCLUDE_OU:
    went_over = combined_df['Total Runs'] >= combined_df['Home Close OU']
  else:
    went_over = combined_df['Total Runs'] > combined_df['Home Close OU']
  combined_df['Over'] = went_over.astype('int32')

  # Explicit copy so the dtype assignment below targets an independent frame
  # instead of a slice of combined_df.
  base = combined_df[['Date', 'Home Team', 'Visit Team', 'Home Close OU', 'Total Runs', 'Over']].copy()
  base['Date'] = base['Date'].astype('int32')

  return base


In [None]:
def clean_stats(team, is_home, df, date_cutoff=1000):
  """Build one team's per-game stat rows, lagged by one game, for one side.

  The layout of ``df`` is positional: the first 10 columns are treated as the
  home-side stats, columns 10 up to the last three as the visit-side stats,
  and the frame must also contain 'Date', 'Visit Team' and 'Home Team'.
  The side prefix is removed from the stat names (leaving e.g. 'AVG 1', 'ERA')
  so home and visit frames share the same column names.

  Args:
    team: Team code to select.
    is_home: If truthy, use the rows where ``team`` is the home team together
      with the home-side stat columns; otherwise the rows where it is the
      visiting team together with the visit-side stat columns.
    df: Season stats DataFrame in the layout described above. Not modified.
    date_cutoff: Only rows with ``Date < date_cutoff`` are kept. The default
      of 1000 presumably means "before October" for MMDD-encoded dates --
      TODO confirm against the data source.

  Returns:
    DataFrame sorted by 'Date' whose first 10 (stat) columns hold the values
    of the team's previous game on that side. Rows with any missing value,
    including the first game (which has no predecessor), are dropped.
  """
  home_stat_cols = df.columns[:10]
  visit_stat_cols = df.columns[10:-3]
  # Drop the 6-character side prefix of the visit names; the same bare names
  # are reused for the home stats, matched by position.
  stat_names = [name[6:] for name in visit_stat_cols]

  # Only build the side that is actually returned.
  if is_home:
    wanted = home_stat_cols.tolist() + ['Date', 'Visit Team', 'Home Team']
    combine = df.loc[df['Home Team'] == team, wanted]
    combine = combine.rename(columns=dict(zip(home_stat_cols, stat_names)))
  else:
    combine = df.loc[df['Visit Team'] == team, df.columns[10:]]
    combine = combine.rename(columns=dict(zip(visit_stat_cols, stat_names)))

  combine = combine.reset_index(drop=True)

  # Everything except the last two columns (the team names in the home layout)
  # is coerced to numbers; unparsable entries become NaN and are dropped below.
  numeric_cols = combine.columns[:-2]
  combine[numeric_cols] = combine[numeric_cols].apply(pd.to_numeric, errors='coerce')

  # Stable sort keeps the file order for games sharing a date, so the lag
  # below is deterministic.
  combine = combine.sort_values('Date', kind='mergesort').reset_index(drop=True)
  # .copy() so the shifted columns are written to an independent frame rather
  # than to a filtered view.
  combine = combine[combine['Date'] < date_cutoff].copy()

  # Lag the stat columns by one game so each row carries the previous game's
  # values (presumably to avoid using same-game information -- verify intent).
  lagged_cols = combine.columns[:10]
  combine[lagged_cols] = combine[lagged_cols].shift(1)

  return combine.dropna().reset_index(drop=True)


In [None]:
def merge_df(base, df):
  """Inner-join per-game odds rows with one team's stat rows.

  Rows are matched on 'Date' and 'Home Team'. The last column of the joined
  frame is then discarded by position, and 'Visit Team_x' (the left-hand
  copy created by the suffixing of a shared 'Visit Team' column) is renamed
  back to 'Visit Team'.

  Args:
    base: Left frame; must contain 'Date' and 'Home Team'.
    df: Right frame; must contain 'Date' and 'Home Team'. When it also has a
      'Visit Team' column as its last non-key column, that duplicate is the
      column removed by the positional drop.

  Returns:
    The joined DataFrame without its last column.
  """
  joined = base.merge(df, on=['Date', 'Home Team'])

  # Positional drop: everything but the final column is kept.
  kept_columns = joined.columns[:-1]
  return joined[kept_columns].rename(columns={'Visit Team_x': 'Visit Team'})

# Import the Odds (Excel) and Stats (CSV) Data

In [None]:
# Single-season walkthrough (2018): read the raw odds workbook and the stats
# CSV from the runtime's /content folder.
odds_df = pd.read_excel(r"/content/mlb odds 2018.xlsx")
stats_df = pd.read_csv("/content/2018_stats.csv")

# Reshape the odds into one labelled row per game; INCLUDE_OU=True makes a
# total equal to the closing line count as an "over".
odds_clean = clean_odds(df=odds_df, INCLUDE_OU=True)

In [None]:
def merge_odds_stats(odds, stats):
  """Join cleaned odds with lagged home and visiting team stats, one row per game.

  For every team that appears as a home team in ``odds``, the team's lagged
  home-side and visit-side stat rows are built with ``clean_stats`` and joined
  to the odds with ``merge_df``. The visit-side and home-side results are then
  joined on the game columns so each game carries both teams' stats.

  Args:
    odds: Per-game odds frame with the columns 'Date', 'Home Team',
      'Visit Team', 'Home Close OU', 'Total Runs' and 'Over'.
    stats: Season stats frame in the layout ``clean_stats`` expects. Its team
      codes are translated to the odds codes on an internal copy; the caller's
      frame is not modified, so the function can be re-run on the same input.

  Returns:
    DataFrame sorted by 'Date' with the six game columns followed by the
    stat columns prefixed 'Visit ' and 'Home ', de-duplicated on
    ('Date', 'Home Team', 'Visit Team') with a fresh 0..n-1 index.
  """
  # Step 1: normalise the stats file's own codes (e.g. 'FLA' -> 'MIA').
  team_mapper = {'SF': 'SF', 'BOS': 'BOS', 'WSH': 'WSH', 'CIN': 'CIN', 'NYM': 'NYM', 'PIT': 'PIT', 'CHW': 'CHW', 'TEX': 'TEX', 'MIL': 'MIL', 'KC': 'KC', 'ATL': 'ATL', 'ARI': 'ARI', 'HOU': 'HOU', 'LAA': 'LAA', 'OAK': 'OAK', 'TB': 'TB', 'DET': 'DET', 'BAL': 'BAL', 'COL': 'COL', 'FLA': 'MIA', 'CHC': 'CHC', 'CLE': 'CLE', 'PHI': 'PHI', 'MIN': 'MIN', 'STL': 'STL', 'SD': 'SD', 'SEA': 'SEA', 'TOR': 'TOR', 'NYY': 'NYY', 'LAD': 'LAD', 'MIA': 'MIA'}
  # Step 2: convert the normalised codes to the codes used in the odds frame
  # (e.g. 'SF' -> 'SFO'). Codes missing from either dict become NaN.
  mapper = {'SF': 'SFO', 'CHC': 'CUB', 'WSH': 'WAS', 'KC': 'KAN', 'CHW': 'CWS', 'SD': 'SDG', 'TB': 'TAM', 'CIN': 'CIN', 'SEA': 'SEA', 'NYM': 'NYM', 'HOU': 'HOU', 'BAL': 'BAL', 'MIN': 'MIN', 'PIT': 'PIT', 'TEX': 'TEX', 'STL': 'STL', 'MIA': 'MIA', 'LAD': 'LAD', 'ARI': 'ARI', 'CLE': 'CLE', 'PHI': 'PHI', 'MIL': 'MIL', 'COL': 'COL', 'OAK': 'OAK', 'NYY': 'NYY', 'TOR': 'TOR', 'LAA': 'LAA', 'DET': 'DET', 'BOS': 'BOS', 'ATL': 'ATL'}

  # Translate on a copy. Writing into the caller's frame made a second call
  # with the same `stats` map the already-translated codes ('SFO', 'CUB', ...)
  # to NaN and silently lose those teams.
  stats = stats.copy()
  for side in ('Home Team', 'Visit Team'):
    stats[side] = stats[side].map(team_mapper).map(mapper)

  teams = odds['Home Team'].unique()
  # Column layout used for the empty fallback frames when there are no teams.
  columns = ['Date', 'Home Team', 'Visit Team', 'Home Close OU', 'Total Runs', 'Over', 'AVG 1', 'AVG 2', 'AVG 3', 'AVG 4', 'AVG 5', 'AVG 6', 'AVG 7', 'AVG 8', 'AVG 9', 'ERA']

  # Collect the per-team pieces and concatenate once: DataFrame.append was
  # removed in pandas 2.0 and re-copied the accumulated frame on every call.
  home_frames = []
  visit_frames = []
  for team in teams:
    home_frames.append(merge_df(odds, clean_stats(team, True, stats)))
    visit_frames.append(merge_df(odds, clean_stats(team, False, stats)))

  home_main_df = pd.concat(home_frames) if home_frames else pd.DataFrame(columns=columns)
  visit_main_df = pd.concat(visit_frames) if visit_frames else pd.DataFrame(columns=columns)

  game_keys = ['Date', 'Home Team', 'Visit Team', 'Home Close OU', 'Total Runs', 'Over']
  merged = pd.merge(visit_main_df, home_main_df, on=game_keys).sort_values('Date')
  merged = merged.drop_duplicates(subset=['Date', 'Home Team', 'Visit Team'])

  # Everything after the six game columns is a stat column; the join suffixed
  # the visit-side copies with _x (left) and the home-side copies with _y.
  stat_cols = visit_main_df.columns[6:]
  stat_mapper_visit = {f'{name}_x': f'Visit {name}' for name in stat_cols}
  stat_mapper_home = {f'{name}_y': f'Home {name}' for name in stat_cols}

  merged = merged.rename(columns=stat_mapper_visit)
  merged = merged.rename(columns=stat_mapper_home)

  return merged.reset_index(drop=True)

In [None]:
# Attach both teams' lagged stats to every 2018 game in the cleaned odds.
merged = merge_odds_stats(odds=odds_clean, stats=stats_df)

# Clean and Merge all the Data Sets

In [None]:
# Destination folder for the per-season files. It is created up front because
# DataFrame.to_csv does not create missing parent directories, so the loop
# would otherwise fail on a fresh runtime.
CLEAN_DATA_DIR = '/content/clean_data'
os.makedirs(CLEAN_DATA_DIR, exist_ok=True)

# Run the full pipeline for every season 2010-2019 and write one CSV per year.
for year in range(2010, 2020):
  print(year)
  odds_df = pd.read_excel(f"/content/mlb odds {year}.xlsx")
  stats_df = pd.read_csv(f"/content/{year}_stats.csv")
  odds_clean = clean_odds(odds_df, True)
  merged = merge_odds_stats(odds_clean, stats_df)
  merged.to_csv(f'{CLEAN_DATA_DIR}/clean_data_{year}.csv', index=False)
  



2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [None]:
# !zip -r /content/files.zip /content/clean_data


  adding: content/clean_data/ (stored 0%)
  adding: content/clean_data/clean_data_2011.csv (deflated 78%)
  adding: content/clean_data/clean_data_2016.csv (deflated 78%)
  adding: content/clean_data/clean_data_2019.csv (deflated 78%)
  adding: content/clean_data/clean_data_2010.csv (deflated 78%)
  adding: content/clean_data/clean_data_2018.csv (deflated 78%)
  adding: content/clean_data/clean_data_2015.csv (deflated 78%)
  adding: content/clean_data/clean_data_2013.csv (deflated 78%)
  adding: content/clean_data/clean_data_2014.csv (deflated 78%)
  adding: content/clean_data/clean_data_2012.csv (deflated 78%)
  adding: content/clean_data/clean_data_2017.csv (deflated 78%)
