# A02. MLB API
Sources: 
- MLB Stats API
- Statcast (via pybaseball package)

### 1. MLB Stats API

In [4]:
# Read in boxscore for weather
def create_box(gamePk):
    """Return the weather, wind, venue and date strings from a game's boxscore.

    The boxscore's ``gameBoxInfo`` records are flattened into a table with
    ``label``/``value`` columns. Weather, wind and venue are looked up by
    label; the date is taken from the ``label`` of the last record
    (presumably the game date -- confirm against the API output).

    Args:
        gamePk: Game identifier passed to ``statsapi.boxscore_data``.

    Returns:
        Tuple ``(weather, wind, venue, date)``. Any entry that cannot be
        found is replaced by a hard-coded placeholder default.
    """
    # Read in boxscore as json
    box = pd.json_normalize(statsapi.boxscore_data(gamePk, timecode=None), record_path='gameBoxInfo')

    # Define default values
    default_weather = "75 degrees, Clear."
    default_wind = "0 mph, L To R."
    default_venue = "Missing Park."
    default_date = "November 30, 1993"

    # An empty gameBoxInfo yields a frame without these columns
    has_labels = 'label' in box.columns
    has_values = has_labels and 'value' in box.columns

    def _value_for(label, default):
        # First value recorded under `label`, or `default` when absent.
        # Taking the first match (rather than .item()) tolerates duplicates.
        if not has_values:
            return default
        matches = box.loc[box['label'] == label, 'value']
        return default if matches.empty else matches.iloc[0]

    # Extract weather, wind, venue, and date
    weather = _value_for("Weather", default_weather)
    wind = _value_for("Wind", default_wind)
    venue = _value_for("Venue", default_venue)

    if has_labels and len(box) > 0:
        date = box['label'].iloc[-1]
    else:
        date = default_date

    return weather, wind, venue, date

In [5]:
# Extract field or provide default (helper function)
def extract_field(data, field, default=None):
    """Return ``data[field]``, or ``default`` when the lookup is not possible.

    Args:
        data: Any subscriptable object (dict, list, ...) or ``None``.
        field: Key or index to look up.
        default: Value returned when the key/index is missing or ``data``
            cannot be subscripted with ``field``.

    Returns:
        The looked-up value, or ``default``.
    """
    try:
        return data[field]
    # LookupError covers KeyError/IndexError; TypeError covers None and other
    # non-subscriptable inputs. Unlike a bare except, this does not swallow
    # KeyboardInterrupt/SystemExit.
    except (LookupError, TypeError):
        return default

In [6]:
# Extract play-by-play data
def create_game(gamePk):
    """Build a play-by-play DataFrame for one game, one row per runner movement.

    Every entry in a play's ``runners`` list produces one row that repeats the
    play-level fields (inning, result, matchup, post-play base occupancy) and
    adds that runner's id, start/end base and movement reason. The boxscore
    weather, wind, venue and date from ``create_box`` are broadcast onto all
    rows, together with ``gamePk``.

    Args:
        gamePk: Game identifier passed to the Stats API.

    Returns:
        pandas DataFrame with the play/runner columns plus ``gamePk``,
        ``weather``, ``wind``, ``venue`` and ``date``.
    """
    pbp = statsapi.get('game_playByPlay', {'gamePk': gamePk})

    column_names = ['atBatIndex', 'inning', 'halfInning', 'outs', 'type', 'id', 'event', 'eventType', 'description',
                    'rbi', 'awayScore', 'homeScore', 'batter', 'batterName', 'batSide', 'pitcher',
                    'pitcherName', 'pitchHand', 'postOnFirst', 'postOnSecond', 'postOnThird', 'start', 'end', 'movementReason']

    rows = []
    for play in pbp['allPlays']:
        about = play['about']
        count = play['count']
        result = play['result']
        matchup = play['matchup']
        runners = play['runners']

        # Play-level fields placed before the runner id column
        leading = [about['atBatIndex'], about['inning'], about['halfInning'], count['outs'],
                   extract_field(result, 'type')]

        # Play-level fields placed after the runner id column
        trailing = [
            extract_field(result, 'event'),
            extract_field(result, 'eventType'),
            extract_field(result, 'description'),
            extract_field(result, 'rbi', 0),
            extract_field(result, 'awayScore', 0),
            extract_field(result, 'homeScore', 0),
            extract_field(matchup['batter'], 'id', 999999),
            extract_field(matchup['batter'], 'fullName', 'Missing Name'),
            extract_field(matchup['batSide'], 'code', 'R'),
            extract_field(matchup['pitcher'], 'id', 999999),
            extract_field(matchup['pitcher'], 'fullName', 'Missing Name'),
            extract_field(matchup['pitchHand'], 'code', 'R'),
            # Baserunners on base at the end of the play
            extract_field(matchup, 'postOnFirst', None),
            extract_field(matchup, 'postOnSecond', None),
            extract_field(matchup, 'postOnThird', None),
        ]

        # One output row per runner movement within the play
        for runner in runners:
            details = runner['details']
            movement = runner['movement']

            runner_id = details['runner']['id']
            base_path = [movement['start'], movement['end'], details['movementReason']]

            rows.append(leading + [runner_id] + trailing + base_path)

    df = pd.DataFrame(rows, columns=column_names)

    # Game-level context is constant across all rows
    weather, wind, venue, date = create_box(gamePk)
    game_level = {'gamePk': gamePk, 'weather': weather, 'wind': wind, 'venue': venue, 'date': date}
    for column, value in game_level.items():
        df[column] = value

    return df

In [7]:
# Extract API data
def plays_statsapi(start_date, end_date):
    """Download play-by-play data for every scheduled game in a date range.

    The schedule is read with ``statsapi.schedule``; each game is fetched with
    ``create_game`` (in parallel) and tagged with its schedule fields.

    Args:
        start_date: First date of the range, passed to ``statsapi.schedule``.
        end_date: Last date of the range, passed to ``statsapi.schedule``.

    Returns:
        A single DataFrame with all games stacked row-wise, with
        ``away_name``, ``home_name``, ``game_date``, ``game_type`` and
        ``venue_id`` added. An empty DataFrame is returned when the schedule
        has no games (``pd.concat`` would otherwise raise on an empty list).
    """
    # Read in schedule
    games = statsapi.schedule(start_date=start_date, end_date=end_date)
    if not games:
        return pd.DataFrame()

    # Pull the schedule fields up front so a malformed schedule entry fails
    # before any game is downloaded
    schedule_fields = ('away_name', 'home_name', 'game_date', 'game_type', 'venue_id')
    game_ids = [game['game_id'] for game in games]
    schedule_info = [{field: game[field] for field in schedule_fields} for game in games]

    # Run all in parallel
    df_list = Parallel(n_jobs=-1, verbose=0)(delayed(create_game)(gamePk=game_id) for game_id in game_ids)

    # Add additional information from schedule (results come back in input order)
    for game_df, info in zip(df_list, schedule_info):
        for field, value in info.items():
            game_df[field] = value

    # Append all dataframes together
    return pd.concat(df_list, axis=0)

### 2. Statcast

In [8]:
# Extract Statcast data
def plays_statcast(start_date, end_date):
    """Download Statcast pitches and reduce them to one row per at bat.

    Args:
        start_date: First date of the range, passed to pybaseball ``statcast``.
        end_date: Last date of the range, passed to pybaseball ``statcast``.

    Returns:
        DataFrame holding the last pitch of each (gamePk, atBatIndex) with the
        batted-ball/value columns in ``keep_list`` plus ``maxSpeed`` and
        ``maxSpin`` (the at-bat maxima of ``effective_speed`` and
        ``release_spin_rate``).
    """
    # Use pybaseball to read in Statcast data
    data = statcast(start_date, end_date)

    # Create atBatIndex compatible with Statsapi (Statcast counts from 1)
    data['atBatIndex'] = data['at_bat_number'] - 1

    # Highest level during the at bat. The string alias 'max' is used because
    # passing the builtin `max` to transform is deprecated in recent pandas.
    at_bat_keys = ['game_pk', 'atBatIndex']
    data['maxSpeed'] = data.groupby(at_bat_keys)['effective_speed'].transform('max')
    data['maxSpin'] = data.groupby(at_bat_keys)['release_spin_rate'].transform('max')

    # Convert to numeric for sorting
    for column in ('game_pk', 'atBatIndex', 'pitch_number'):
        data[column] = data[column].astype('int')

    # Only want the deciding (last) pitch
    data.sort_values(['game_pk', 'atBatIndex', 'pitch_number'], inplace=True)
    data.drop_duplicates(at_bat_keys, keep='last', inplace=True)

    data.rename(columns={'game_pk': 'gamePk'}, inplace=True)

    # Keep relevant variables
    keep_list = ['gamePk', 'atBatIndex', 'pitch_number', 'pitch_name', 'game_type',
                 'hc_x', 'hc_y', 'hit_location', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'launch_speed_angle',
                 'woba_value', 'woba_denom', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
                 'iso_value', 'babip_value',
                 'maxSpeed', 'maxSpin']

    return data[keep_list]

### 3. Dataset 

In [9]:
# Merge together datasets in year range
def dataset(engine, start_year, end_year):
    """Load and merge the yearly Stats API and Statcast tables.

    For each year in ``[start_year, end_year]`` the tables ``'Stats API {year}'``
    and ``'Statcast {year}'`` are read from the database and left-merged on
    ``gamePk``/``atBatIndex``. All years are then stacked, sorted and reduced
    to one row per at bat (the last row of each at bat is kept).

    Args:
        engine: Database connectable passed to ``pd.read_sql_table``.
        start_year: First year to load (inclusive).
        end_year: Last year to load (inclusive).

    Returns:
        Combined DataFrame with an integer ``date`` column (``game_date``
        without dashes). An empty DataFrame when the year range is empty.
    """
    yearly_frames = []

    # Iterate through the range of years
    for year in range(start_year, end_year + 1):
        # Load tables from the database for the current year
        statsapi_df = pd.read_sql_table(f'Stats API {year}', engine)
        statcast_df = pd.read_sql_table(f'Statcast {year}', engine)

        # Merge the two dataframes based on 'gamePk' and 'atBatIndex'
        yearly_frames.append(pd.merge(statsapi_df, statcast_df, on=['gamePk', 'atBatIndex'], how='left'))

    if not yearly_frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing (and re-cleaning) the frame on every iteration
    df = pd.concat(yearly_frames, ignore_index=True)

    # Create date variable (without dashes) and convert keys to numeric for sorting
    df['date'] = df['game_date'].str.replace('-', '').astype('int')
    df['gamePk'] = df['gamePk'].astype('int')
    df['atBatIndex'] = df['atBatIndex'].astype('int')

    # Sort
    df.sort_values(['game_date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    # Only keep one observation per at bat
    df.drop_duplicates(['gamePk', 'atBatIndex'], keep='last', inplace=True)

    # Return the combined dataframe
    return df

##### Weather

In [10]:
# Positive to centerfield, negative from centerfield
def y_vect(df):
    """Return the wind component toward center field for one row.

    Positive values blow out toward center field, negative values blow in
    from center field. Diagonal (LF/RF) winds contribute
    ``windSpeed / 2 * sqrt(2)``; crosswinds and unrecognised directions
    contribute 0.

    Args:
        df: A row-like mapping with ``windSpeed`` and ``windDirection``.

    Returns:
        The signed wind component (0 when there is none).
    """
    speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)
    direction = df['windDirection']

    if direction == "Out To CF":
        return speed
    if direction in ("Out To RF", "Out To LF"):
        return diagonal
    if direction in ("In From LF", "In From RF"):
        return diagonal * -1
    if direction == "In From CF":
        return speed * - 1

    # "L To R", "R To L" and anything unrecognised have no in/out component
    return 0

# Positive from left to right, negative from right to left
def x_vect(df):
    """Return the left-to-right wind component for one row.

    Positive values blow from left to right, negative values from right to
    left. Diagonal (LF/RF) winds contribute ``windSpeed / 2 * sqrt(2)``;
    straight in/out winds and unrecognised directions contribute 0.

    Args:
        df: A row-like mapping with ``windSpeed`` and ``windDirection``.

    Returns:
        The signed wind component (0 when there is none).
    """
    speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)
    direction = df['windDirection']

    if direction == "L To R":
        return speed
    if direction == "R To L":
        return speed * - 1
    if direction in ("In From LF", "Out To RF"):
        return diagonal
    if direction in ("In From RF", "Out To LF"):
        return diagonal * -1

    # "In From CF", "Out To CF" and anything unrecognised have no crosswind
    return 0

# 2 is to centerfield, 6 is from centerfield, clockwise
# Assumption: the wind blows in one of 8 compass directions (45 degrees apart), so a diagonal wind splits into equal x and y components via a right isosceles triangle

In [11]:
def clean_weather(df):
    """Parse the raw ``weather`` and ``wind`` strings into model columns.

    ``weather`` (e.g. ``"75 degrees, Clear."``) is split into an integer
    ``temperature`` and the remaining text, which overwrites ``weather``.
    ``wind`` (e.g. ``"0 mph, L To R."``) is split into a numeric ``windSpeed``
    and a ``windDirection`` with periods removed. ``x_vect`` and ``y_vect``
    are then computed row by row.

    A missing speed becomes 0 and a missing direction becomes ``"L To R"``,
    the same label ``x_vect`` recognises and ``create_box`` uses as its
    default.

    Args:
        df: DataFrame with string columns ``weather`` and ``wind``. It is
            modified in place.

    Returns:
        The same DataFrame with ``temperature``, ``windSpeed``,
        ``windDirection``, ``x_vect`` and ``y_vect`` added.
    """
    # Split on the first ", " only and force exactly two object columns, so a
    # missing or extra separator cannot break the two-column unpacking
    weather_parts = df['weather'].str.split(", ", n=1, expand=True).reindex(columns=[0, 1]).astype(object)
    df['temperature'] = weather_parts[0].str.replace(" degrees", "", regex=False).astype('int')
    df['weather'] = weather_parts[1]

    # Separate wind into speed and direction
    wind_parts = df['wind'].str.split(", ", n=1, expand=True).reindex(columns=[0, 1]).astype(object)

    # Plain assignment instead of chained fillna(inplace=True), which does not
    # write back to the frame under pandas copy-on-write
    wind_speed = wind_parts[0].fillna("0 mph").str.replace(" mph", "", regex=False)
    df['windSpeed'] = pd.to_numeric(wind_speed, errors='coerce').fillna(0)

    # regex=False: "." must be a literal period, not the regex wildcard
    df['windDirection'] = wind_parts[1].fillna('L To R').str.replace(".", "", regex=False)

    # Calculate vectors
    df['x_vect'] = df.apply(x_vect, axis=1)
    df['y_vect'] = df.apply(y_vect, axis=1)

    return df

##### Model Inputs

In [12]:
# Assign play categories to full descriptions
def create_events(df):
    """Add an ``eventsModel`` column with the modelled outcome category.

    Each full event description in ``event`` is mapped to a short category
    code; events that are not listed (or are missing) are labelled ``'Cut'``.

    Args:
        df: DataFrame with an ``event`` column. It is modified in place.

    Returns:
        The same DataFrame with ``eventsModel`` added.
    """
    # Category code -> full event descriptions that belong to it
    events_by_category = {
        'so': ('Strikeout', 'Strikeout Double Play'),
        'go': ('Groundout', 'Fielders Choice', 'Double Play', 'Grounded Into DP',
               'Triple Play', 'Field Error', 'Forceout'),
        'lo': ('Lineout', 'Bunt Lineout'),
        'fo': ('Flyout', 'Sac Fly', 'Sac Fly Double Play'),
        'po': ('Pop Out', 'Bunt Pop Out'),
        'hbp': ('Hit By Pitch',),
        'bb': ('Walk', 'Intent Walk'),
        'b1': ('Single',),
        'b2': ('Double',),
        'b3': ('Triple',),
        'hr': ('Home Run',),
    }

    # Flatten to the event -> category lookup used by Series.map
    category_for_event = {
        event: category
        for category, events in events_by_category.items()
        for event in events
    }

    categories = df['event'].map(category_for_event)
    df['eventsModel'] = categories.fillna('Cut')
    return df

In [13]:
# This turns several variables, including events, venues, hands, and bases into dummies
def create_dummies(df):
    """Add dummy variables and pre-plate-appearance game state.

    Adds dummies for events, venues, pitcher/batter hands and years; an
    integer ``date``; runner-on-base flags taken from the previous row of the
    same half inning; a top-of-inning flag; the score before the plate
    appearance and the batting team's score differential; and ``pa``/``ab``
    indicators.

    Args:
        df: DataFrame with ``eventsModel``, ``venue_id``, ``pitchHand``,
            ``batSide``, ``game_date``, ``gamePk``, ``atBatIndex``,
            ``inning``, ``halfInning``, ``postOnFirst``/``Second``/``Third``,
            ``awayScore`` and ``homeScore``. A ``year`` column is added to it
            in place.

    Returns:
        A new DataFrame sorted by ``date``, ``gamePk``, ``atBatIndex``.
    """
    # Year of the game (first four characters of game_date)
    df['year'] = df['game_date'].str[:4]

    # Events, venues, hands and years as dummies
    dummy_frames = [
        pd.get_dummies(df['eventsModel']),
        pd.get_dummies(df['venue_id'], prefix='venue'),
        pd.get_dummies(df['pitchHand'], prefix='p'),
        pd.get_dummies(df['batSide'], prefix='b'),
        pd.get_dummies(df['year'], prefix='year'),
    ]

    # Add dummies to dataframe
    df = pd.concat([df] + dummy_frames, axis=1)

    # Create compatible date variable and convert keys to numeric for sorting
    df['date'] = df['game_date'].str.replace('-', '').astype('int')
    df['gamePk'] = df['gamePk'].astype('int')
    df['atBatIndex'] = df['atBatIndex'].astype('int')

    # Sort once; every shift below relies on this order and nothing after it
    # reorders the rows
    df.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    # Runners on base before the play = runners left on base by the previous
    # row of the same half inning
    half_inning_keys = ['gamePk', 'inning', 'halfInning']
    bases = ('First', 'Second', 'Third')
    for base in bases:
        df[f'preOn{base}'] = df.groupby(half_inning_keys)[f'postOn{base}'].shift(1)
    for base in bases:
        # Occupied when the stored value is a string containing 'id'
        df[f'on{base}'] = df[f'preOn{base}'].apply(lambda x: 1 if isinstance(x, str) and 'id' in x else 0)

    # Top of the inning dummy
    df['top'] = np.where(df['halfInning'] == "top", 1, 0)

    # Convert to numeric
    df['awayScore'] = df['awayScore'].astype('int')
    df['homeScore'] = df['homeScore'].astype('int')

    # Determine score before PA; the first row of a game has no previous row, so use 0
    df['preAwayScore'] = df.groupby(['gamePk'])['awayScore'].shift(1).fillna(0)
    df['preHomeScore'] = df.groupby(['gamePk'])['homeScore'].shift(1).fillna(0)

    # Calculate differential from the batting team's perspective
    df['score_diff'] = np.where(df['top'] == 1, df['preAwayScore'] - df['preHomeScore'], df['preHomeScore'] - df['preAwayScore'])

    # Calculate PAs and ABs. A dummy column only exists when that event occurs
    # in the data, so a missing 'hbp'/'bb' column counts as zero.
    df['pa'] = np.where(df['eventsModel'] != "Cut", 1, 0)
    df['ab'] = df['pa'] - df.get('hbp', 0) - df.get('bb', 0)

    return df

In [14]:
# Create useful Statcast variables
def clean_statcast(df):
    """Derive batted-ball indicator columns from the raw Statcast fields.

    Coerces ``launch_speed``, ``launch_speed_angle``, ``hc_x`` and ``hc_y`` to
    numeric (unparseable values become NaN), then adds ``hard_hit``
    (launch speed of at least 95), ``barrel`` (launch_speed_angle equal to 6),
    ``spray_angle`` and the ``to_left``/``to_middle``/``to_right`` flags split
    at -15 and +15 degrees.

    Args:
        df: DataFrame with the four Statcast columns above. It is modified
            in place.

    Returns:
        The same DataFrame with the derived columns added.
    """
    # Convert variables to numeric
    for column in ('launch_speed', 'launch_speed_angle', 'hc_x', 'hc_y'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Hard hit and barrel dummies (NaN compares False, giving 0)
    df['hard_hit'] = df['launch_speed'].ge(95).astype('int')
    df['barrel'] = df['launch_speed_angle'].eq(6).astype('int')

    # Spray angle in degrees from the hit coordinates, scaled by 0.75
    horizontal = df['hc_x'] - 125.42
    depth = 198.27 - df['hc_y']
    spray = np.arctan(horizontal / depth) * 180 / np.pi * 0.75
    df['spray_angle'] = spray

    # Direction buckets; the middle bucket includes both boundaries
    df['to_left'] = spray.lt(-15).astype('int')
    df['to_middle'] = spray.between(-15, 15).astype('int')
    df['to_right'] = spray.gt(15).astype('int')

    return df

In [15]:
# Read in park factors to adjust stats
def read_park_factors():
    """Load park factors keyed by venue id and batter side.

    Reads the ``'Statcast Park Factors'`` and ``'Team Map'`` tables through
    the module-level ``engine``, attaches each team's venue id, and rescales
    the factor columns by dividing by 100.

    Returns:
        DataFrame with ``venue_id`` (as string), ``batSide`` and the factor
        columns, sorted by ``venue_id`` and ``batSide``.
    """
    factor_columns = ['Park Factor', '1B', '2B', '3B', 'HR', 'BB', 'SO']

    # Read in park factors and tidy the team names used as the join key
    factors = pd.read_sql_table('Statcast Park Factors', engine)
    factors['Team'] = factors['Team'].str.strip()

    # Read in team_map and keep only what is needed to look up the venue id
    team_map = pd.read_sql_table('Team Map', engine)
    venue_lookup = team_map[['FANGRAPHSTEAM', 'VENUE_ID']]

    # Merge with team map to get venue ID
    factors = factors.merge(venue_lookup, left_on='Team', right_on='FANGRAPHSTEAM', how='inner')
    factors = factors.rename(columns={'VENUE_ID': 'venue_id'})

    # Keep relevant variables
    factors = factors[['venue_id', 'batSide', 'Park Factor', '1B', '2B', '3B', 'HR', 'BB', 'SO']]

    # Convert to mean of 1, not 100
    for column in factor_columns:
        factors[column] = factors[column].astype('int').div(100)

    # Venue id as string, the merge key type used by park_adjustments
    factors['venue_id'] = factors['venue_id'].astype('str')

    # Sort
    factors = factors.sort_values(['venue_id', 'batSide'])

    return factors

In [67]:
# Adjust for park factors
def park_adjustments(df):
    """Divide the outcome dummies by the matching park factors.

    Park factors are left-merged on ``venue_id`` and ``batSide``; rows with no
    matching park get a factor of 1 (no adjustment). The merge indicator
    column ``_merge`` is kept in the result.

    Args:
        df: DataFrame with ``venue_id``, ``batSide`` and the outcome columns
            ``b1``, ``b2``, ``b3``, ``hr``, ``bb`` and ``so``. Its
            ``venue_id`` column is cast to string in place.

    Returns:
        A new merged DataFrame with the outcome columns park-adjusted.
    """
    factor_columns = ['Park Factor', '1B', '2B', '3B', 'HR', 'BB', 'SO']
    # Outcome column -> park factor column it is divided by
    stat_to_factor = (
        ('b1', '1B'),
        ('b2', '2B'),
        ('b3', '3B'),
        ('hr', 'HR'),
        ('bb', 'BB'),
        ('so', 'SO'),
    )

    # Read in park factors
    park_factors = read_park_factors()

    # Convert to string so the key matches the park factor table
    df['venue_id'] = df['venue_id'].astype('str')

    # Merge with park factors
    df = df.merge(park_factors, on=['venue_id', 'batSide'], how='left', indicator=True)

    # Old/other parks get all 1s
    df[factor_columns] = df[factor_columns].fillna(1)

    # Adjust stats by park factor
    for stat, factor in stat_to_factor:
        df[stat] = df[stat].astype('float') / df[factor].astype('float')

    return df

In [91]:
# This will return a dataframe that can eventually be used as the model input. Has pitcher vs hitter stats, specific to hand
def rolling_pas(df, pa_num):
    """Compute rolling batter and pitcher stats over the previous ``pa_num`` rows.

    Batter stats are grouped by (batter, pitchHand) and pitcher stats by
    (pitcher, batSide). Each window is shifted by one row so the current
    plate appearance is excluded, and uses ``min_periods=1``.

    Relies on names that are not defined in this function: ``avg_list``,
    ``max_list``, ``batter_avg_short``, ``batter_max_short``,
    ``pitcher_avg_short`` and ``pitcher_max_short`` (presumably module-level
    lists of column names, with the ``*_short`` lists matching ``avg_list`` /
    ``max_list`` position by position -- confirm where they are defined).

    Args:
        df: Input DataFrame; it is copied, not modified. Must already contain
            a ``date`` column, because the first sort uses it before ``date``
            is rebuilt from ``game_date`` further down.
        pa_num: Rolling window length, in rows per group.

    Returns:
        A copy of ``df`` with the rolling columns, ``ab_b``/``pa_b``/``ab_p``/
        ``pa_p`` window sums, ``imp_b``/``imp_p`` flags and the derived
        ``woba``/``slg``/``obp``/``iso`` columns for batter (``_b``) and
        pitcher (``_p``).
    """
    # Copy dataframe
    df_copy = df.copy()
    
    # Note: batter_avg_short will work even when pa_num refers to the "long" period. Suffix will be added in post.
    # Rename for compatibility purposes
    df_copy.rename(columns={'hit_distance_sc':'totalDistance', 'launch_speed':'launchSpeed'}, inplace=True)          
            
    # Convert to numeric and fill with 0s
    # NOTE(review): the fill with 0 only happens for columns that were not
    # already numeric; NaNs in numeric columns are left as-is -- confirm intended
    combined_list = avg_list + max_list
    for col in combined_list:
        # Check if the column is not numeric
        if not pd.api.types.is_numeric_dtype(df_copy[col]):
            # Convert the non-numeric column to numeric and fill missing values with 0
            df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
            df_copy[col] = df_copy[col].fillna(0)

    # Sort chronologically; the shifted rolling windows below depend on row order
    df_copy.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)
            
    # Data types may vary. This makes grouping impossible. 
    df_copy['batter'] = df_copy['batter'].astype('int')
    df_copy['pitcher'] = df_copy['pitcher'].astype('int')
        
    ### Batter stats 
    # Stats for which you want the average 
    # shift() drops the current row from its own window
    df_copy[batter_avg_short] = df_copy.groupby(['batter', 'pitchHand'])[avg_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).mean())
    # Stats for which you want the maximum
    df_copy[batter_max_short] = df_copy.groupby(['batter', 'pitchHand'])[max_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).max())
    # Stats for which you just want the sum 
    df_copy[['ab_b', 'pa_b']] = df_copy.groupby(['batter', 'pitchHand'])[['ab', 'pa']].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
                
    ### Pitcher stats
    # Stats for which you want the average
    df_copy[pitcher_avg_short] = df_copy.groupby(['pitcher', 'batSide'])[avg_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).mean())
    # Stats for which you want the maximum
    df_copy[pitcher_max_short] = df_copy.groupby(['pitcher', 'batSide'])[max_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).max())
    # Stats for which you just want the sum 
    df_copy[['ab_p', 'pa_p']] = df_copy.groupby(['pitcher', 'batSide'])[['ab', 'pa']].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
                
    # Create imputation flags (these observations will have imputed inputs)
    # Flag is 1 when fewer than 40 PAs fall in the window
    df_copy['imp_b'] = (df_copy['pa_b'] < 40).astype('int')
    df_copy['imp_p'] = (df_copy['pa_p'] < 40).astype('int')

    # Create compatible date variable
    df_copy['date'] = df_copy['game_date'].str.replace('-', '')
    
    # Convert to numeric for sorting
    df_copy['date'] = df_copy['date'].astype('int')
    df_copy['gamePk'] = df_copy['gamePk'].astype('int')
    df_copy['atBatIndex'] = df_copy['atBatIndex'].astype('int')
    df_copy['batter'] = df_copy['batter'].astype('int')
    df_copy['pitcher'] = df_copy['pitcher'].astype('int')
    
    
    # Sort
    df_copy.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    ### Advanced stats
    # wOBA - using 2022 values throughout
    # Linear-weighted sum of the *_b / *_p outcome columns
    df_copy['woba_b'] = (0.690 * df_copy['bb_b']) + (0.721 * df_copy['hbp_b']) + (0.885 * df_copy['b1_b']) + (1.262 * df_copy['b2_b']) + (1.601 * df_copy['b3_b']) + (2.070 * df_copy['hr_b'])
    df_copy['woba_p'] = (0.690 * df_copy['bb_p']) + (0.721 * df_copy['hbp_p']) + (0.885 * df_copy['b1_p']) + (1.262 * df_copy['b2_p']) + (1.601 * df_copy['b3_p']) + (2.070 * df_copy['hr_p'])
    
    # Slugging
    # NOTE(review): the numerator uses the *_b / *_p columns (presumably the
    # rolling means computed above) while the denominator is a rolling SUM of
    # at bats, and it can be 0 -- confirm the units and the zero-AB case
    df_copy['slg_b'] = (1 * df_copy['b1_b']) + (2 * df_copy['b2_b']) + (3 * df_copy['b3_b']) + (4 * df_copy['hr_b'])
    df_copy['slg_b'] = df_copy['slg_b'] / df_copy['ab_b']
    df_copy['slg_p'] = (1 * df_copy['b1_p']) + (2 * df_copy['b2_p']) + (3 * df_copy['b3_p']) + (4 * df_copy['hr_p'])
    df_copy['slg_p'] = df_copy['slg_p'] / df_copy['ab_p']

    # OBP    
    # Row-wise sum of the on-base outcome columns
    df_copy['obp_b'] = df_copy[['b1_b', 'b2_b', 'b3_b', 'hr_b', 'bb_b', 'hbp_b']].sum(axis=1)
    df_copy['obp_p'] = df_copy[['b1_p', 'b2_p', 'b3_p', 'hr_p', 'bb_p', 'hbp_p']].sum(axis=1)
    
    # ISO
    # Extra bases only: 1 per double, 2 per triple, 3 per home run
    df_copy['iso_b'] = df_copy['b2_b'] * 1 + df_copy['b3_b'] * 2 + df_copy['hr_b'] * 3
    df_copy['iso_p'] = df_copy['b2_p'] * 1 + df_copy['b3_p'] * 2 + df_copy['hr_p'] * 3


        
    return df_copy

In [18]:
# Creates model inputs
def create_inputs(start_year, end_year, short=50, long=300):
    """Build the complete model-input dataset for a range of years.

    Loads the merged data through the module-level ``engine``, runs the
    cleaning steps in order, then computes rolling stats over a short and a
    long window. The long-window columns get a ``_long`` suffix and are
    limited to ``batter_stats_long + pitcher_stats_long`` (names defined
    outside this function).

    Args:
        start_year: First year to load (inclusive).
        end_year: Last year to load (inclusive).
        short: Window length for the short rolling stats.
        long: Window length for the long rolling stats.

    Returns:
        DataFrame with the short-window frame and the selected long-window
        columns side by side, sorted by ``date``, ``gamePk``, ``atBatIndex``.
    """
    # Read in raw data, then apply each preparation step in sequence:
    # weather -> PA events -> dummies -> Statcast variables -> park factors
    prepared = dataset(engine, start_year, end_year)
    pipeline = (clean_weather, create_events, create_dummies, clean_statcast, park_adjustments)
    for step in pipeline:
        prepared = step(prepared)

    ### Rolling stats
    short_window = rolling_pas(prepared, short)
    long_window = rolling_pas(prepared, long).add_suffix("_long")

    # From the long window we only need the rolling stats
    long_window = long_window[batter_stats_long + pitcher_stats_long]

    # Dataset
    complete_dataset = pd.concat([short_window, long_window], axis=1)

    # Sort
    complete_dataset.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    return complete_dataset

In [96]:
# %run "U1. Imports.ipynb"
# %run "U2. Utilities.ipynb"
# %run "U3. Classes.ipynb"
# %run "D3. Simulation Functions.ipynb"

# baseball_path = r'C:\Users\james\Documents\MLB\Database'

In [2]:
# db_path = r'C:\Users\james\Documents\MLB\Database\MLBDB.db'
# engine = create_engine(f'sqlite:///{db_path}')

In [19]:
# df = dataset(engine, 2015, 2023)

In [20]:
# df2 = clean_weather(df)

In [21]:
# df3 = create_events(df2)

In [22]:
# df4 = create_dummies(df3)

In [23]:
# df5 = clean_statcast(df4)   

In [82]:
# df6 = park_adjustments(df5)

In [92]:
# df_short = rolling_pas(df6, 50)

In [95]:
# df_short.query("batterName == 'Rafael Devers'").query('date >= 20220930').query('pitchHand == "L"')[['date', 'batterName', 'batter', 'pitchHand', 'venue_id', '1B', 'b1', 'b1_b', 'hr', 'hr_b']].head(50)

In [None]:
# df_short[batter_stats_short].tail(5)

In [None]:
# df_long = rolling_pas(df6, 300)

In [94]:
# df_long = df_long.add_suffix("_long")
# df_long[batter_stats_long].tail(5)

In [None]:
# complete_dataset = pd.concat([df_short, df_long], axis=1)

In [None]:
# complete_dataset[batter_stats_short + batter_stats_long].tail()

In [None]:
# complete_dataset[['b1_b', 'b1_b_long']].tail()

In [None]:
# df_short[batter_stats_short].tail()