# Imports

In [134]:
from glob import glob
from pybaseball import playerid_reverse_lookup
import pandas as pd
import numpy as np

# Functions

In [123]:
def gather_hits(df, pitch_types, hit_types):
    """
    df : <Pandas DataFrame Object>
    pitch_types : <List> Contains all the columns for the different pitch types.
    hit_types : <List> Contains all of the columns to store the whether or not a hit occured
        for some pitch type.
    
    Removes pitches from the players record where the pitch resulted in a ball.
    Hit_type records whether or not the pitch for a specific pitch_type resulted in a hit.
    Foul balls are not considered to be a succesful hit.
    
    returns : <Pandas DataFrame Object> Updated DataFrame.
    """
    for pitch_type, hit_type in zip(pitch_types, hit_types):
        # Only count a pitch against a player if it wasn't a ball, because otherwise its the pitcher's fault.
        df[pitch_type] = ((df[pitch_type] == 1) & ( ~df["description"].str.contains("ball"))) * 1
        df[hit_type] = ((df[pitch_type] == 1) & (df["description"].str.contains("hit"))) * 1
        
    return df

def calculate_percentage_hit(df, pitch_types, hit_types):
    """
    df : <Pandas DataFrame Object>
    pitch_types : <List> Contains all the columns for the different pitch types.
    hit_types : <List> Contains all of the columns to store the whether or not a hit occured
        for some pitch type.
    
    Calculates percentage hit chance for some pitch type. (hit_type / pitch_type) * 100
    Removes NaN values by replacing them with zero. A NaN value my occur if a player never
        was pitched a pitch_type (0/0 error).
    
    returns : <Pandas DataFrame Object> Updated DataFrame.
    """
    for pitch_type, hit_type in zip(pitch_types, hit_types):
        df[pitch_type.replace("pitch_type", "percentage_hit")] = (df[hit_type] / df[pitch_type]) * 100
    
    df = df.fillna(0)
    return df

def calculate_specialty_score(df, percentage_hit_types, weights):
    """
    df : <Pandas DataFrame Object>
    percentage_hit_types : <List> The column names containing percentage score for each pitch type.
    weights : <Dict> The corresponding weights for each pitch type.
    
    Sums up all of the percentage hits and applies their corresponding weights to get a specialty score
        statistic.
    
    returns : <Pandas DataFrame Object> Updated DataFrame.
    """
    df["speciality_score"] = 0
    for percentage_hit_type in percentage_hit_types:
        df["speciality_score"] += df[percentage_hit_type] * weights[percentage_hit_type]
    
    return df

def load_data(data_path):
    """
    data_path : <str> Path containing csv files with each season's batting data. Ex: ./data/*
    
    returns : <Pandas DataFrame Object> DataFrame containing concatenated data across all seasons
        found within 'data_path'.
    """
    all_data = [pd.read_csv(file_name, index_col=0) for file_name in glob(data_path)]
    return pd.concat(all_data, axis=0, sort=False)

# Loading all Statcast data

In [None]:
data_path = "./data/*"
combined_data = load_data(data_path)

# Data Processing

In [148]:
# Remove all instances where the pitch type was not recorded.
n = combined_data.dropna(axis=0, subset=["pitch_type"])

# Below removes random pitch types that consist of numbers that look like 1623_12312 and only occur once.
# This line wil give a warning. Just ignore it.
n["pitch_type"] = n["pitch_type"].apply(lambda x: x.replace("_",""))
n = n[~(n["pitch_type"].str.isdigit())]

# Add columns with a 1 column indicating pitch type.
n = pd.get_dummies(n, columns=["pitch_type"])

# Add column to sum to get number of plate appearances.
n["plate_appearance"] = 1

# Add player info to the batting data, matching on the batting id (key_mlbam).
n["key_mlbam"] = n["batter"]
player_info = playerid_reverse_lookup(n["batter"].values, key_type='mlbam')
n = pd.merge(n, player_info,on="key_mlbam", how="inner")
n["season"] = n["game_year"]

pitch_types = ['CH', 'CU', 'EP', 'FC', 'FF', 'FO', 'FS', 'FT', 'IN', 'KC', 'KN', 'PO', 'SC', 'SI', 'SL', 'UN']
pitch_types = ["pitch_type_" + pitch_type for pitch_type in pitch_types]
hit_types = [pitch_type.replace("pitch_type", "hit") for pitch_type in pitch_types]

z = gather_hits(n.copy(), pitch_types, hit_types)

pitch_type_aggregates = {pitch_type:"sum" for pitch_type in pitch_types}
hit_type_aggregates = {hit_type:"sum" for hit_type in hit_types}
aggregates = {"name_last":"first", "name_first":"first", "season":"first", "plate_appearance":"sum", **pitch_type_aggregates, **hit_type_aggregates}

z = z.groupby(["batter", "game_year"]).agg(aggregates)

# Only get players with 200 or more plate appearances.
z = z[z["plate_appearance"] >= 200]

percentage_hit = calculate_percentage_hit(z.copy(), pitch_types, hit_types)

percentage_hit_types = [pitch_type.replace("pitch_type", "percentage_hit") for pitch_type in pitch_types]
weights = {'percentage_hit_CH': 1, 'percentage_hit_CU': 1, 'percentage_hit_EP': 1, 
           'percentage_hit_FC': 1, 'percentage_hit_FF': 1, 'percentage_hit_FO': 1, 
           'percentage_hit_FS': 1, 'percentage_hit_FT': 1, 'percentage_hit_IN': 1, 
           'percentage_hit_KC': 1, 'percentage_hit_KN': 1, 'percentage_hit_PO': 1, 
           'percentage_hit_SC': 1, 'percentage_hit_SI': 1, 'percentage_hit_SL': 1, 
           'percentage_hit_UN': 1}

specialty_score = calculate_specialty_score(percentage_hit.copy(), percentage_hit_types, weights)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Gathering player lookup table. This may take a moment.


In [149]:
specialty_score

Unnamed: 0_level_0,Unnamed: 1_level_0,name_last,name_first,season,plate_appearance,pitch_type_CH,pitch_type_CU,pitch_type_EP,pitch_type_FC,pitch_type_FF,pitch_type_FO,...,percentage_hit_FT,percentage_hit_IN,percentage_hit_KC,percentage_hit_KN,percentage_hit_PO,percentage_hit_SC,percentage_hit_SI,percentage_hit_SL,percentage_hit_UN,speciality_score
batter,game_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
112526.0,2016.0,colon,bartolo,2016.0,232,3,15,0,7,84,0,...,10.000000,0.0,0.000000,0.000000,0.0,0.0,10.000000,10.000000,0.0,64.523810
120074.0,2016.0,ortiz,david,2016.0,2472,119,95,1,92,558,0,...,35.000000,0.0,34.615385,29.411765,0.0,0.0,32.330827,28.773585,0.0,326.366022
121347.0,2016.0,rodriguez,alex,2016.0,915,49,32,0,27,254,0,...,44.444444,0.0,22.222222,50.000000,0.0,0.0,24.390244,25.742574,0.0,257.829345
134181.0,2016.0,beltre,adrian,2016.0,2442,143,105,0,67,572,0,...,35.087719,0.0,20.000000,31.578947,0.0,0.0,34.640523,30.952381,0.0,320.151041
134181.0,2017.0,beltre,adrian,2017.0,1574,65,64,1,56,302,0,...,35.260116,0.0,30.769231,0.000000,0.0,0.0,28.888889,26.020408,0.0,253.117782
134181.0,2018.0,beltre,adrian,2018.0,1857,97,100,0,88,387,0,...,32.191781,0.0,13.793103,0.000000,0.0,0.0,33.620690,23.376623,0.0,256.634956
136860.0,2016.0,beltran,carlos,2016.0,2186,208,123,0,63,505,0,...,41.447368,0.0,26.923077,36.842105,0.0,0.0,40.404040,24.561404,0.0,329.686725
136860.0,2017.0,beltran,carlos,2017.0,2014,146,146,0,69,389,0,...,33.480176,0.0,20.370370,0.000000,0.0,0.0,28.571429,26.347305,0.0,270.778331
150029.0,2016.0,werth,jayson,2016.0,2785,124,128,1,80,689,3,...,33.816425,0.0,29.166667,0.000000,0.0,0.0,22.543353,25.539568,0.0,259.133880
150029.0,2017.0,werth,jayson,2017.0,1405,68,49,2,47,344,0,...,25.000000,0.0,35.714286,21.428571,0.0,0.0,19.101124,21.600000,0.0,225.119488
