# Imports

In [1]:
from glob import glob
from pybaseball import playerid_reverse_lookup
import pandas as pd
import numpy as np

# Functions

In [2]:
def gather_hits(df, pitch_types, hit_types):
    """
    df : <Pandas DataFrame Object>
    pitch_types : <List> Contains all the columns for the different pitch types.
    hit_types : <List> Contains all of the columns to store the whether or not a hit occured
        for some pitch type.
    
    Removes pitches from the players record where the pitch resulted in a ball.
    Hit_type records whether or not the pitch for a specific pitch_type resulted in a hit.
    Foul balls are not considered to be a succesful hit.
    
    returns : <Pandas DataFrame Object> Updated DataFrame.
    """
    for pitch_type, hit_type in zip(pitch_types, hit_types):
        # Only count a pitch against a player if it wasn't a ball, because otherwise its the pitcher's fault.
        df[pitch_type] = ((df[pitch_type] == 1) & ( ~df["description"].str.contains("ball"))) * 1
        df[hit_type] = ((df[pitch_type] == 1) & (df["description"].str.contains("hit"))) * 1
        
    return df

def calculate_percentage_hit(df, pitch_types, hit_types):
    """
    df : <Pandas DataFrame Object>
    pitch_types : <List> Contains all the columns for the different pitch types.
    hit_types : <List> Contains all of the columns to store the whether or not a hit occured
        for some pitch type.
    
    Calculates percentage hit chance for some pitch type. (hit_type / pitch_type) * 100
    Removes NaN values by replacing them with zero. A NaN value my occur if a player never
        was pitched a pitch_type (0/0 error).
    
    returns : <Pandas DataFrame Object> Updated DataFrame.
    """
    for pitch_type, hit_type in zip(pitch_types, hit_types):
        df[pitch_type.replace("pitch_type", "percentage_hit")] = (df[hit_type] / df[pitch_type]) * 100
    
    df = df.fillna(0)
    return df

def calculate_specialty_score(df, percentage_hit_types, weights):
    """
    df : <Pandas DataFrame Object>
    percentage_hit_types : <List> The column names containing percentage score for each pitch type.
    weights : <Dict> The corresponding weights for each pitch type.
    
    Sums up all of the percentage hits and applies their corresponding weights to get a specialty score
        statistic.
    
    returns : <Pandas DataFrame Object> Updated DataFrame.
    """
    df["speciality_score"] = 0
    for percentage_hit_type in percentage_hit_types:
        df["speciality_score"] += df[percentage_hit_type] * weights[percentage_hit_type]
    
    return df

def load_data(data_path):
    """
    data_path : <str> Path containing csv files with each season's batting data. Ex: ./data/*
    
    returns : <Pandas DataFrame Object> DataFrame containing concatenated data across all seasons
        found within 'data_path'.
    """
    all_data = [pd.read_csv(file_name, index_col=0) for file_name in glob(data_path)]
    return pd.concat(all_data, axis=0, sort=False)

# Loading all Statcast data

In [3]:
data_path = "./data/*"
combined_data = load_data(data_path)

  exec(code_obj, self.user_global_ns, self.user_ns)


# Data Processing

In [5]:
# Remove all instances where the pitch type was not recorded.
n = combined_data.dropna(axis=0, subset=["pitch_type"])

# Below removes random pitch types that consist of numbers that look like 1623_12312 and only occur once.
# This line wil give a warning. Just ignore it.
n["pitch_type"] = n["pitch_type"].apply(lambda x: x.replace("_",""))
n = n[~(n["pitch_type"].str.isdigit())]

# Add columns with a 1 column indicating pitch type.
n = pd.get_dummies(n, columns=["pitch_type"])

# Add column to sum to get number of plate appearances.
n["plate_appearance"] = 1

# Add player info to the batting data, matching on the batting id (key_mlbam).
n["key_mlbam"] = n["batter"]
player_info = playerid_reverse_lookup(n["batter"].values, key_type='mlbam')
n = pd.merge(n, player_info,on="key_mlbam", how="inner")
n["season"] = n["game_year"]

pitch_types = ['CH', 'CU', 'EP', 'FC', 'FF', 'FO', 'FS', 'FT', 'IN', 'KC', 'KN', 'PO', 'SC', 'SI', 'SL', 'UN']
pitch_types = ["pitch_type_" + pitch_type for pitch_type in pitch_types]
hit_types = [pitch_type.replace("pitch_type", "hit") for pitch_type in pitch_types]

z = gather_hits(n.copy(), pitch_types, hit_types)

pitch_type_aggregates = {pitch_type:"sum" for pitch_type in pitch_types}
hit_type_aggregates = {hit_type:"sum" for hit_type in hit_types}
aggregates = {"name_last":"first", "name_first":"first", "season":"first", "key_retro":"first", "key_bbref":"first", "plate_appearance":"sum", **pitch_type_aggregates, **hit_type_aggregates}

z = z.groupby(["batter", "game_year"]).agg(aggregates)

# Only get players with 200 or more plate appearances.
z = z[z["plate_appearance"] >= 200]

percentage_hit = calculate_percentage_hit(z.copy(), pitch_types, hit_types)

percentage_hit_types = [pitch_type.replace("pitch_type", "percentage_hit") for pitch_type in pitch_types]
weights = {'percentage_hit_CH': 1, 'percentage_hit_CU': 0, 'percentage_hit_EP': 1, 
           'percentage_hit_FC': 1, 'percentage_hit_FF': 1, 'percentage_hit_FO': 1, 
           'percentage_hit_FS': 1, 'percentage_hit_FT': 1, 'percentage_hit_IN': 0, 
           'percentage_hit_KC': 1, 'percentage_hit_KN': 1, 'percentage_hit_PO': 0, 
           'percentage_hit_SC': 1, 'percentage_hit_SI': 0, 'percentage_hit_SL': 1, 
           'percentage_hit_UN': 0}

specialty_score = calculate_specialty_score(percentage_hit.copy(), percentage_hit_types, weights)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Gathering player lookup table. This may take a moment.


In [6]:
specialty_score

Unnamed: 0_level_0,Unnamed: 1_level_0,name_last,name_first,season,key_retro,key_bbref,plate_appearance,pitch_type_CH,pitch_type_CU,pitch_type_EP,pitch_type_FC,...,percentage_hit_FT,percentage_hit_IN,percentage_hit_KC,percentage_hit_KN,percentage_hit_PO,percentage_hit_SC,percentage_hit_SI,percentage_hit_SL,percentage_hit_UN,speciality_score
batter,game_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
110029.0,2014.0,abreu,bobby,2014.0,abreb001,abreubo01,647,35,17,1,16,...,33.333333,0.0,25.000000,0.000000,0.0,0.0,36.764706,21.276596,0.0,192.675632
112526.0,2014.0,colon,bartolo,2014.0,colob001,colonba01,216,3,3,0,2,...,27.272727,0.0,0.000000,0.000000,0.0,0.0,25.000000,25.000000,0.0,68.728423
112526.0,2016.0,colon,bartolo,2016.0,colob001,colonba01,232,3,15,0,7,...,10.000000,0.0,0.000000,0.000000,0.0,0.0,10.000000,10.000000,0.0,54.523810
114739.0,2014.0,giambi,jason,2014.0,giamj001,giambja01,262,21,11,0,1,...,25.000000,0.0,0.000000,0.000000,0.0,0.0,26.666667,42.857143,0.0,126.498127
116338.0,2014.0,hunter,torii,2014.0,huntt001,hunteto01,2032,147,85,2,90,...,33.333333,0.0,38.709677,23.076923,0.0,100.0,37.500000,29.439252,0.0,404.288265
116338.0,2015.0,hunter,torii,2015.0,huntt001,hunteto01,1134,71,47,0,73,...,31.578947,0.0,37.500000,0.000000,0.0,0.0,31.428571,26.400000,0.0,215.907285
116380.0,2014.0,ibanez,raul,2014.0,ibanr001,ibanera01,1102,88,47,0,62,...,30.379747,0.0,22.222222,0.000000,0.0,0.0,33.846154,28.985507,0.0,195.411223
116539.0,2014.0,jeter,derek,2014.0,jeted001,jeterde01,2214,106,66,0,95,...,35.585586,0.0,16.666667,16.666667,0.0,0.0,27.272727,42.045455,0.0,283.376566
117244.0,2014.0,konerko,paul,2014.0,konep001,konerpa01,845,67,42,0,21,...,37.837838,0.0,36.363636,0.000000,0.0,0.0,24.000000,18.390805,0.0,204.944384
120074.0,2014.0,ortiz,david,2014.0,ortid001,ortizda01,2384,155,123,3,63,...,31.318681,0.0,13.793103,23.076923,0.0,0.0,32.352941,29.145729,0.0,220.300597


In [7]:
needed_stuff = specialty_score[["name_last", "name_first", "plate_appearance", "speciality_score"]]

In [8]:
needed_stuff.to_csv("2014-2018_specialty_score_with_ids.csv")

In [12]:
# specialty_score.to_csv("2014-2018_specialty_score_with_ids.csv")