In [1]:
import pandas as pd
import numpy as np
import pybaseball as bball
import seaborn as sns
import matplotlib.pyplot as plot
from scipy import stats
import sklearn

In [2]:
def load_fangraphs_data(start_year, end_year, team=True, player=True):
    '''
    Fetches Fangraphs batting data through pybaseball.

    Inputs:
      start_year: the first year of data you would like (int)
      end_year: the last year of data you would like (int)
      team: whether you want team data or not (bool)
      player: whether you want player data or not (bool)

    Output:
      (team_data, player_data) when both flags are True,
      only the team data when just team is True,
      only the player data when just player is True,
      None when neither flag is True

    '''
    # The flags are compared against True (not merely tested for truthiness),
    # so a value such as "yes" counts as "not requested".
    want_team = (team == True)
    want_player = (player == True)

    if not (want_team or want_player):
        return None
    if not want_player:
        return bball.team_batting(start_year, end_year)
    if not want_team:
        return bball.batting_stats(start_year, end_year)

    # Both requested: team data is fetched first, then player data.
    team_frame = bball.team_batting(start_year, end_year)
    player_frame = bball.batting_stats(start_year, end_year)
    return team_frame, player_frame

In [4]:
def plot_wRC_runs_regression(team_data):
    '''
    Plots team wRC against team runs scored for every season before 2020
    and prints the Pearson correlation between the two.

    Only an upper bound on the season is applied here; the earliest season
    shown is whatever team_data contains.

    Inputs:
      team_data: a Pandas dataframe with "Season", "wRC" and "R" columns

    Output:
      None (prints the result of scipy.stats.pearsonr and draws a
      seaborn regression plot)
    '''
    team_data_to_2019 = team_data[team_data["Season"] < 2020]
    print(stats.pearsonr(team_data_to_2019["wRC"], team_data_to_2019["R"]))
    # x, y and data are passed by keyword: recent seaborn releases reject
    # them as positional arguments, older ones accept the keyword form too.
    sns.regplot(x="wRC", y="R", data=team_data_to_2019)


In [5]:
def create_single_stat_df(player_data, statistic, first_year, last_year, min_PA):
    '''
    Creates a table of all tabulated individual instances of a given statistic
    within a given timeframe.

    Inputs:
      player_data: a Pandas dataframe with "Name", "Season" and "PA" columns
                   plus a column for the requested statistic
      statistic: the selected statistic you would like data on (str)
      first_year: the first year you would like data on (int)
      last_year: the last year you would like data on (int)
      min_PA: the minimum threshold of plate appearances in a season
              required to represent the statistic in the dataframe

    Output:
      df: a Pandas dataframe with one row per distinct player name (in order
          of first appearance in player_data). Column 0 holds the name and
          there is one column per year, labelled with the year as a string.
          A cell holds the statistic when the player has exactly one row
          for that season meeting min_PA, and the string "NA" otherwise.

    Raises:
      KeyError: if statistic is not a column of player_data
    '''
    # Fail loudly on a misspelled statistic instead of returning all "NA".
    if statistic not in player_data.columns:
        raise KeyError(statistic)

    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # the row order is the same on every run.
    player_names = list(dict.fromkeys(player_data["Name"].to_list()))

    columns = {0: player_names}
    for year in range(first_year, last_year + 1):
        qualified = player_data[(player_data["Season"] == year)
                                & (player_data["PA"] >= min_PA)]
        # A name that occurs more than once in a season is ambiguous and is
        # reported as "NA", so only names with a single row are kept.
        single_rows = qualified[~qualified["Name"].duplicated(keep=False)]
        lookup = dict(zip(single_rows["Name"], single_rows[statistic]))
        columns[str(year)] = [lookup.get(name, "NA") for name in player_names]

    return pd.DataFrame(columns)


In [None]:
def create_multi_stat_df(player_data, statistics, first_year, last_year, min_PA):
    '''
    Creates a table of all tabulated individual instances of the given statistics
    within a given timeframe.

    Inputs:
      player_data: a Pandas dataframe with "Name", "Season" and "PA" columns
                   plus a column for every requested statistic
      statistics: the selected statistics you would like data on (list of strings)
      first_year: the first year you would like data on (int)
      last_year: the last year you would like data on (int)
      min_PA: the minimum threshold of plate appearances in a season
              required to represent the statistic in the dataframe

    Output:
      df: a Pandas dataframe with one row per distinct player name (in order
          of first appearance in player_data). Column 0 holds the name and
          there is one column per (year, statistic) pair, labelled
          "<statistic>_<year>" and ordered by year, then by the order of
          statistics. A cell holds the statistic when the player has exactly
          one row for that season meeting min_PA, and the string "NA"
          otherwise.

    Raises:
      KeyError: if any requested statistic is not a column of player_data
    '''
    statistics = list(statistics)

    # Fail loudly on misspelled statistics instead of returning all "NA".
    missing = [stat for stat in statistics if stat not in player_data.columns]
    if missing:
        raise KeyError(missing)

    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # the row order is the same on every run.
    player_names = list(dict.fromkeys(player_data["Name"].to_list()))

    columns = {0: player_names}
    for year in range(first_year, last_year + 1):
        # The season filter does not depend on the statistic, so it is done
        # once per year rather than once per statistic.
        qualified = player_data[(player_data["Season"] == year)
                                & (player_data["PA"] >= min_PA)]
        # A name that occurs more than once in a season is ambiguous and is
        # reported as "NA", so only names with a single row are kept.
        single_rows = qualified[~qualified["Name"].duplicated(keep=False)]
        for stat in statistics:
            lookup = dict(zip(single_rows["Name"], single_rows[stat]))
            columns[f"{stat}_{year}"] = [lookup.get(name, "NA")
                                         for name in player_names]

    return pd.DataFrame(columns)