In [1]:
import pandas as pd
import numpy as np
import os
import unidecode
import datetime
from datetime import date
import warnings
warnings.simplefilter(action="ignore")

from plotnine import *

# Root data folder; the loaders below read from its "Player Sims",
# "Online Projections" and "Results" subfolders.
# Set the MLB_DATA_PATH environment variable to point at a different location;
# the original hard-coded path remains the fallback so existing setups still work.
baseball_path = os.environ.get("MLB_DATA_PATH", r"C:\Users\james\Documents\MLB\Data")

In [2]:
# Start from the current date as a "YYYY-MM-DD" string ...
todaysdate = date.today().isoformat()

# ... then pin the notebook to a specific slate (this override wins).
todaysdate = "2022-10-01"

# File names derived from the active date. The dash-stripping step below is
# disabled, so all three module-level names carry the dashed date.
dff_name = f"DFF_MLB_cheatsheet_{todaysdate}.csv"  # Daily Fantasy Fuel file
# todaysdate = todaysdate.replace("-", "")
result_name = f"Results {todaysdate}.csv"
sims_name = f"Player_Sims_{todaysdate}.csv"

In [3]:
# Clean names for consistency
def name_clean(df):
    """Add normalised name columns that serve as a merge key between sources.

    The columns are assigned onto ``df`` itself, and ``df`` is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added:

        * ``Name_Adjusted`` - ``Name`` passed through ``unidecode.unidecode``
          (accent removal), then stripped of every character that is not an
          ASCII letter, digit or space.
        * ``First`` / ``Last`` - lower-cased first and second space-separated
          tokens of ``Name_Adjusted``; ``Last`` is missing for one-word names.
        * ``First2`` / ``Last5`` - first 2 characters of ``First`` and first 5
          of ``Last``, used together as the merge code.
    """
    df['Name_Adjusted'] = df['Name'].apply(unidecode.unidecode)  # remove accents
    # regex=True has to be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently leave punctuation
    # (e.g. the "." in "Jr.") in place.
    df['Name_Adjusted'] = df['Name_Adjusted'].str.replace('[^a-zA-Z0-9 ]', '', regex=True)

    # Separate first and last names (split once, reuse for both tokens)
    name_tokens = df['Name_Adjusted'].str.split(" ")
    df['First'] = name_tokens.str[0].str.lower()
    df['Last'] = name_tokens.str[1].str.lower()

    # Take first 2 letters of first name and first 5 of last as a sort of merge code
    df['First2'] = df['First'].str.slice(0, 2)
    df['Last5'] = df['Last'].str.slice(0, 5)

    return df

In [4]:
def create_my_df(nn, date=todaysdate):
    """Load my own player projections for one date.

    Parameters
    ----------
    nn : str
        Subfolder of "Player Sims" to read the sim file from.
    date : str
        Date string; dashes are stripped before the file name is built.
        Defaults to the notebook-level ``todaysdate``.

    Returns
    -------
    pandas.DataFrame
        ``Name`` and ``AvgPointsPerGame`` for rows with ``AvgPointsPerGame``
        above 3, plus the columns added by ``name_clean``.
    """
    # Sim files use the date without dashes
    compact_date = date.replace("-", "")
    sims_path = os.path.join(
        baseball_path, "Player Sims", nn, "Player_Sims_" + compact_date + ".csv"
    )

    projections = pd.read_csv(sims_path)[['Name', 'AvgPointsPerGame']]
    # Keep only rows projected above 3 points
    projections = projections[projections['AvgPointsPerGame'] > 3]

    return name_clean(projections)

In [5]:
# Read in Daily Fantasy Fuel Projections
def create_dff_df(date=todaysdate):
    """Load the Daily Fantasy Fuel (DFF) projections for one date.

    Parameters
    ----------
    date : str
        Date string, used verbatim (dashes kept) in the cheatsheet file name.
        Defaults to the notebook-level ``todaysdate``.

    Returns
    -------
    pandas.DataFrame
        ``Name`` (``first_name`` + " " + ``last_name``) and ``ppg_projection``
        for rows with a projection above 0, plus the columns added by
        ``name_clean``.
    """
    # Build the file name from the `date` argument. The previous version used
    # the global `todaysdate`, so every caller-supplied date was ignored and the
    # same cheatsheet was loaded each time.
    dff_name = "DFF_MLB_cheatsheet_" + date + ".csv"

    dff_df = pd.read_csv(os.path.join(baseball_path, "Online Projections", dff_name))
    dff_df['Name'] = dff_df['first_name'] + " " + dff_df['last_name']
    dff_df = dff_df[['Name', 'ppg_projection']]
    dff_df = name_clean(dff_df)
    # Drop rows without a positive projection
    dff_df = dff_df[dff_df['ppg_projection'] > 0]

    return dff_df

In [6]:
def create_score_df(date=todaysdate):
    """Load the actual results for one date.

    Parameters
    ----------
    date : str
        Date string; dashes are stripped before the file name is built.
        Defaults to the notebook-level ``todaysdate``.

    Returns
    -------
    pandas.DataFrame
        ``Name`` (renamed from ``Player``), ``Roster Position``, ``%Drafted``
        and ``FPTS``, plus the columns added by ``name_clean``. Rows with any
        missing value are dropped, as are rows where ``Roster Position`` is
        "P" and ``FPTS`` is 0.
    """
    results_file = "Results " + date.replace("-", "") + ".csv"

    # Read in results
    raw_results = pd.read_csv(os.path.join(baseball_path, "Results", results_file))
    scores = (
        raw_results[['Player', 'Roster Position', '%Drafted', 'FPTS']]
        .dropna()
        .rename(columns={'Player': 'Name'})
    )
    scores = name_clean(scores)

    # Blank out the score of "P" rows with exactly 0 points so the final
    # dropna removes them along with any other incomplete rows.
    zero_point_pitcher = (scores['FPTS'] == 0) & (scores['Roster Position'] == "P")
    scores['FPTS'] = np.where(zero_point_pitcher, np.nan, scores['FPTS'])

    return scores.dropna()

In [7]:
def pregame(nn='200200200r', date=todaysdate):
    """Compare my projections with DFF's before results are available.

    Prints the correlation between the two projection sets and the rows whose
    projections differ by more than 3 points in either direction.

    Parameters
    ----------
    nn : str
        Subfolder of "Player Sims" passed to ``create_my_df``.
    date : str
        Date string passed to both loaders.

    Returns
    -------
    pandas.DataFrame
        ``Name``, ``AvgPointsPerGame`` and ``diff`` (mine minus DFF) for every
        row matched on the ``First2`` / ``Last5`` merge code.
    """
    mine = create_my_df(nn, date)
    theirs = create_dff_df(date)

    # Inner join: only rows present in both sources are compared
    matched = mine.merge(theirs, on=['First2', 'Last5'], how='inner', suffixes=("", "_x"))

    agreement = matched['AvgPointsPerGame'].corr(matched['ppg_projection'])
    print("My correlation with DFF was " + str(agreement))

    matched['diff'] = matched['AvgPointsPerGame'] - matched['ppg_projection']
    proj_df = matched[['Name', 'AvgPointsPerGame', 'diff']]

    print("Not Even Close List:")
    print(proj_df.query('diff > 3 | diff < -3'))

    return proj_df

In [8]:
# Run the pre-game comparison with the default sim folder and date.
proj_df = pregame()

My correlation with DFF was 0.8164701248754788
Not Even Close List:
                Name  AvgPointsPerGame    diff
2      Nestor Cortes           16.7854 -3.9146
3       Drey Jameson           15.9692  3.1692
7        Austin Voth            8.5302 -3.9698
46        Adam Oller            6.7958 -6.8042
60  LaMonte Wade Jr.            6.5750  5.7750
65      Jason Vosler            6.4210  3.6210
96      Abraham Toro            5.7000  3.1000


In [12]:

def postgame(nn='200200200r', date=todaysdate):
    """Join my projections, DFF's projections and actual results for one date.

    Parameters
    ----------
    nn : str
        Subfolder of "Player Sims" passed to ``create_my_df``.
    date : str
        Date string passed to all three loaders.

    Returns
    -------
    pandas.DataFrame
        One row per match on the ``First2`` / ``Last5`` merge code across all
        three sources, with ``Name``, ``Roster Position``,
        ``AvgPointsPerGame``, ``ppg_projection``, ``FPTS`` and, for both "my"
        and "dff", the signed error (projection minus ``FPTS``), absolute
        error and squared error.
    """
    merge_keys = ['First2', 'Last5']

    mine = create_my_df(nn, date)
    dff = create_dff_df(date)
    actual = create_score_df(date)

    joined = (
        mine
        .merge(dff, on=merge_keys, how='inner', suffixes=("", "_dff"))
        .merge(actual, on=merge_keys, how='inner', suffixes=("", "_dk"))
    )
    base = joined[['Name', 'Roster Position', 'AvgPointsPerGame', 'ppg_projection', 'FPTS']]

    # Calculate errors; keyword order fixes the output column order, and the
    # later entries read the error columns created by the earlier ones.
    complete_df = base.assign(
        my_error=lambda d: d['AvgPointsPerGame'] - d['FPTS'],
        dff_error=lambda d: d['ppg_projection'] - d['FPTS'],
        my_abs_error=lambda d: d['my_error'].abs(),
        dff_abs_error=lambda d: d['dff_error'].abs(),
        my_error_sq=lambda d: d['my_error'] ** 2,
        dff_error_sq=lambda d: d['dff_error'] ** 2,
    )

    # Optional correlation diagnostics (disabled)
    # print("My correlation with actual scores was " + str(complete_df['AvgPointsPerGame'].corr(complete_df['FPTS'])))
    # print("DFF's correlation with actual scores was " + str(complete_df['ppg_projection'].corr(complete_df['FPTS'])))

    return complete_df

In [36]:
def _print_error_summary(label, df):
    """Print mean absolute error and mean squared error (mine and DFF's) for ``df``."""
    print(label)
    print("my error: " + str(df['my_abs_error'].mean()))
    print("DFF error: " + str(df['dff_abs_error'].mean()))

    print("my MSE: " + str(df['my_error_sq'].mean()))
    print("DFF MSE: " + str(df['dff_error_sq'].mean()))


def evaluations(nn):
    """Print error summaries over every sim file in one "Player Sims" subfolder.

    For each ``Player_Sims_YYYYMMDD.csv`` file in the folder, ``postgame`` is
    called with that date (re-formatted as ``YYYY-MM-DD``). The per-date frames
    are concatenated, then mean absolute error and MSE are printed for all
    rows, for rows with ``Roster Position`` equal to "P", and for the rest.

    Parameters
    ----------
    nn : str
        Subfolder of "Player Sims" to evaluate.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If the folder holds no ``Player_Sims_YYYYMMDD.csv`` file.
    """
    prefix, suffix = "Player_Sims_", ".csv"
    sims_dir = os.path.join(baseball_path, "Player Sims", nn)

    all_df_list = []
    # sorted() makes the processing order deterministic (os.listdir order is arbitrary)
    for file in sorted(os.listdir(sims_dir)):
        # Skip anything that is not a sim file (e.g. .ipynb_checkpoints or
        # other stray entries), which would otherwise yield a bogus date.
        if not (file.startswith(prefix) and file.endswith(suffix)):
            continue
        stamp = file[len(prefix):-len(suffix)]
        if len(stamp) != 8 or not stamp.isdigit():
            continue

        # YYYYMMDD -> YYYY-MM-DD
        slate_date = stamp[0:4] + "-" + stamp[4:6] + "-" + stamp[6:8]
        all_df_list.append(postgame(nn, slate_date))

    if not all_df_list:
        raise ValueError("No Player_Sims_YYYYMMDD.csv files found in " + sims_dir)

    all_df = pd.concat(all_df_list, axis=0)

    _print_error_summary("All", all_df)
    _print_error_summary("Pitchers", all_df[all_df["Roster Position"] == 'P'])
    _print_error_summary("Batters", all_df[all_df["Roster Position"] != 'P'])

    # return all_df

In [37]:
# Print error summaries for the "200200r" sim folder.
evaluations("200200r")


All
my error: 5.637129487179497
DFF error: 5.68913308913309
my MSE: 53.620299429123975
DFF MSE: 53.59366910866911
Pitchers
my error: 8.396517499999998
DFF error: 9.065000000000001
my MSE: 102.30784558887498
DFF MSE: 116.83824999999999
Batters
my error: 5.5680584480600865
DFF error: 5.604630788485605
my MSE: 52.40158738507514
DFF MSE: 52.010575719649594


In [40]:
# Print error summaries for the "200200200t" sim folder.
evaluations("200200200t")

All
my error: 5.474701648351644
DFF error: 5.68913308913309
my MSE: 53.66551642050978
DFF MSE: 53.59366910866911
Pitchers
my error: 8.527662499999998
DFF error: 9.065000000000001
my MSE: 105.6179902116249
DFF MSE: 116.83824999999999
Batters
my error: 5.398282102628282
DFF error: 5.604630788485605
my MSE: 52.36507902899252
DFF MSE: 52.010575719649616


In [38]:
# Print error summaries for the "200200200r" sim folder.
evaluations("200200200r")

All
my error: 5.594045054945056
DFF error: 5.6891330891330885
my MSE: 53.58085095960319
DFF MSE: 53.593669108669125
Pitchers
my error: 7.9096575
DFF error: 9.065000000000001
my MSE: 96.27323614062503
DFF MSE: 116.83824999999999
Batters
my error: 5.536082290362953
DFF error: 5.604630788485604
my MSE: 52.51220552328226
DFF MSE: 52.01057571964959
