In [1]:
import os
import pandas as pd
import numpy as np
import time
import datetime
from datetime import date
import warnings
import unidecode
import re
import statsmodels.api as sm
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

import import_ipynb
from Utilities import *

warnings.simplefilter(action="ignore")
baseball_path = r"C:\Users\james\Documents\MLB\Data"

In [2]:
# Read in DK results and extract player scores and lineup performances
def read_results(date):
    """Load one slate's contest results and split them into two tables.

    Reads ``Results <date>.csv`` from the ``Results`` folder under
    ``baseball_path``.

    Parameters
    ----------
    date : str
        Slate date exactly as it appears in the file name (e.g. "20230611").

    Returns
    -------
    players : pandas.DataFrame
        The ``Player`` and ``FPTS`` columns, with rows missing either value
        removed.
    lineups : pandas.DataFrame
        The ``Rank``, ``EntryId``, ``EntryName``, ``TimeRemaining``, ``Points``
        and ``Lineup`` columns, with ``Rank`` cast to int.
    """
    results = pd.read_csv(os.path.join(baseball_path, "Results", "Results " + date + ".csv"))

    # dropna() returns a new frame, so nothing is mutated through a slice of `results`
    players = results[['Player', 'FPTS']].dropna()

    # Rows without a Rank cannot be cast to int, so drop them first.
    # NOTE(review): presumably these only occur as padding when the player list
    # is longer than the entry list - confirm against the export format.
    lineups = (
        results[['Rank', 'EntryId', 'EntryName', 'TimeRemaining', 'Points', 'Lineup']]
        .dropna(subset=['Rank'])
        .reset_index(drop=True)
        .assign(Rank=lambda frame: frame['Rank'].astype('int'))
    )

    return players, lineups

In [3]:
# Read in contest payout structure
def read_payouts(date):
    """Load a contest's payout structure as rank bounds plus payout.

    Reads ``Payouts <date>.xlsx`` from the ``Payouts`` folder under
    ``baseball_path``. The sheet is a single column that alternates between a
    rank range and the payout for that range.

    Parameters
    ----------
    date : str
        Slate date exactly as it appears in the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``Lower`` and ``Upper`` (inclusive integer rank bounds) and
        ``Payout`` (the cell that followed the range, unchanged).

    Raises
    ------
    ValueError
        If the column does not hold an even number of cells.
    """
    payouts = pd.read_excel(os.path.join(baseball_path, "Payouts", "Payouts " + date + ".xlsx"), header=None)

    # It's currently one column, we want two columns with half as many rows
    cells = np.array(payouts[0])
    if len(cells) % 2:
        raise ValueError(
            "Payout sheet for {} has {} cells; expected range/payout pairs".format(date, len(cells))
        )
    table = pd.DataFrame(cells.reshape(len(cells) // 2, 2), columns=["Range", "Payout"])

    # Split "lower - upper" once. reindex guarantees both columns exist even
    # when no row contains a dash (every tier is a single rank).
    bounds = table['Range'].str.split("-", n=1, expand=True).reindex(columns=[0, 1])
    lower_text = bounds[0]
    # A single-rank tier has no upper bound of its own, so reuse the lower one
    upper_text = bounds[1].fillna(bounds[0])

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave suffixes like "st" in place
    table['Lower'] = lower_text.str.replace(r'[^0-9]', '', regex=True).astype('int')
    table['Upper'] = upper_text.str.replace(r'[^0-9]', '', regex=True).astype('int')

    return table[['Lower', 'Upper', 'Payout']]

In [4]:
# Assign payouts to existing lineups
def add_payouts(date):
    """Attach the contest payout to every entered lineup.

    Parameters
    ----------
    date : str
        Slate date, passed through to ``read_results`` and ``read_payouts``.

    Returns
    -------
    players : pandas.DataFrame
        Player table exactly as returned by ``read_results``.
    lineups : pandas.DataFrame
        Lineup table from ``read_results`` with an added float ``Payout``
        column: the payout of the first tier whose inclusive
        ``[Lower, Upper]`` range contains the lineup's ``Rank``, or 0 when no
        tier matches.
    """
    # Read in results
    players, lineups = read_results(date)
    # Read in payouts
    payouts = read_payouts(date)

    lineups = lineups.copy()
    ranks = lineups['Rank'].to_numpy()

    # Start everyone at 0 so the column exists even when there are no lineups
    amounts = np.zeros(len(lineups), dtype=float)

    # Walk the tiers from last to first so that, if tiers overlap, the
    # earliest tier in the table is written last and therefore wins.
    # Payout amounts must be numeric here (they are averaged downstream).
    tiers = list(payouts[['Lower', 'Upper', 'Payout']].itertuples(index=False, name=None))
    for lower, upper, amount in reversed(tiers):
        amounts[(ranks >= lower) & (ranks <= upper)] = amount

    lineups['Payout'] = amounts

    return players, lineups

In [5]:
# Read in lineup sims and assign points
def read_sims(date):
    """Score our ranked lineups for one slate against the actual contest.

    Reads ``Lineups Ranked <date>.csv`` from the
    ``A9. Optimizer - 2. Decisions`` folder under ``baseball_path``, looks up
    each rostered player's actual points, and places every lineup in the
    contest standings returned by ``add_payouts``.

    Parameters
    ----------
    date : str
        Slate date exactly as it appears in the file names.

    Returns
    -------
    pandas.DataFrame
        The ranked-lineup table (duplicate lineups removed) with added columns:

        * ``FPTS_<pos>`` for each of the ten roster slots - points of the
          player in that slot (NaN when the name is not in the results).
        * ``Points`` - sum of the ``FPTS_<pos>`` columns, ignoring NaN.
        * ``Payout`` (float) and ``Rank`` (int) - taken from the first contest
          lineup, in file order, whose points are less than or equal to ours.
          On an exact tie the payout is the mean of that lineup's payout and
          the next lineup's payout, or 0 when there is no next lineup. Both
          are 0 when our lineup scores below every contest lineup.
    """
    # Read in lineup sims
    sims = pd.read_csv(os.path.join(baseball_path, "A9. Optimizer - 2. Decisions", "Lineups Ranked " + date + ".csv"))

    # Read in players and opponent lineups
    players, lineups = add_payouts(date)

    # Remove accents for better merging
    players = players.copy()
    players['Player'] = players['Player'].map(remove_accents)

    # One points value per name. Keeping the first occurrence gives the same
    # value a left-merge followed by drop_duplicates(keep='first') would keep,
    # without multiplying rows when a name appears more than once.
    points_by_player = players.drop_duplicates(subset='Player').set_index('Player')['FPTS']

    # For each position, look up the player's point total
    positions = ['P', 'P.1', 'C', '1B', '2B', '3B', 'SS', 'OF', 'OF.1', 'OF.2']
    points_list = []
    for pos in positions:
        # Strip digits and parentheses from the cell, then trailing whitespace
        sims[pos] = sims[pos].str.replace(r'[(0-9)]', "", regex=True).str.rstrip()

        points_name = 'FPTS' + "_" + pos
        points_list.append(points_name)
        sims[points_name] = sims[pos].map(points_by_player)

    sims = sims.drop_duplicates(subset=positions).reset_index(drop=True)

    # Calculate lineup points
    sims['Points'] = sims[points_list].sum(axis=1)

    # Work on plain arrays: positional access, no chained assignment
    lineup_points = lineups['Points'].to_numpy(dtype=float)
    lineup_ranks = lineups['Rank'].to_numpy()
    lineup_payouts = lineups['Payout'].to_numpy(dtype=float)

    # Calculate payouts
    sim_payouts = []
    sim_ranks = []
    for points in sims['Points'].to_numpy(dtype=float):
        # First contest lineup (in file order) that we tie or beat
        matched = np.flatnonzero(lineup_points <= points)
        if matched.size == 0:
            # We scored below every contest lineup
            sim_ranks.append(0)
            sim_payouts.append(0.0)
            continue

        j = matched[0]
        sim_ranks.append(int(lineup_ranks[j]))
        if lineup_points[j] == points:
            # If it's a tie, average the two payouts; there is nothing to
            # average with when the tied lineup is the last one, so pay 0
            if j + 1 < lineup_payouts.size:
                sim_payouts.append((lineup_payouts[j] + lineup_payouts[j + 1]) / 2)
            else:
                sim_payouts.append(0.0)
        else:
            # If it's a win, assign the payout the other lineup had
            sim_payouts.append(lineup_payouts[j])

    sims['Payout'] = np.asarray(sim_payouts, dtype=float)
    sims['Rank'] = np.asarray(sim_ranks, dtype='int64')

    return sims

In [6]:
# Spot check: score the ranked lineups for a single slate and summarise
# every numeric column of the result
sims = read_sims("20230611")
sims.describe()

Unnamed: 0.1,Unnamed: 0,Budget,AvgPointsPerGame,Sim 0,Sim 1,Sim 2,Sim 3,Sim 4,Sim 5,Sim 6,...,FPTS_1B,FPTS_2B,FPTS_3B,FPTS_SS,FPTS_OF,FPTS_OF.1,FPTS_OF.2,Points,Payout,Rank
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,...,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,99.5,49763.0,116.145685,116.65125,130.09075,112.24975,148.2655,104.99475,109.3115,116.07025,...,16.045,13.035,19.1,14.535,11.14,5.27,9.78,132.61725,178.825,1433.485
std,57.879185,162.056507,0.685509,15.824998,12.855773,30.997662,24.96691,22.044348,26.532502,18.34564,...,12.398856,8.888691,11.141394,8.123344,8.005049,6.612543,7.932606,28.022459,785.101053,1927.420603
min,0.0,49500.0,115.101,81.25,99.95,50.5,90.3,65.1,51.8,68.95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.9,0.0,1.0
25%,49.75,49600.0,115.56075,105.1625,119.95,86.5,127.675,88.8125,87.55,103.95,...,5.0,4.0,6.0,7.0,2.0,2.0,2.0,110.15,0.0,129.75
50%,99.5,49800.0,116.247,114.9,130.6,117.425,151.8,102.1,109.3,115.1,...,10.0,12.0,26.0,22.0,18.0,2.0,11.0,134.025,0.0,523.0
75%,149.25,49900.0,116.54525,129.4125,137.6875,133.35,167.3,116.3375,126.925,128.3125,...,30.0,21.0,28.0,22.0,18.0,5.75,11.0,150.4,15.0,2292.0
max,199.0,50000.0,118.456,159.25,161.95,190.35,202.8,169.05,178.3,172.1,...,30.0,24.0,28.0,23.0,19.0,30.0,30.0,210.15,4000.0,9628.0


# Run Evaluations

In [None]:
# Collect the slate date of every ranked-lineup file.
# The folder is built from baseball_path so there is a single place to change it.
decisions_dir = os.path.join(baseball_path, "A9. Optimizer - 2. Decisions")

# File names look like "Lineups Ranked YYYYMMDD.csv"; characters 15-22 hold the
# date. A comprehension keeps the loop variable from overwriting the `date`
# name imported from datetime, and sorting makes the order independent of the
# file system.
days = sorted(
    filename[15:23]
    for filename in os.listdir(decisions_dir)
    if filename.endswith(".csv") and filename.startswith("Lineups Ranked 202")
)
        
def read_sims2(date):
    """Best-effort wrapper around ``read_sims`` for batch runs.

    Parameters
    ----------
    date : str
        Slate date to load.

    Returns
    -------
    pandas.DataFrame or None
        The ``read_sims`` result with an added ``date`` column, or ``None``
        when that day could not be processed.
    """
    try:
        sims = read_sims(date)
        sims['date'] = date
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # propagate, and say which day was skipped and why
        print("Skipping {}: {}: {}".format(date, type(exc).__name__, exc))
        return None

    return sims
    

# Run all in parallel
all_sims = Parallel(n_jobs=-2, verbose=5)(delayed(read_sims2)(day) for day in days)

# read_sims2 returns None for days it could not process; drop those explicitly
# and report how many days actually made it into the analysis
loaded_sims = [day_sims for day_sims in all_sims if day_sims is not None]
print("Loaded {} of {} days".format(len(loaded_sims), len(all_sims)))
if not loaded_sims:
    raise ValueError("No lineup sims could be loaded; check the input folders")

# Create dataframe with every payout for each lineup.
# ignore_index gives each row a unique label instead of repeating 0..n-1 per day.
payout_df = pd.concat(loaded_sims, axis=0, ignore_index=True)

payout_df['Payout'].mean()

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=-2)]: Done  42 tasks      | elapsed:   16.6s


In [None]:
# Show only the lineups that paid out more than 500
payout_df.loc[payout_df['Payout'] > 500]

### Average Payouts by Measure

In [None]:
# Lineup-level columns that can be used to decide which lineups to enter
measures = [
    'Tail',
    'AvgPointsPerGame',
    'Sim STD',
    'Plus2',
    'Plus3',
    'ownership',
    'batter ownership',
    'pitcher ownership',
]
measure = 'Tail'

# Order all lineups from highest to lowest on the chosen measure, then take
# the first 20 rows of each day and summarise their payouts
payout_df.sort_values(by=measure, ascending=False, inplace=True)
group = payout_df.groupby('date').head(20)['Payout']
group.describe()

### Metrics

In [None]:
# Identify share as % of max
# For every measure, store the day's maximum and each lineup's value as a
# fraction of that maximum
for stat in measures:
    per_day = payout_df.groupby('date')[stat]
    payout_df[f'{stat}_max'] = per_day.transform('max')
    payout_df[f'{stat}_pct'] = payout_df[stat].div(payout_df[f'{stat}_max'])

# For every measure, number the lineups within each day from the highest value
# (rank 1) downwards; the frame is left sorted by the last measure processed
for stat in measures:
    payout_df.sort_values(by=['date', stat], ascending=[False, False], inplace=True)
    payout_df[f'{stat} Rank'] = payout_df.groupby('date').cumcount().add(1)

### Regression

In [None]:
# Probability of big win
# Flag a "big win": 1 when the lineup paid out at least 100, otherwise 0
payout_df['win'] = (payout_df['Payout'] >= 100).astype(int)
print(payout_df['win'].sum())

# Step 2: Prepare the data - one predictor plus an intercept column
y = payout_df['win']
X = sm.add_constant(payout_df[['Plus3 Rank']])

# Step 3: Create and fit the OLS (Ordinary Least Squares) model.
# With a 0/1 outcome the fitted values estimate the probability of a big win.
model = sm.OLS(y, X)
results = model.fit()

# Step 4: Print the summary
print(results.summary())

### Plot

In [None]:
# Take average by rank
# measures = ['Tail', 'AvgPointsPerGame', 'Sim STD', 'Plus2', 'Plus3', 'ownership', 'batter ownership', 'pitcher ownership']
# Take average by rank. `rank` may be any entry of `measures`; the frame must
# already contain the matching "<measure> Rank" column.
rank = "Tail"
df = payout_df.groupby([f'{rank} Rank'])['Payout'].mean().reset_index()

# groupby sorts by rank, so head(lineups) is the best-ranked `lineups` positions
lineups = 20
print("Entering the top {} lineups by the {} metric will average a total payout of {}.".format(lineups, rank, df.head(lineups)['Payout'].sum()))

# Graph it
plt.scatter(df[f'{rank} Rank'], df['Payout'])

# Linear trend line (degree-1 polynomial fit)
coefficients = np.polyfit(df[f'{rank} Rank'], df['Payout'], 1)
p = np.poly1d(coefficients)
plt.plot(df[f'{rank} Rank'], p(df[f'{rank} Rank']), color='red', label='Trend line')

# Labels and title
plt.xlabel(f'{rank} Rank')
plt.ylabel('Payout')
plt.title('Scatter Plot of Rank vs Payout')

# Display trend line equation (slope first, then intercept)
equation = f'Payout = {coefficients[0]:.2f} * Rank + {coefficients[1]:.2f}'
plt.text(0.5, 0.9, equation, ha='center', va='center', transform=plt.gca().transAxes)

# Legend and show plot
plt.legend()
plt.show()


In [None]:
# Number of lineups entered per day and the cost of one entry
lineups = 5
buyin = 4

# Mean payout over the best-ranked `lineups` positions of the rank table `df`
average = df['Payout'].head(lineups).mean()

# Number of distinct slates in the sample
days = payout_df['date'].nunique(dropna=False)

# Return per unit staked, profit per day, and profit over the whole sample
profit_margin = average / buyin - 1
profit = lineups * (average - buyin)
total_profit = profit * days

print(f"Average: {average} \nProfit Margin: {profit_margin} \nProfit: {profit} \nTotal Profit: {total_profit}")

# Profit