In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set working directory
import os
current_dir = os.getcwd()
os.chdir(current_dir.replace('\code', '\data'))

In [8]:
# Load data
data_ii = pd.read_csv('best_ball_mania_ii.csv')
data_iii = pd.read_csv('best_ball_mania_iii.csv')
data_iv = pd.read_csv('best_ball_mania_iv.csv')

In [9]:
# Increase 'roster_points' and 'pick_points' for dataset iv by 4.5
data_iv['pick_points'] = data_iv['pick_points']*4.5
data_iv['roster_points'] = data_iv['roster_points']*4.5

# Columns that are in all datasets
cols = list(set(data_ii.columns) & set(data_iii.columns) & set(data_iv.columns))

# Filter columns
data_ii = data_ii[cols]
data_iii = data_iii[cols]
data_iv = data_iv[cols]

# Concatenate data
data = pd.concat([data_ii, data_iii, data_iv], axis=0)

In [10]:
# Rename 'tournament_entry_id' to 'team_id'
data = data.rename(columns={'tournament_entry_id': 'team_id'})

data = data.drop(columns=['clock', 'tournament_round_number', 'bye_week', 'draft_time'])

# Fit a polynomial regression
p = np.poly1d(np.polyfit(data['overall_pick_number'], data['pick_points'], 2))
data['poly_points'] = p(data['overall_pick_number'])

# Bootstrapping

In [None]:
# Bootstrap the points per position rank
# Find distributions of points per position rank for each position
# Simulate a single draft with 30 QBs, 70 RBs, 100 WRs, and 30 TEs
# Each pick is a random number from the distribution of *centered* points per position rank
# Find the chance that RB1 scores more points than WR1 and so on

def polynomial_points(position_name):
    # Group by 'draft_id', 'position_name', 'pick_points' and 'overall_pick_number'
    data_grouped = data.groupby(['draft_id', 'position_name', 'overall_pick_number', 'pick_points']).mean().reset_index()
    data_grouped = data_grouped[['draft_id', 'position_name', 'overall_pick_number', 'pick_points']]
    data_grouped = data_grouped.drop(data_grouped[data_grouped['position_name'] == 'FB'].index)

    # New column 'pos_rank' which is the rank of the 'position_name' in each 'draft_id'
    # if lowest 'overall_pick_number' then rank is 1, if second lowest then rank is 2, and so on
    data_grouped['pos_rank'] = data_grouped.groupby(['draft_id', 'position_name'])['overall_pick_number'].rank()

    # Limit by these parameters: 30 QBs, 70 RBs, 100 WRs, and 30 TEs
    data_grouped = data_grouped[(data_grouped['position_name'] == 'QB') & (data_grouped['pos_rank'] <= 30) |
                                (data_grouped['position_name'] == 'RB') & (data_grouped['pos_rank'] <= 70) |
                                (data_grouped['position_name'] == 'WR') & (data_grouped['pos_rank'] <= 100) |
                                (data_grouped['position_name'] == 'TE') & (data_grouped['pos_rank'] <= 30)]

    # Fit a polynomial regression to the data
    p = np.poly1d(np.polyfit(data_grouped[data_grouped['position_name'] == position_name]['pos_rank'],
                              data_grouped[data_grouped['position_name'] == position_name]['pick_points'], 2))
    
    return p

def variance_regression(position_name):
    # Group by 'draft_id', 'position_name', 'pick_points' and 'overall_pick_number'
    data_grouped = data.groupby(['draft_id', 'position_name', 'overall_pick_number', 'pick_points']).mean().reset_index()
    data_grouped = data_grouped[['draft_id', 'position_name', 'overall_pick_number', 'pick_points']]
    data_grouped = data_grouped.drop(data_grouped[data_grouped['position_name'] == 'FB'].index)

    # New column 'pos_rank' which is the rank of the 'position_name' in each 'draft_id'
    # if lowest 'overall_pick_number' then rank is 1, if second lowest then rank is 2, and so on
    data_grouped['pos_rank'] = data_grouped.groupby(['draft_id', 'position_name'])['overall_pick_number'].rank()

    # Limit by these parameters: 30 QBs, 70 RBs, 100 WRs, and 30 TEs
    data_grouped = data_grouped[(data_grouped['position_name'] == 'QB') & (data_grouped['pos_rank'] <= 30) |
                                (data_grouped['position_name'] == 'RB') & (data_grouped['pos_rank'] <= 70) |
                                (data_grouped['position_name'] == 'WR') & (data_grouped['pos_rank'] <= 100) |
                                (data_grouped['position_name'] == 'TE') & (data_grouped['pos_rank'] <= 30)]
    
    # Fit a linear regression to the standard deviation of the points per position rank
    grouped_data = data_grouped[data_grouped['position_name'] == position_name].groupby('pos_rank')['pick_points'].std()
    p = np.poly1d(np.polyfit(grouped_data.index, grouped_data.values, 1))
    
    return p

In [None]:
# Define a function that estimates the points for each position rank
def estimations(position_name, max, starter):
    points = polynomial_points(position_name)
    variance = variance_regression(position_name)
    df = pd.DataFrame()
    df['rank'] = np.arange(1, max+1)
    df['pos_rank'] = position_name + df['rank'].astype(str)
    df['points'] = points(df['rank'])
    df['top_five'] = points(df['rank']) + 2*variance(df['rank'])
    df['bottom_five'] = points(df['rank']) - 2*variance(df['rank'])
    mu = df[df['rank'] == starter]['points'].values[0]
    df['points'] -= mu
    df['top_five'] -= mu
    df['bottom_five'] -= mu
    return df

# Estimations for each position
qb = estimations('QB', max=30, starter=12)
rb = estimations('RB', max=70, starter=28)
wr = estimations('WR', max=100, starter=40)
te = estimations('TE', max=30, starter=16)

# New df with all positions
df_points = pd.concat([qb[['pos_rank', 'points', 'top_five', 'bottom_five']], 
                rb[['pos_rank', 'points', 'top_five', 'bottom_five']], 
                wr[['pos_rank', 'points', 'top_five', 'bottom_five']], 
                te[['pos_rank', 'points', 'top_five', 'bottom_five']]], axis=0)
df_points = df_points.sort_values(by='points', ascending=False)
df_points.head(24)

# Data Preprocessing for Models

In [78]:
def ordinal_ranks(data, drop_columns=False):
    # Group by 'draft_id', 'position_name', 'pick_points' and 'overall_pick_number'
    df = data[['team_id', 'position_name', 'overall_pick_number', 'pick_points']]
    roster_points = data[['team_id', 'roster_points']].drop_duplicates()

    data_grouped = df.groupby(['team_id', 'position_name', 'overall_pick_number', 'pick_points']).mean().reset_index()
    data_grouped = data_grouped[['team_id', 'position_name', 'overall_pick_number', 'pick_points']]
    data_grouped = data_grouped.drop(data_grouped[data_grouped['position_name'] == 'FB'].index)

    # New column 'pos_rank' which is the rank of the 'position_name' in each 'draft_id'
    # if lowest 'overall_pick_number' then rank is 1, if second lowest then rank is 2, and so on
    data_grouped['pos_rank'] = data_grouped.groupby(['team_id', 'position_name'])['overall_pick_number'].rank()

    # New column pos_team_rank which is position_name + pos_rank
    data_grouped['pos_team_rank'] = data_grouped['position_name'] + data_grouped['pos_rank'].astype(str)

    # Pivot the data so that position names are columns, and overall pick numbers are rows
    data_grouped = data_grouped.pivot(index='team_id', columns='pos_team_rank', values='overall_pick_number').reset_index()

    # Drop columns below 5 percent full
    if drop_columns == True:
        data_grouped = data_grouped.dropna(thresh=0.05*len(data_grouped), axis=1)

    # Fill NaN values with 432
    data_grouped = data_grouped.fillna(432)

    # Merge with 'roster_points'
    data_grouped = data_grouped.merge(roster_points, on='team_id')
    
    # Drop 'team_id' column
    data_grouped = data_grouped.drop(columns=['team_id'])

    return data_grouped

Unnamed: 0,QB1.0,QB2.0,QB3.0,RB1.0,RB2.0,RB3.0,RB4.0,RB5.0,RB6.0,RB7.0,...,WR1.0,WR2.0,WR3.0,WR4.0,WR5.0,WR6.0,WR7.0,WR8.0,WR9.0,roster_points
0,36.0,156.0,432.0,13.0,108.0,109.0,157.0,180.0,205.0,432.0,...,12.0,60.0,61.0,84.0,85.0,133.0,181.0,204.0,432.0,1622.50
1,47.0,194.0,432.0,26.0,74.0,95.0,98.0,146.0,432.0,432.0,...,2.0,23.0,50.0,119.0,122.0,143.0,167.0,191.0,215.0,1177.38
2,68.0,188.0,432.0,5.0,20.0,125.0,140.0,149.0,432.0,432.0,...,29.0,44.0,53.0,77.0,92.0,116.0,164.0,173.0,212.0,1487.10
3,115.0,126.0,432.0,6.0,43.0,139.0,150.0,198.0,432.0,432.0,...,19.0,30.0,67.0,78.0,91.0,102.0,163.0,187.0,211.0,1633.24
4,31.0,151.0,432.0,7.0,55.0,90.0,114.0,138.0,175.0,199.0,...,42.0,66.0,79.0,103.0,127.0,186.0,210.0,432.0,432.0,1752.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35995,34.0,106.0,183.0,10.0,39.0,82.0,159.0,178.0,207.0,432.0,...,15.0,58.0,63.0,87.0,111.0,135.0,154.0,432.0,432.0,1390.50
35996,80.0,104.0,432.0,8.0,17.0,152.0,176.0,432.0,432.0,432.0,...,32.0,41.0,56.0,65.0,89.0,113.0,137.0,200.0,209.0,1685.38
35997,37.0,132.0,432.0,36.0,60.0,109.0,181.0,205.0,432.0,432.0,...,12.0,13.0,84.0,85.0,108.0,133.0,157.0,180.0,432.0,1629.99
35998,60.0,156.0,204.0,13.0,36.0,84.0,85.0,205.0,432.0,432.0,...,12.0,37.0,61.0,108.0,109.0,132.0,133.0,180.0,432.0,1788.08


In [32]:
def one_got_encode(data):
    df = data[['team_pick_number', 'position_name', 'roster_points', 'team_id']]

    roster_points = data[['team_id', 'roster_points']].drop_duplicates()

    # Group by 'team_id' and 'team_pick_number' and aggregate 'position_name'
    df = df.groupby(['team_id', 'team_pick_number'])['position_name'].first().reset_index()

    # Now pivot the data
    df = df.pivot(index='team_id', columns='team_pick_number', values='position_name').reset_index()

    # Add 'roster_points' to the data without adding new rows
    df = pd.merge(df, roster_points, on='team_id')
    df = df.drop(columns='team_id')
    df = df.sample(frac=1.0, random_state=0)

    # One-hot encode the 'position_name' column
    columns_to_encode = list(range(1, 19))
    df = pd.get_dummies(df, columns=columns_to_encode)

    return df

In [38]:
def numerical_indicies(data):
    # Converting positions to numerical indicies for the model
    positions = ['QB', 'RB', 'WR', 'TE', 'FB']
    position_dict = {position: i for i, position in enumerate(positions)}

    # Load data
    df = data[['team_pick_number', 'position_name', 'roster_points', 'team_id']]
    roster_points = data[['team_id', 'roster_points']].drop_duplicates()

    # Group by 'team_id' and 'team_pick_number' and aggregate 'position_name'
    df = df.groupby(['team_id', 'team_pick_number'])['position_name'].first().reset_index()

    # Now pivot the data
    df = df.pivot(index='team_id', columns='team_pick_number', values='position_name').reset_index()

    # Add 'roster_points' to the data without adding new rows
    df = pd.merge(df, roster_points, on='team_id')
    df = df.drop(columns='team_id')

    # Use the position_dict to convert the 'position_name' to a numerical index
    df = df.map(lambda x: position_dict[x] if x in position_dict else x)

    return df

# Modeling

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Load data
#   df = one_got_encode(data)
#   df = numerical_indicies(data)
df = ordinal_ranks(data, drop_columns=True)

# Split the df into training and testing sets
X = df.drop(columns='roster_points')
y = df['roster_points']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit a random forest regressor
rfr = RandomForestRegressor(n_estimators=500, 
                            max_depth=15,
                            min_samples_leaf=15,
                            min_samples_split=8)
rfr.fit(X_train, y_train)

# Predict roster_points
y_pred = rfr.predict(X_test)

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred)
print(r2)

0.04464058977276375


In [81]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [10, 20],
    'min_samples_split': [5, 15],
    'min_samples_leaf': [10, 20],
}

# Load data
#   df = one_got_encode(data)
#   df = numerical_indicies(data)
df = ordinal_ranks(data, drop_columns=True)

# Split the df into training and testing sets
X = df.drop(columns='roster_points')
y = df['roster_points']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit a random forest regressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

# Create a base model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

print(best_params)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
{'max_depth': 20, 'min_samples_leaf': 20, 'min_samples_split': 15, 'n_estimators': 300}


In [85]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load data
#   df = numerical_indicies(data)
df = ordinal_ranks(data, drop_columns=False)

# Split the df into training and testing sets
X = df.drop(columns='roster_points')
y = df['roster_points']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a neural network
model = Sequential()
model.add(Dense(50, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1))
model.compile(optimizer=Adam(0.01), loss='mse')
model.fit(x=X_train, y=y_train.values, 
          validation_data=(X_test, y_test.values), 
          batch_size=50, epochs=40)
# Predict roster_points
y_pred = model.predict(X_test)

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred)
print(r2)

Epoch 1/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 602973.8750 - val_loss: 29124.2910
Epoch 2/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 25597.4766 - val_loss: 27841.9258
Epoch 3/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 26172.7871 - val_loss: 27476.2090
Epoch 4/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 25844.1074 - val_loss: 26827.5859
Epoch 5/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 25221.6953 - val_loss: 25709.6797
Epoch 6/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 25520.9688 - val_loss: 26800.1113
Epoch 7/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 24952.9570 - val_loss: 26463.8145
Epoch 8/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 24989.330

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Flatten
from tensorflow.keras.optimizers import Adam

# Load data
#   df = numerical_indicies(data)
df = ordinal_ranks(data, drop_columns=True)

# Split the df into training and testing sets
X = df.drop(columns='roster_points')
y = df['roster_points']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the model
inputs = Input(shape=(X_train.shape[1],))
x = Embedding(input_dim=5, output_dim=3)(inputs)
x = LSTM(100)(x)
x = Dense(1)(x)
model = Model(inputs=inputs, outputs=x)
model.compile(optimizer=Adam(0.01), loss='mse')

# Fit the model
model.fit(x=X_train, y=y_train.values, 
          validation_data=(X_test, y_test.values), 
          batch_size=50, epochs=40)

# Predict roster_points
y_pred = model.predict(X_test)

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred)
print(r2)

Epoch 1/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 1976162.1250 - val_loss: 1112360.2500
Epoch 2/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 917505.1250 - val_loss: 453926.6875
Epoch 3/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 359387.9375 - val_loss: 155719.6875
Epoch 4/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 121903.8516 - val_loss: 51669.6641
Epoch 5/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 42928.7852 - val_loss: 26729.2480
Epoch 6/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 25705.7441 - val_loss: 23238.1875
Epoch 7/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 23178.0000 - val_loss: 22975.0977
Epoch 8/40
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 2

In [88]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load data
#   df = numerical_indicies(data)
df = ordinal_ranks(data, drop_columns=True)

# Split the df into training and testing sets
X = df.drop(columns='roster_points')
y = df['roster_points']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit an XGBoost regressor
xgbr = xgb.XGBRegressor(n_estimators=100, learning_rate=0.001)
xgbr.fit(X_train, y_train)

# Predict roster_points
y_pred = xgbr.predict(X_test)

# Calculate the R^2 score
r2 = r2_score(y_test, y_pred)
print(r2)

0.004462948330022032


After finding teams with the highest expect points/playoff chances; run a cluster analysis to discover the possible drafting patterns that are most successful