In [5]:
#QB ML MODEL
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import MinMaxScaler

pd.options.mode.chained_assignment = None

import sklearn
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
import joblib


#scaler to scale data
scaler = MinMaxScaler()

#read csv files into pandas
dfFantasy = pd.read_csv("/Users/kmaran3/Dropbox/Darkhorse/final position group data/final_qb_data.csv")
dfFantasy.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_cols = dfFantasy.select_dtypes(include=[np.number]).columns
for column in numeric_cols:
    dfFantasy[column].fillna(dfFantasy[column].mean(), inplace=True)
dfGrades = pd.read_csv("/Users/kmaran3/Dropbox/Darkhorse/approximate value data/AVbyPositionGroup.csv")

def correctData(df, pprTF):
  #cols to make per game
  cols = ['completions', 'attempts', 'passing_yards',
       'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles',
       'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch',
       'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr',
       'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles',
       'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa',
       'rushing_2pt_conversions', 'fantasy_points']

  #basing data if ppr or not
  if pprTF == 2:
    pass
  elif pprTF == 0:
    pass
  elif pprTF == 1:
    pass

    
  #adding ppg column
  df.loc[:, 'PPG'] = df['fantasy_points'] / df['GP']


  #make all columns in a per game basis
  for col in cols:
    df.loc[:, col] = df[col] / df['GP'] 


  #only players with more than 7 games.
  df = df[df.GP > 7]
  df = df[df.fantasy_points >= 0]

  df = df[df.PPG > 5]
  

  return df

def putAV(df, dfAV):
  #years to iterate through
  yearsBig = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
  teams = ["crd", "atl", "rav", "buf", "car", "chi", "cin", "cle", "dal", "den", "det", "gnb", "htx", "clt", "jax", "kan", "rai", "sdg", "ram", "mia", "min", "nwe", "nor", "nyg", "nyj", "phi", "pit", "sfo", "sea", "tam", "oti", "was"]

  #columns wanted to add
  columns = ["oline", "rb", "wrte", "qb", "dst"]
  df[columns] = np.nan

  #gets rid of nan for these columns, as when we drop nan we want to save these.
  for colNow in columns:
    df.loc[df["season"] == 2013, colNow] = "no"



  #iterates through years
  for year in yearsBig:

    #assigns df of AVS to only include current year
    dfCurr = dfAV[dfAV.season == year].copy()
    
    #iterates through team list
    for teamNow in teams:
      #makes the df
      dfCurrNew = dfCurr[dfCurr.team == teamNow]
      
      #iterates through columns and adds AV
      for colNow in columns:
        #locate correct year and team rows
        condition = (df["season"] == year) & (df["team"] == teamNow)

        #set these rows to correct values
        df.loc[condition, colNow] = dfCurrNew.iloc[0][colNow]

  df = df.dropna()

  return df

#removes unneccesary stats
def removeUnwanted(dfPos, pos):
  dfPos = dfPos.drop(columns=['season',"GP", "season_type", "fantasy_points", "player_display_name", "player_id", "team", "position", "status"])
  return dfPos

#shifts data forward one year
def makeCorrectShift(df):
  shifters = ['PPG','season','GP','season_type','fantasy_points','ppr_sh','completions','attempts','passing_yards','passing_tds','interceptions','sacks','sack_yards','sack_fumbles','sack_fumbles_lost','passing_air_yards','passing_yards_after_catch','passing_first_downs','passing_epa','passing_2pt_conversions','pacr','dakota','carries','rushing_yards','rushing_tds','rushing_fumbles','rushing_fumbles_lost','rushing_first_downs','rushing_epa','rushing_2pt_conversions','comp %','td:int','yards/attempts','yards/comp','yards/carry','passer rating','AV']
  
  #adds target variable
  df["targetPPG"] = df["PPG"]
  
  #shifts it forward a year (for example 2011 goes to 2012)
  df[shifters] = df.groupby('player_display_name')[shifters].shift(1)
  df = df.dropna()

  return df

#where machine learning is done. returns the model and score.
from sklearn.inspection import permutation_importance

def machineLearning(df, arr, dictParam):
    # Define predictors excluding the target variable
    predictors = [col for col in df.columns if col != "targetPPG"]

    # Split the data
    x = df[predictors].values
    y = df["targetPPG"].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

    # Initialize and train MLPRegressor
    mlp = MLPRegressor(hidden_layer_sizes=dictParam["hidden_layer_sizes"], activation=dictParam["activation"], solver=dictParam["solver"], max_iter=dictParam["max_iter"])
    mlp.fit(x_train, y_train)

    # Evaluate the model
    predict_test = mlp.predict(x_test)
    mae = mean_absolute_error(y_test, predict_test)

    predict_test_unscaled = predict_test * (arr[1] - arr[0]) + arr[0]
    y_test_unscaled = y_test * (arr[1] - arr[0]) + arr[0]

    # Calculate permutation importance
    r = permutation_importance(mlp, x_test, y_test, n_repeats=100, random_state=0)

    # Organize importances
    importance_dict = {name: score for name, score in zip(predictors, r.importances_mean)}
    sorted_importances = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
    


    return [mae, mlp]

# Example usage of the modified function


def getBestParams(df, arr):

  #make the predictors and data and test sets correctly
  predictors = [col for col in df.columns if col != "targetPPG"]
  x = df[predictors].values
  y = df["targetPPG"].values
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

  
  #make the parameters to search over. for hidden_layer_sizes, I experimented with alot and the ones listed now is just final set of experiment.
  
  grid = {
      'hidden_layer_sizes': [(32,32), (64,32), (64), (64,64)],
      'activation': ['tanh', 'identity', 'logistic', 'relu'],
      'solver': ['adam', 'sgd', 'lbfgs'],
      'max_iter': [100, 200, 500]
  }

  #create an MLPRegressor object
  mlp = MLPRegressor()

  #create a GridSearchCV object and fit it to the training data
  grid_search = GridSearchCV(mlp, param_grid=grid, cv=5, n_jobs=-1)
  grid_search.fit(x_train, y_train)

  print("Best things:", grid_search.best_params_)

  #the best model to make predictions on the test data and evaluate performance
  y_pred = grid_search.predict(x_test)

  #inverse transform the scaled predictions to get the original scale, uses a reverse of original formula
  for i in range(len(y_pred)):
    y_pred[i] = (y_pred[i]*(arr[1] - arr[0])) + arr[0]
  for i in range(len(y_test)):
    y_test[i] = (y_test[i]*(arr[1] - arr[0])) + arr[0]


  print(mean_absolute_error(y_test, y_pred))

  return grid_search.best_params_

#gets original value for fantasy points for predictions.
def getScaleBack(df):
  #index of column
  column_index = df.columns.get_loc("PPG")

  #min value of column:
  min_value = df["PPG"].min()

  #scaling valye of column
  #scaling_factor = scaler.scale_[column_index]
  max_value = df["PPG"].max()

  #array to be used later to scale each data
  arr = [min_value, max_value]

  return arr

def test(df, model, arr):
  #make columns everything but target
  predictors = [col for col in df.columns if col != "targetPPG"]


  #make train and test sets
  x = df[predictors].values
  y = df["targetPPG"].values
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

  #make the predictions
  predict_test = model.predict(x_test)

  #inverse transform the scaled predictions to get the original scale by reversing formula
  for i in range(len(predict_test)):
    predict_test[i] = (predict_test[i]*(arr[1] - arr[0])) + arr[0]
  for i in range(len(y_test)):
    y_test[i] = (y_test[i]*(arr[1] - arr[0])) + arr[0]

  #average error 
  mae = mean_absolute_error(y_test, predict_test)
  print("test ", mae)

#if ppr is 0, than it is non ppr. if 1, then it is half ppr. if 2, full ppr. loops through each.
for ppr in [0,1,2]:

  dfFantasyCopy = dfFantasy.copy()

  dfFantasyCopy = correctData(dfFantasyCopy, ppr)

  dfFantasyCopy = putAV(dfFantasyCopy, dfGrades)

  dfFantasyCopy = makeCorrectShift(dfFantasyCopy)

  dfFantasyCopy = dfFantasyCopy.loc[dfFantasyCopy["season"] != 2012]

  dfFantasyCopy = removeUnwanted(dfFantasyCopy, "QB")

  dfFantasyCopy = dfFantasyCopy.reset_index(drop=True)

  #gets fantasy_points_ppr scale per each position
  scaleQB = getScaleBack(dfFantasyCopy)

  dfFantasyCopy[dfFantasyCopy.columns] = scaler.fit_transform(dfFantasyCopy[dfFantasyCopy.columns])

  #obtained by running the getBestParams function per each respective position
  paramQB = {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'max_iter': 300, 'solver': 'adam'}

  #makes array of model and score, then prints it
  qbArray = machineLearning(dfFantasyCopy, scaleQB, paramQB)
  num = qbArray[0]
  qbModel = qbArray[1]
  print("qb score(ppg off on average per player): ", num)

  if ppr == 0:
      joblib.dump(qbModel, "qbModelNonPPR.joblib")
  elif ppr == 1:
      joblib.dump(qbModel, "qbModelHalfPPR.joblib")
  elif ppr == 2:
      joblib.dump(qbModel, "qbModelPPR.joblib")



qb score(ppg off on average per player):  0.06125283670895603
qb score(ppg off on average per player):  0.08686186609370329
qb score(ppg off on average per player):  0.07514704204486017


In [6]:
#RB ML MODEL
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import MinMaxScaler

pd.options.mode.chained_assignment = None

import sklearn
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
import joblib


#scaler to scale data
scaler = MinMaxScaler()

#read csv files into pandas
dfFantasy = pd.read_csv("/Users/kmaran3/Dropbox/Darkhorse/final position group data/final_rb_data.csv")
dfFantasy.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_cols = dfFantasy.select_dtypes(include=[np.number]).columns
for column in numeric_cols:
    dfFantasy[column].fillna(dfFantasy[column].mean(), inplace=True)
dfGrades = pd.read_csv("/Users/kmaran3/Dropbox/Darkhorse/approximate value data/AVbyPositionGroup.csv")

def correctData(df, pprTF):
  #cols to make per game
  cols = ['carries', 'rushing_yards', 'rushing_tds',
       'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_2pt_conversions', 'receptions', 'targets',
       'receiving_yards', 'receiving_tds', 'receiving_fumbles',
       'receiving_fumbles_lost', 'receiving_air_yards',
       'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
       'receiving_2pt_conversions', 'special_teams_tds', 'fantasy_points', 'touches', 'rrtd']

  #basing data if ppr or not
  if pprTF == 2:
    pass
  elif pprTF == 0:
    df.loc[:, "fantasy_points"] = df["fantasy_points"] - df["receptions"]
  elif pprTF == 1:
    df.loc[:, "fantasy_points"] = df["fantasy_points"] - (df["receptions"]/2)

    
  #adding ppg column
  df.loc[:, 'PPG'] = df['fantasy_points'] / df['GP']


  #make all columns in a per game basis
  for col in cols:
    df.loc[:, col] = df[col] / df['GP'] 


  #only players with more than 7 games.
  df = df[df.GP > 7]
  df = df[df.fantasy_points >= 0]

  df = df[df.PPG > 2]
  

  return df

def putAV(df, dfAV):
  #years to iterate through
  yearsBig = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
  teams = ["crd", "atl", "rav", "buf", "car", "chi", "cin", "cle", "dal", "den", "det", "gnb", "htx", "clt", "jax", "kan", "rai", "sdg", "ram", "mia", "min", "nwe", "nor", "nyg", "nyj", "phi", "pit", "sfo", "sea", "tam", "oti", "was"]

  #columns wanted to add
  columns = ["oline", "rb", "wrte", "qb", "dst"]
  df[columns] = np.nan

  #gets rid of nan for these columns, as when we drop nan we want to save these.
  for colNow in columns:
    df.loc[df["season"] == 2013, colNow] = "no"



  #iterates through years
  for year in yearsBig:

    #assigns df of AVS to only include current year
    dfCurr = dfAV[dfAV.season == year].copy()
    
    #iterates through team list
    for teamNow in teams:
      #makes the df
      dfCurrNew = dfCurr[dfCurr.team == teamNow]
      
      #iterates through columns and adds AV
      for colNow in columns:
        #locate correct year and team rows
        condition = (df["season"] == year) & (df["team"] == teamNow)

        #set these rows to correct values
        df.loc[condition, colNow] = dfCurrNew.iloc[0][colNow]

  df = df.dropna()

  return df

#removes unneccesary stats
def removeUnwanted(dfPos, pos):
  dfPos = dfPos.drop(columns=['season',"GP", "season_type", "fantasy_points", "player_display_name", "player_id", "team", "position", "status", "rec/g", "c/g", "y/g"])
  return dfPos

#shifts data forward one year
def makeCorrectShift(df):
  shifters = ['player_id', 'season', 'player_display_name', 'team', 'GP', 'position',
       'season_type', 'carries', 'rushing_yards', 'rushing_tds',
       'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs',
       'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets',
       'receiving_yards', 'receiving_tds', 'receiving_fumbles',
       'receiving_fumbles_lost', 'receiving_air_yards',
       'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
       'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
       'wopr_x', 'special_teams_tds', 'fantasy_points', 'games', 'tgt_sh',
       'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh', 'rfd_sh', 'rtdfd_sh',
       'dom', 'w8dom', 'yptmpa', 'ppr_sh', 'age', 'status', 'y/c', 'y/rec', 'y/tgt', 'catch %', 'touches', 'y/touch',
       'rrtd', 'rb', 'wrte', 'receiving_AV', 'rushing_AV']
  
  #adds target variable
  df["targetPPG"] = df["PPG"]
  
  #shifts it forward a year (for example 2011 goes to 2012)
  df[shifters] = df.groupby('player_display_name')[shifters].shift(1)
  df = df.dropna()

  return df

#where machine learning is done. returns the model and score.
from sklearn.inspection import permutation_importance

def machineLearning(df, arr, dictParam):
    # Define predictors excluding the target variable
    predictors = [col for col in df.columns if col != "targetPPG"]

    # Split the data
    x = df[predictors].values
    y = df["targetPPG"].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

    # Initialize and train MLPRegressor
    mlp = MLPRegressor(hidden_layer_sizes=dictParam["hidden_layer_sizes"], activation=dictParam["activation"], solver=dictParam["solver"], max_iter=dictParam["max_iter"])
    mlp.fit(x_train, y_train)

    # Evaluate the model
    predict_test = mlp.predict(x_test)
    mae = mean_absolute_error(y_test, predict_test)

    predict_test_unscaled = predict_test * (arr[1] - arr[0]) + arr[0]
    y_test_unscaled = y_test * (arr[1] - arr[0]) + arr[0]

    print("Predicted vs Actual PPG (unscaled):")
    for pred, actual in zip(predict_test_unscaled, y_test_unscaled):
        print(f"Predicted: {pred:.2f}, Actual: {actual:.2f}")

    # Calculate permutation importance
    r = permutation_importance(mlp, x_test, y_test, n_repeats=100, random_state=0)

    # Organize importances
    importance_dict = {name: score for name, score in zip(predictors, r.importances_mean)}
    sorted_importances = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

    # Print sorted importances
    print("Feature importances in descending order:")
    for feature, importance in sorted_importances:
        print(f"{feature}: {importance}")
    


    return [mae, mlp, sorted_importances]

# Example usage of the modified function


def getBestParams(df, arr):

  #make the predictors and data and test sets correctly
  predictors = [col for col in df.columns if col != "targetPPG"]
  x = df[predictors].values
  y = df["targetPPG"].values
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

  
  #make the parameters to search over. for hidden_layer_sizes, I experimented with alot and the ones listed now is just final set of experiment.
  
  grid = {
      'hidden_layer_sizes': [(32,32), (64,32), (64), (64,64)],
      'activation': ['tanh', 'identity', 'logistic', 'relu'],
      'solver': ['adam', 'sgd', 'lbfgs'],
      'max_iter': [100, 200, 500]
  }

  #create an MLPRegressor object
  mlp = MLPRegressor()

  #create a GridSearchCV object and fit it to the training data
  grid_search = GridSearchCV(mlp, param_grid=grid, cv=5, n_jobs=-1)
  grid_search.fit(x_train, y_train)

  print("Best things:", grid_search.best_params_)

  #the best model to make predictions on the test data and evaluate performance
  y_pred = grid_search.predict(x_test)

  #inverse transform the scaled predictions to get the original scale, uses a reverse of original formula
  for i in range(len(y_pred)):
    y_pred[i] = (y_pred[i]*(arr[1] - arr[0])) + arr[0]
  for i in range(len(y_test)):
    y_test[i] = (y_test[i]*(arr[1] - arr[0])) + arr[0]


  print(mean_absolute_error(y_test, y_pred))

  return grid_search.best_params_

#gets original value for fantasy points for predictions.
def getScaleBack(df):
  #index of column
  column_index = df.columns.get_loc("PPG")

  #min value of column:
  min_value = df["PPG"].min()

  #scaling valye of column
  #scaling_factor = scaler.scale_[column_index]
  max_value = df["PPG"].max()

  #array to be used later to scale each data
  arr = [min_value, max_value]

  return arr

def test(df, model, arr):
  #make columns everything but target
  predictors = [col for col in df.columns if col != "targetPPG"]


  #make train and test sets
  x = df[predictors].values
  y = df["targetPPG"].values
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

  #make the predictions
  predict_test = model.predict(x_test)

  #inverse transform the scaled predictions to get the original scale by reversing formula
  for i in range(len(predict_test)):
    predict_test[i] = (predict_test[i]*(arr[1] - arr[0])) + arr[0]
  for i in range(len(y_test)):
    y_test[i] = (y_test[i]*(arr[1] - arr[0])) + arr[0]

  #average error 
  mae = mean_absolute_error(y_test, predict_test)
  print("test ", mae)

#if ppr is 0, than it is non ppr. if 1, then it is half ppr. if 2, full ppr. loops through each.
for ppr in [0,1,2]:

  dfFantasyCopy = dfFantasy.copy()

  dfFantasyCopy = correctData(dfFantasyCopy, ppr)

  dfFantasyCopy = putAV(dfFantasyCopy, dfGrades)

  dfFantasyCopy = makeCorrectShift(dfFantasyCopy)

  dfFantasyCopy = dfFantasyCopy.loc[dfFantasyCopy["season"] != 2012]

  dfFantasyCopy = removeUnwanted(dfFantasyCopy, "RB")

  dfFantasyCopy = dfFantasyCopy.reset_index(drop=True)

  #gets fantasy_points_ppr scale per each position
  scaleQB = getScaleBack(dfFantasyCopy)

  dfFantasyCopy[dfFantasyCopy.columns] = scaler.fit_transform(dfFantasyCopy[dfFantasyCopy.columns])

  #obtained by running the getBestParams function per each respective position
  paramQB = {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'max_iter': 300, 'solver': 'adam'}

  #makes array of model and score, then prints it
  qbArray = machineLearning(dfFantasyCopy, scaleQB, paramQB)
  num = qbArray[0]
  qbModel = qbArray[1]

  print("rb score(ppg off on average per player): ", num)
  print("")
  if ppr == 0:
      joblib.dump(qbModel, "rbModelNonPPR.joblib")
  elif ppr == 1:
      joblib.dump(qbModel, "rbModelHalfPPR.joblib")
  elif ppr == 2:
      joblib.dump(qbModel, "rbModelPPR.joblib")
#print(dfFantasyRB.columns)

Predicted vs Actual PPG (unscaled):
Predicted: 5.71, Actual: 5.28
Predicted: 10.19, Actual: 9.16
Predicted: 12.83, Actual: 12.51
Predicted: 12.71, Actual: 12.70
Predicted: 4.70, Actual: 4.91
Predicted: 9.29, Actual: 8.99
Predicted: 10.61, Actual: 11.01
Predicted: 3.64, Actual: 3.85
Predicted: 2.03, Actual: 2.32
Predicted: 6.49, Actual: 6.81
Predicted: 3.80, Actual: 3.22
Predicted: 6.51, Actual: 6.50
Predicted: 7.96, Actual: 8.26
Predicted: 6.39, Actual: 6.59
Predicted: 16.83, Actual: 19.66
Predicted: 5.79, Actual: 6.76
Predicted: 10.09, Actual: 10.45
Predicted: 5.50, Actual: 4.21
Predicted: 3.00, Actual: 4.01
Predicted: 7.73, Actual: 7.99
Predicted: 8.25, Actual: 7.01
Predicted: 3.89, Actual: 4.24
Predicted: 6.64, Actual: 6.48
Predicted: 2.36, Actual: 2.40
Predicted: 7.08, Actual: 8.24
Predicted: 6.15, Actual: 6.71
Predicted: 8.02, Actual: 8.23
Predicted: 3.72, Actual: 3.86
Predicted: 9.02, Actual: 8.38
Predicted: 7.99, Actual: 8.09
Predicted: 2.95, Actual: 2.71
Predicted: 10.43, Actua

In [7]:
#WRTE ML MODEL
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import MinMaxScaler

pd.options.mode.chained_assignment = None

import sklearn
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
import joblib


#scaler to scale data
scaler = MinMaxScaler()

#read csv files into pandas
dfFantasy = pd.read_csv("/Users/kmaran3/Dropbox/Darkhorse/final position group data/final_wrte_data.csv")
dfFantasy.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_cols = dfFantasy.select_dtypes(include=[np.number]).columns
for column in numeric_cols:
    dfFantasy[column].fillna(dfFantasy[column].mean(), inplace=True)
dfGrades = pd.read_csv("/Users/kmaran3/Dropbox/Darkhorse/approximate value data/AVbyPositionGroup.csv")

def correctData(df, pprTF):
  #cols to make per game
  cols = ['carries', 'rushing_yards', 'rushing_tds',
       'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_2pt_conversions', 'receptions', 'targets',
       'receiving_yards', 'receiving_tds', 'receiving_fumbles',
       'receiving_fumbles_lost', 'receiving_air_yards',
       'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
       'receiving_2pt_conversions', 'special_teams_tds', 'fantasy_points', 'touches', 'rrtd']

  #basing data if ppr or not
  if pprTF == 2:
    pass
  elif pprTF == 0:
    df.loc[:, "fantasy_points"] = df["fantasy_points"] - df["receptions"]
  elif pprTF == 1:
    df.loc[:, "fantasy_points"] = df["fantasy_points"] - (df["receptions"]/2)

    
  #adding ppg column
  df.loc[:, 'PPG'] = df['fantasy_points'] / df['GP']


  #make all columns in a per game basis
  for col in cols:
    df.loc[:, col] = df[col] / df['GP'] 


  #only players with more than 7 games.
  df = df[df.GP > 7]
  df = df[df.fantasy_points >= 0]

  df = df[df.PPG > 2]
  

  return df

def putAV(df, dfAV):
  #years to iterate through
  yearsBig = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
  teams = ["crd", "atl", "rav", "buf", "car", "chi", "cin", "cle", "dal", "den", "det", "gnb", "htx", "clt", "jax", "kan", "rai", "sdg", "ram", "mia", "min", "nwe", "nor", "nyg", "nyj", "phi", "pit", "sfo", "sea", "tam", "oti", "was"]

  #columns wanted to add
  columns = ["oline", "rb", "wrte", "qb", "dst"]
  df[columns] = np.nan

  #gets rid of nan for these columns, as when we drop nan we want to save these.
  for colNow in columns:
    df.loc[df["season"] == 2013, colNow] = "no"



  #iterates through years
  for year in yearsBig:

    #assigns df of AVS to only include current year
    dfCurr = dfAV[dfAV.season == year].copy()
    
    #iterates through team list
    for teamNow in teams:
      #makes the df
      dfCurrNew = dfCurr[dfCurr.team == teamNow]
      
      #iterates through columns and adds AV
      for colNow in columns:
        #locate correct year and team rows
        condition = (df["season"] == year) & (df["team"] == teamNow)

        #set these rows to correct values
        df.loc[condition, colNow] = dfCurrNew.iloc[0][colNow]

  df = df.dropna()

  return df

#removes unneccesary stats
def removeUnwanted(dfPos, pos):
  dfPos = dfPos.drop(columns=['season',"GP", "season_type", "fantasy_points", "player_display_name", "player_id", "team", "position", "status", "rec/g", "c/g", "y/g"])
  return dfPos

#shifts data forward one year
def makeCorrectShift(df):
  shifters = ['player_id', 'season', 'player_display_name', 'team', 'GP', 'position',
       'season_type', 'carries', 'rushing_yards', 'rushing_tds',
       'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs',
       'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets',
       'receiving_yards', 'receiving_tds', 'receiving_fumbles',
       'receiving_fumbles_lost', 'receiving_air_yards',
       'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
       'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
       'wopr_x', 'special_teams_tds', 'fantasy_points', 'games', 'tgt_sh',
       'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh', 'rfd_sh', 'rtdfd_sh',
       'dom', 'w8dom', 'yptmpa', 'ppr_sh', 'age', 'status', 'y/c', 'y/rec', 'y/tgt', 'catch %', 'touches', 'y/touch',
       'rrtd', 'rb', 'wrte', 'receiving_AV', 'rushing_AV']
  
  #adds target variable
  df["targetPPG"] = df["PPG"]
  
  #shifts it forward a year (for example 2011 goes to 2012)
  df[shifters] = df.groupby('player_display_name')[shifters].shift(1)
  df = df.dropna()

  return df

#where machine learning is done. returns the model and score.
from sklearn.inspection import permutation_importance

def machineLearning(df, arr, dictParam):
    # Define predictors excluding the target variable
    predictors = [col for col in df.columns if col != "targetPPG"]

    # Split the data
    x = df[predictors].values
    y = df["targetPPG"].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

    # Initialize and train MLPRegressor
    mlp = MLPRegressor(hidden_layer_sizes=dictParam["hidden_layer_sizes"], activation=dictParam["activation"], solver=dictParam["solver"], max_iter=dictParam["max_iter"])
    mlp.fit(x_train, y_train)

    # Evaluate the model
    predict_test = mlp.predict(x_test)
    mae = mean_absolute_error(y_test, predict_test)

    predict_test_unscaled = predict_test * (arr[1] - arr[0]) + arr[0]
    y_test_unscaled = y_test * (arr[1] - arr[0]) + arr[0]

    print("Predicted vs Actual PPG (unscaled):")
    for pred, actual in zip(predict_test_unscaled, y_test_unscaled):
        print(f"Predicted: {pred:.2f}, Actual: {actual:.2f}")

    # Calculate permutation importance
    r = permutation_importance(mlp, x_test, y_test, n_repeats=100, random_state=0)

    # Organize importances
    importance_dict = {name: score for name, score in zip(predictors, r.importances_mean)}
    sorted_importances = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

    # Print sorted importances
    print("Feature importances in descending order:")
    for feature, importance in sorted_importances:
        print(f"{feature}: {importance}")
    


    return [mae, mlp, sorted_importances]

# Example usage of the modified function


def getBestParams(df, arr):

  #make the predictors and data and test sets correctly
  predictors = [col for col in df.columns if col != "targetPPG"]
  x = df[predictors].values
  y = df["targetPPG"].values
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

  
  #make the parameters to search over. for hidden_layer_sizes, I experimented with alot and the ones listed now is just final set of experiment.
  
  grid = {
      'hidden_layer_sizes': [(32,32), (64,32), (64), (64,64)],
      'activation': ['tanh', 'identity', 'logistic', 'relu'],
      'solver': ['adam', 'sgd', 'lbfgs'],
      'max_iter': [100, 200, 500]
  }

  #create an MLPRegressor object
  mlp = MLPRegressor()

  #create a GridSearchCV object and fit it to the training data
  grid_search = GridSearchCV(mlp, param_grid=grid, cv=5, n_jobs=-1)
  grid_search.fit(x_train, y_train)

  print("Best things:", grid_search.best_params_)

  #the best model to make predictions on the test data and evaluate performance
  y_pred = grid_search.predict(x_test)

  #inverse transform the scaled predictions to get the original scale, uses a reverse of original formula
  for i in range(len(y_pred)):
    y_pred[i] = (y_pred[i]*(arr[1] - arr[0])) + arr[0]
  for i in range(len(y_test)):
    y_test[i] = (y_test[i]*(arr[1] - arr[0])) + arr[0]


  print(mean_absolute_error(y_test, y_pred))

  return grid_search.best_params_

#gets original value for fantasy points for predictions.
def getScaleBack(df):
  #index of column
  column_index = df.columns.get_loc("PPG")

  #min value of column:
  min_value = df["PPG"].min()

  #scaling valye of column
  #scaling_factor = scaler.scale_[column_index]
  max_value = df["PPG"].max()

  #array to be used later to scale each data
  arr = [min_value, max_value]

  return arr

def test(df, model, arr):
  #make columns everything but target
  predictors = [col for col in df.columns if col != "targetPPG"]


  #make train and test sets
  x = df[predictors].values
  y = df["targetPPG"].values
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

  #make the predictions
  predict_test = model.predict(x_test)

  #inverse transform the scaled predictions to get the original scale by reversing formula
  for i in range(len(predict_test)):
    predict_test[i] = (predict_test[i]*(arr[1] - arr[0])) + arr[0]
  for i in range(len(y_test)):
    y_test[i] = (y_test[i]*(arr[1] - arr[0])) + arr[0]

  #average error 
  mae = mean_absolute_error(y_test, predict_test)
  print("test ", mae)

#if ppr is 0, than it is non ppr. if 1, then it is half ppr. if 2, full ppr. loops through each.
for ppr in [0,1,2]:

  dfFantasyCopy = dfFantasy.copy()

  dfFantasyCopy = correctData(dfFantasyCopy, ppr)

  dfFantasyCopy = putAV(dfFantasyCopy, dfGrades)

  dfFantasyCopy = makeCorrectShift(dfFantasyCopy)

  dfFantasyCopy = dfFantasyCopy.loc[dfFantasyCopy["season"] != 2012]

  dfFantasyCopy = removeUnwanted(dfFantasyCopy, "WRTE")

  dfFantasyCopy = dfFantasyCopy.reset_index(drop=True)

  #gets fantasy_points_ppr scale per each position
  scaleQB = getScaleBack(dfFantasyCopy)

  dfFantasyCopy[dfFantasyCopy.columns] = scaler.fit_transform(dfFantasyCopy[dfFantasyCopy.columns])

  #obtained by running the getBestParams function per each respective position
  paramQB = {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'max_iter': 300, 'solver': 'adam'}

  #makes array of model and score, then prints it
  qbArray = machineLearning(dfFantasyCopy, scaleQB, paramQB)
  num = qbArray[0]
  qbModel = qbArray[1]

  print("wrte score(ppg off on average per player): ", num)
  print("")
  if ppr == 0:
      joblib.dump(qbModel, "wrteModelNonPPR.joblib")
  elif ppr == 1:
      joblib.dump(qbModel, "wrteModelHalfPPR.joblib")
  elif ppr == 2:
      joblib.dump(qbModel, "wrteModelPPR.joblib")
#print(dfFantasyRB.columns)

Predicted vs Actual PPG (unscaled):
Predicted: 4.80, Actual: 4.56
Predicted: 4.97, Actual: 5.04
Predicted: 4.55, Actual: 4.58
Predicted: 2.84, Actual: 2.81
Predicted: 4.56, Actual: 5.25
Predicted: 3.06, Actual: 3.08
Predicted: 3.47, Actual: 3.87
Predicted: 4.71, Actual: 4.95
Predicted: 2.31, Actual: 2.39
Predicted: 3.44, Actual: 3.60
Predicted: 2.66, Actual: 2.39
Predicted: 6.60, Actual: 6.85
Predicted: 2.63, Actual: 2.61
Predicted: 4.67, Actual: 4.68
Predicted: 3.93, Actual: 3.75
Predicted: 2.32, Actual: 2.27
Predicted: 3.20, Actual: 3.36
Predicted: 2.85, Actual: 2.48
Predicted: 2.08, Actual: 2.01
Predicted: 2.70, Actual: 2.68
Predicted: 6.13, Actual: 6.24
Predicted: 6.09, Actual: 6.49
Predicted: 5.09, Actual: 4.57
Predicted: 3.92, Actual: 3.58
Predicted: 3.42, Actual: 3.47
Predicted: 2.89, Actual: 2.89
Predicted: 2.75, Actual: 2.21
Predicted: 2.10, Actual: 2.13
Predicted: 2.80, Actual: 2.27
Predicted: 4.18, Actual: 4.50
Predicted: 3.11, Actual: 2.96
Predicted: 6.82, Actual: 6.42
Pred

In [8]:
#K ML MODEL
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import MinMaxScaler

pd.options.mode.chained_assignment = None

import sklearn
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
import joblib


#scaler to scale data
scaler = MinMaxScaler()

#read csv files into pandas
KdfFantasy = pd.read_csv("/Users/kmaran3/Dropbox/Darkhorse/final position group data/final_kicking_data.csv")
KdfFantasy.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_cols = KdfFantasy.select_dtypes(include=[np.number]).columns
for column in numeric_cols:
    KdfFantasy[column].fillna(KdfFantasy[column].mean(), inplace=True)
dfGrades = pd.read_csv("/Users/kmaran3/Dropbox/Darkhorse/approximate value data/AVbyPositionGroup.csv")

def correctData(df, pprTF):
  #cols to make per game
  cols = ['Age','FGA1', 'FGM1', 'FGA2',
       'FGM2', 'FGA3', 'FGM3', 'FGA4', 'FGM4', 'FGA5', 'FGM5', 'FGA', 'FGM',
       'XPA', 'XPM']

  #basing data if ppr or not
  if pprTF == 2:
    pass
  elif pprTF == 0:
    pass
  elif pprTF == 1:
    pass

    
  #adding ppg column
  df.loc[:, 'PPG'] = df['fantasy_points'] / df['G']


  #make all columns in a per game basis
  for col in cols:
    df.loc[:, col] = df[col] / df['G'] 


  #only players with more than 7 games.
  df = df[df.G > 7]
  df = df[df.fantasy_points >= 0]

  df = df[df.PPG > 1.5]
  

  return df

def putAV(df, dfAV):
  #years to iterate through
  yearsBig = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
  teams = ["crd", "atl", "rav", "buf", "car", "chi", "cin", "cle", "dal", "den", "det", "gnb", "htx", "clt", "jax", "kan", "rai", "sdg", "ram", "mia", "min", "nwe", "nor", "nyg", "nyj", "phi", "pit", "sfo", "sea", "tam", "oti", "was"]

  #columns wanted to add
  columns = ["oline", "rb", "wrte", "dst"]
  df[columns] = np.nan

  #gets rid of nan for these columns, as when we drop nan we want to save these.
  for colNow in columns:
    df.loc[df["season"] == 2013, colNow] = "no"



  #iterates through years
  for year in yearsBig:

    #assigns df of AVS to only include current year
    dfCurr = dfAV[dfAV.season == year].copy()
    
    #iterates through team list
    for teamNow in teams:
      #makes the df
      dfCurrNew = dfCurr[dfCurr.team == teamNow]
      
      #iterates through columns and adds AV
      for colNow in columns:
        #locate correct year and team rows
        condition = (df["season"] == year) & (df["team"] == teamNow)

        #set these rows to correct values
        df.loc[condition, colNow] = dfCurrNew.iloc[0][colNow]

  df = df.dropna()

  return df

#removes unneccesary stats
def removeUnwanted(dfPos, pos):
  dfPos = dfPos.drop(columns=['season',"G", "GS", "fantasy_points", "Player", "team", "Pos"])
  return dfPos

#shifts data forward one year
def makeCorrectShift(df):
  shifters = ['PPG','Player', 'team', 'Age', 'Pos', 'G', 'GS', 'FGA1', 'FGM1', 'FGA2',
       'FGM2', 'FGA3', 'FGM3', 'FGA4', 'FGM4', 'FGA5', 'FGM5', 'FGA', 'FGM',
       'Lng', 'FG%', 'XPA', 'XPM', 'XP%', 'season', 'AV', 'fantasy_points']
  
  #adds target variable
  df["targetPPG"] = df["PPG"]
  
  #shifts it forward a year (for example 2011 goes to 2012)
  df[shifters] = df.groupby('Player')[shifters].shift(1)
  df = df.dropna()

  return df

#where machine learning is done. returns the model and score.
from sklearn.inspection import permutation_importance

def machineLearning(df, arr, dictParam):
    # Define predictors excluding the target variable
    predictors = [col for col in df.columns if col != "targetPPG"]

    # Split the data
    x = df[predictors].values
    y = df["targetPPG"].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

    # Initialize and train MLPRegressor
    mlp = MLPRegressor(hidden_layer_sizes=dictParam["hidden_layer_sizes"], activation=dictParam["activation"], solver=dictParam["solver"], max_iter=dictParam["max_iter"])
    mlp.fit(x_train, y_train)

    # Evaluate the model
    predict_test = mlp.predict(x_test)
    mae = mean_absolute_error(y_test, predict_test)

    predict_test_unscaled = predict_test * (arr[1] - arr[0]) + arr[0]
    y_test_unscaled = y_test * (arr[1] - arr[0]) + arr[0]

    print("Predicted vs Actual PPG (unscaled):")
    for pred, actual in zip(predict_test_unscaled, y_test_unscaled):
        print(f"Predicted: {pred:.2f}, Actual: {actual:.2f}")

    # Calculate permutation importance
    r = permutation_importance(mlp, x_test, y_test, n_repeats=100, random_state=0)

    # Organize importances
    importance_dict = {name: score for name, score in zip(predictors, r.importances_mean)}
    sorted_importances = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

    # Print sorted importances
    print("Feature importances in descending order:")
    for feature, importance in sorted_importances:
        print(f"{feature}: {importance}")
    


    return [mae, mlp, sorted_importances]

# Example usage of the modified function


def getBestParams(df, arr):

  #make the predictors and data and test sets correctly
  predictors = [col for col in df.columns if col != "targetPPG"]
  x = df[predictors].values
  y = df["targetPPG"].values
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

  
  #make the parameters to search over. for hidden_layer_sizes, I experimented with alot and the ones listed now is just final set of experiment.
  
  grid = {
      'hidden_layer_sizes': [(32,32), (64,32), (64), (64,64)],
      'activation': ['tanh', 'identity', 'logistic', 'relu'],
      'solver': ['adam', 'sgd', 'lbfgs'],
      'max_iter': [100, 200, 500]
  }

  #create an MLPRegressor object
  mlp = MLPRegressor()

  #create a GridSearchCV object and fit it to the training data
  grid_search = GridSearchCV(mlp, param_grid=grid, cv=5, n_jobs=-1)
  grid_search.fit(x_train, y_train)

  print("Best things:", grid_search.best_params_)

  #the best model to make predictions on the test data and evaluate performance
  y_pred = grid_search.predict(x_test)

  #inverse transform the scaled predictions to get the original scale, uses a reverse of original formula
  for i in range(len(y_pred)):
    y_pred[i] = (y_pred[i]*(arr[1] - arr[0])) + arr[0]
  for i in range(len(y_test)):
    y_test[i] = (y_test[i]*(arr[1] - arr[0])) + arr[0]


  print(mean_absolute_error(y_test, y_pred))

  return grid_search.best_params_

#gets original value for fantasy points for predictions.
def getScaleBack(df):
  #index of column
  column_index = df.columns.get_loc("PPG")

  #min value of column:
  min_value = df["PPG"].min()

  #scaling valye of column
  #scaling_factor = scaler.scale_[column_index]
  max_value = df["PPG"].max()

  #array to be used later to scale each data
  arr = [min_value, max_value]

  return arr

def test(df, model, arr):
  #make columns everything but target
  predictors = [col for col in df.columns if col != "targetPPG"]


  #make train and test sets
  x = df[predictors].values
  y = df["targetPPG"].values
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

  #make the predictions
  predict_test = model.predict(x_test)

  #inverse transform the scaled predictions to get the original scale by reversing formula
  for i in range(len(predict_test)):
    predict_test[i] = (predict_test[i]*(arr[1] - arr[0])) + arr[0]
  for i in range(len(y_test)):
    y_test[i] = (y_test[i]*(arr[1] - arr[0])) + arr[0]

  #average error 
  mae = mean_absolute_error(y_test, predict_test)
  print("test ", mae)

#if ppr is 0, than it is non ppr. if 1, then it is half ppr. if 2, full ppr. loops through each.
for ppr in [0,1,2]:

  KdfFantasyCopy = KdfFantasy.copy()

  KdfFantasyCopy = correctData(KdfFantasyCopy, ppr)

  KdfFantasyCopy = putAV(KdfFantasyCopy, dfGrades)

  KdfFantasyCopy = makeCorrectShift(KdfFantasyCopy)

  KdfFantasyCopy = KdfFantasyCopy.loc[KdfFantasyCopy["season"] != 2012]

  KdfFantasyCopy = removeUnwanted(KdfFantasyCopy, "K")

  KdfFantasyCopy = KdfFantasyCopy.reset_index(drop=True)

  #gets fantasy_points_ppr scale per each position
  scaleK = getScaleBack(KdfFantasyCopy)

  KdfFantasyCopy[KdfFantasyCopy.columns] = scaler.fit_transform(KdfFantasyCopy[KdfFantasyCopy.columns])

  #obtained by running the getBestParams function per each respective position
  paramK = {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'max_iter': 300, 'solver': 'adam'}

  #makes array of model and score, then prints it
  KArray = machineLearning(KdfFantasyCopy, scaleK, paramK)
  num = KArray[0]
  KModel = KArray[1]

  print("k score(ppg off on average per player): ", num)
  if ppr == 0:
      joblib.dump(KModel, "kModelNonPPR.joblib")
  elif ppr == 1:
      joblib.dump(KModel, "kModelHalfPPR.joblib")
  elif ppr == 2:
      joblib.dump(KModel, "kModelPPR.joblib")


Predicted vs Actual PPG (unscaled):
Predicted: 10.58, Actual: 9.29
Predicted: 10.45, Actual: 10.10
Predicted: 10.43, Actual: 11.35
Predicted: 10.07, Actual: 10.82
Predicted: 9.61, Actual: 8.24
Predicted: 9.50, Actual: 7.92
Predicted: 9.37, Actual: 5.92
Predicted: 9.66, Actual: 10.31
Predicted: 9.93, Actual: 8.60
Predicted: 11.02, Actual: 10.97
Predicted: 10.08, Actual: 9.06
Predicted: 9.36, Actual: 10.05
Predicted: 9.43, Actual: 4.41
Predicted: 8.96, Actual: 7.83
Predicted: 11.21, Actual: 9.93
Predicted: 8.28, Actual: 8.89
Predicted: 8.41, Actual: 9.15
Predicted: 9.32, Actual: 8.52
Predicted: 8.02, Actual: 5.65
Predicted: 9.46, Actual: 9.88
Predicted: 9.64, Actual: 9.13
Predicted: 9.56, Actual: 10.23
Predicted: 9.78, Actual: 8.18
Predicted: 10.97, Actual: 10.75
Predicted: 10.10, Actual: 8.96
Predicted: 8.79, Actual: 9.24
Predicted: 10.23, Actual: 8.95
Predicted: 10.35, Actual: 8.47
Predicted: 9.56, Actual: 9.95
Predicted: 8.03, Actual: 7.99
Predicted: 10.98, Actual: 10.85
Feature impor