## Linear Regression

Train

In [16]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Create a DataFrame with your expanded dataset
data = pd.DataFrame({
    'Week': [1, 2, 1, 2],
    'Player': ['J', 'A', 'A', 'J'],
    'Pts': [15, 17, 19, 25],
    'Yds': [120, 80, 150, 100],  # Example Yds values
    'Tds': [2, 1, 3, 2]  # Example Tds values
})

# Convert categorical 'Player' variable into numerical using one-hot encoding
data = pd.get_dummies(data, columns=['Player'])

# Separate the dataset into features (X) and the target variable (y)
X = data.drop(['Pts', 'Yds', 'Tds'], axis=1)  # Keep 'Player' and 'Week' for prediction
y = data[['Pts', 'Yds', 'Tds']]  # Predict multiple columns

# Create and train a linear regression model for each target variable
models = {}
for target in y.columns:
    model = LinearRegression()
    model.fit(X, y[target])
    models[target] = model

data.head(10)


Unnamed: 0,Week,Pts,Yds,Tds,Player_A,Player_J
0,1,15,120,2,False,True
1,2,17,80,1,True,False
2,1,19,150,3,True,False
3,2,25,100,2,False,True


Predict

In [22]:

import pandas as pd
from sklearn.linear_model import LinearRegression

# Create a DataFrame with your expanded dataset
data = pd.DataFrame({
    'Week': [1, 2, 1, 2],
    'Player': ['J', 'B', 'B', 'J'],
    'Pts': [15, 17, 19, 25],
    'Yds': [120, 80, 150, 100],  # Example Yds values
    'Tds': [2, 1, 3, 2]  # Example Tds values
})

df = data
# Convert categorical 'Player' variable into numerical using one-hot encoding
data = pd.get_dummies(data, columns=['Player'], drop_first=False)

# Separate the dataset into features (X) and the target variable (y)
X = data.drop(['Pts', 'Yds', 'Tds'], axis=1)
y = data[['Pts', 'Yds', 'Tds']]

# Create and train a linear regression model for each player and each target variable
player_models = {}
for player in df['Player'].unique():
    player_data = data[data['Player_' + player] == 1]
    X_player = player_data.drop(['Pts', 'Yds', 'Tds'], axis=1)
    y_player = player_data[['Pts', 'Yds', 'Tds']]
    
    models = {}
    for target in y.columns:
        model = LinearRegression()
        model.fit(X_player, y_player[target])
        models[target] = model
    
    player_models[player] = models

# Now, you can make predictions for week 3 with the new features for each player
week3_data = pd.DataFrame({
    'Week': [3],
    'Player_B': [1],  # Set the corresponding player's column to 1, and others to 0 for prediction
    'Player_J': [0],  # Ensure the player columns match those from training data
})

predicted_data_week3 = {}
for player in df['Player'].unique():
    predictions = {}
    for target in y.columns:
        predictions[target] = player_models[player][target].predict(week3_data)[0]
    predicted_data_week3[player] = predictions

# Print the predictions for each player and each target variable
print("Predicted Data for Week 3:")
for player in df['Player'].unique():
    print(f"Player {player}:")
    for target, value in predicted_data_week3[player].items():
        print(f"{target}: {value:.2f}")



Predicted Data for Week 3:
Player J:
Pts: 35.00
Yds: 80.00
Tds: 2.00
Player B:
Pts: 15.00
Yds: 10.00
Tds: -1.00


## Random Forest with GridSearchCV hyperparemeter tuning and no regularization

In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

# Load your dataset
data = pd.read_csv("datasets/weekly_scoring.csv")

# Preprocessing
data = data[data['POS'] == 'qb']
data = data.drop(columns=['POS', 'MISC G', 'MISC ROST', 'RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
 'RECEIVING LG', 'RECEIVING 20+', 'RECEIVING TD', 'RUSHING Y/A', 'RUSHING LG',
 'RUSHING 20+'])
data = pd.get_dummies(data, columns=['PLAYER'], drop_first=False)

# Identify columns with missing values before imputation
columns_with_missing = data.columns[data.isnull().any()].tolist()

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Define the list of variables to predict
var_list = ['PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 'PASSING Y/A', 'PASSING TD', 'PASSING INT',
            'PASSING SACKS', 'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'MISC FPTS/G', 'WEEK ']

# Separate the dataset into features (X) and the target variable (y)
X = data.drop(var_list, axis=1)
y = data[var_list]

# Save a copy of the dataset
df = data

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)

# Fit the model with the best hyperparameters
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X, y)

# Initialize a DataFrame with zeros
week6_data = pd.DataFrame(0, index=range(1), columns=X.columns)

print("Predicted Points for Week 6:")
for player in week6_data.columns:
    # Clean the player name by removing special characters, spaces, and parentheses
    # player_cleaned = ''.join(e for e in player if (e.isalnum() or e.isspace()))
    week6_data[player] = 1  # Set the corresponding player's column to 1 for prediction
    predictions = best_rf_model.predict(week6_data)
    
    for i, column in enumerate(var_list):
        prediction = predictions[0][i]
        print(f"Player {player}, {column}: {prediction:.2f}")
    week6_data[player] = 0


Predicted Points for Week 6:
Player POS RANK, PASSING CMP: 26.75
Player POS RANK, PASSING ATT: 37.82
Player POS RANK, PASSING PCT: 70.51
Player POS RANK, PASSING YDS: 351.25
Player POS RANK, PASSING Y/A: 9.54
Player POS RANK, PASSING TD: 3.02
Player POS RANK, PASSING INT: 0.48
Player POS RANK, PASSING SACKS: 1.58
Player POS RANK, RUSHING ATT: 5.37
Player POS RANK, RUSHING YDS: 25.79
Player POS RANK, RUSHING TD: 0.40
Player POS RANK, MISC FL: 0.26
Player POS RANK, MISC FPTS: 30.24
Player POS RANK, MISC FPTS/G: 30.24
Player POS RANK, WEEK : 2.96
Player PLAYER_AJ McCarron (CIN), PASSING CMP: 26.75
Player PLAYER_AJ McCarron (CIN), PASSING ATT: 37.82
Player PLAYER_AJ McCarron (CIN), PASSING PCT: 70.51
Player PLAYER_AJ McCarron (CIN), PASSING YDS: 351.25
Player PLAYER_AJ McCarron (CIN), PASSING Y/A: 9.54
Player PLAYER_AJ McCarron (CIN), PASSING TD: 3.02
Player PLAYER_AJ McCarron (CIN), PASSING INT: 0.48
Player PLAYER_AJ McCarron (CIN), PASSING SACKS: 1.58
Player PLAYER_AJ McCarron (CIN), RUS

In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

# Load your dataset
data = pd.read_csv("datasets/weekly_scoring.csv")

# Preprocessing
data = data[data['POS'] == 'qb']
data = data.drop(columns=['POS RANK', 'POS', 'MISC G', 'MISC ROST', 'MISC FPTS/G', 'RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
 'RECEIVING LG', 'RECEIVING 20+', 'RECEIVING TD', 'RUSHING Y/A', 'RUSHING LG',
 'RUSHING 20+'])
data = pd.get_dummies(data, columns=['PLAYER'], drop_first=True)

# Identify columns with missing values before imputation
columns_with_missing = data.columns[data.isnull().any()].tolist()

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Define the list of variables to predict
var_list = ['PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 'PASSING Y/A', 'PASSING TD', 'PASSING INT',
            'PASSING SACKS', 'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK ']

# Separate the dataset into features (X) and the target variable (y)
X = data.drop(var_list, axis=1)
y = data['MISC FPTS']

# Save a copy of the dataset
df = data

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)

# Fit the model with the best hyperparameters
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X, y)

# Get a list of unique player names after one-hot encoding
unique_players = X.columns

# Create a list of dictionaries to store the results
results_list = []

for player in unique_players:
    # Create a DataFrame with all zeros
    week6_data = pd.DataFrame(0, index=range(1), columns=X.columns)
    # Set the corresponding player's column to 1 for prediction
    week6_data[player] = 1
    # Make a prediction for the player
    misc_fpts_prediction = best_rf_model.predict(week6_data)
    results_list.append({'Player': player, 'MISC FPTS': misc_fpts_prediction[0]})

# Convert the list of dictionaries into a DataFrame
results_df = pd.DataFrame(results_list)

results_df.head(10)

# Save the results to a CSV file
# results_df.to_csv('fantasy_football_predictions.csv', index=False)


Unnamed: 0,Player,MISC FPTS
0,PLAYER_Aaron Rodgers (NYJ),0.0
1,PLAYER_Adam Froman (ATL),0.0
2,PLAYER_Aidan O'Connell (LV),2.178488
3,PLAYER_Alex McGough (GB),0.0
4,PLAYER_Andy Dalton (CAR),4.750914
5,PLAYER_Anthony Richardson (IND),15.362299
6,PLAYER_Bailey Zappe (NE),0.592155
7,PLAYER_Baker Mayfield (TB),13.837784
8,PLAYER_Ben Chappell (WAS),0.0
9,PLAYER_Ben DiNucci (DEN),0.0
