In [15]:
# Star import stays first so the explicit imports below win on any name clash.
from vars import *
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Loading the data

In [16]:
# Read the training and test splits from disk into two DataFrames.
train, test = map(pd.read_csv, ('datasets/train.csv', 'datasets/test.csv'))

### Feature Engineering

Jeff has added the team name and opposing team name to the dataset. We're also interested in compiling, for each player, their salary, average points, max/min points, and point variability (standard deviation).

In [4]:
# this code will compile average player statistics for the training data
# player_stats = data.groupby('PLAYER')['MISC FPTS'].agg(
#     AVG_FPTS = 'mean',
#     MIN_FPTS = 'min',
#     MAX_FPTS = 'max',
#     VAR_FPTS = 'std'
# ).reset_index()

# data = pd.merge(data, player_stats, on='PLAYER', how='left')

We will split the data into quarterbacks and all other players (running backs, tight ends, and wide receivers).

In [9]:
# Columns removed from the quarterback training frame: receiving stats, some
# rushing detail, and bookkeeping fields (rank, roster %, date, year, weight).
qb_dropped_columns = [
    'POS RANK', 'POS', 'MISC G', 'MISC ROST', 'MISC FPTS/G',
    'RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
    'RECEIVING LG', 'RECEIVING 20+', 'RECEIVING TD',
    'RUSHING Y/A', 'RUSHING LG', 'RUSHING 20+',
    'DATE', 'YEAR', 'WEIGHT',
]

# Keep only quarterback rows; the test frame is filtered but not trimmed here.
qb_train_data = train.loc[train['POS'].eq('qb')].drop(columns=qb_dropped_columns)
qb_test_data = test.loc[test['POS'].eq('qb')]

We'll want to store our response variable (fantasy points) separately.

In [10]:
# Pull the fantasy-points column out as the response, then remove it from the
# feature frame so it cannot be used as a predictor.
qb_train_y = qb_train_data.loc[:, 'MISC FPTS']
qb_train_data = qb_train_data.drop('MISC FPTS', axis=1)

In [11]:
# Show which columns remain in the quarterback feature frame after the drops above.
qb_train_data.columns

Index(['PLAYER', 'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS',
       'PASSING Y/A', 'PASSING TD', 'PASSING INT', 'PASSING SACKS',
       'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'WEEK', 'TEAM', 'OPP',
       'AVG_FPTS', 'MIN_FPTS', 'MAX_FPTS', 'VAR_FPTS'],
      dtype='object')

### Standard Scaling

Before scaling, one-hot encode the categorical variables (team, opponent, and player).

In [14]:
# One-hot encode the categorical columns of the quarterback training features.
# `qb_data` is built from `qb_train_data` (defined in the split cell above) so
# that this cell works on a fresh kernel; previously it read an undefined
# `qb_data` and only ran when that name was left over from an earlier session.
qb_data = pd.get_dummies(qb_train_data, columns=['TEAM', 'OPP', 'PLAYER'])

{}

In [None]:
# Standardise every column to zero mean / unit variance.
scaler = StandardScaler()

# Keep the scaled values (the result used to be thrown away, so the PCA cell
# below was fitted on unscaled data). They are stored back into `qb_data`, with
# the original column labels and index, because the following cells read that
# name. Re-running this cell is harmless: standardising already-standardised
# data leaves it unchanged up to floating-point error.
# TODO: fit on the training data only, then apply the fitted scaler to the
# test data with scaler.transform(...).
qb_data = pd.DataFrame(
    scaler.fit_transform(qb_data),
    columns=qb_data.columns,
    index=qb_data.index,
)
qb_data.head()

Fit a PCA on the encoded quarterback features, keeping enough components to explain 95% of the variance.

In [None]:
# Fit a PCA that retains as many components as are needed to explain 95% of
# the variance in qb_data. `fit` returns the estimator itself, so the bare
# name on the last line displays the same repr as before.
pca = PCA(n_components=0.95).fit(qb_data)
pca

Make a scree plot showing the share of variance explained by each retained principal component.

In [None]:
# Share of total variance explained by each retained component. These are the
# PCA eigenvalues normalised by the total variance, not the raw eigenvalues.
eigen = pca.explained_variance_ratio_

# Number components from 1 so the x-axis matches its label; plotting the array
# on its own would start the axis at 0.
component_numbers = np.arange(1, len(eigen) + 1)

# savefig does not create missing directories.
Path('img').mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots()
ax.plot(component_numbers, eigen)
ax.set_xlabel('Number of components')
ax.set_ylabel('Explained variance ratio')
ax.set_title('Scree plot')
fig.savefig('img/scree_plot.png')

In [None]:
# Ask which week is being predicted; the number ends up in the output file name.
week_reply = input("Enter the week to predict: ")
num_week = int(week_reply)

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Positions to model. Only quarterbacks are enabled; "rb", "wr" and "te" are
# configured below and can be added back to this list.
positions = ["qb"]

target_col = 'MISC FPTS'
player_prefix = 'PLAYER_'  # prefix of the one-hot player columns created below

# Per-position column configuration:
#   'drop'  - columns removed from the train/test frames before modelling
#   'stats' - per-game outcome columns (incl. the target and WEEK) that must
#             not be used as predictors
# NOTE(review): the rb/wr/te drop lists keep TEAM and OPP. If those are text
# columns the mean imputer below will reject them - encode or drop them before
# enabling these positions.
receiver_columns = {
    'drop': [
        'POS RANK', 'POS', 'MISC G', 'MISC ROST', 'MISC FPTS/G',
        'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS',
        'PASSING Y/A', 'PASSING TD', 'PASSING INT', 'PASSING SACKS',
        'YEAR', 'WEIGHT', 'DATE', 'RUSHING Y/A', 'RUSHING LG', 'RUSHING 20+',
    ],
    'stats': [
        'RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
        'RECEIVING TD', 'RECEIVING LG', 'RECEIVING 20+',
        'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK',
    ],
}
position_columns = {
    'qb': {
        'drop': [
            'POS RANK', 'POS', 'MISC G', 'MISC ROST', 'MISC FPTS/G',
            'RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
            'RECEIVING LG', 'RECEIVING 20+', 'RECEIVING TD',
            'RUSHING Y/A', 'RUSHING LG', 'RUSHING 20+',
            'DATE', 'YEAR', 'WEIGHT', 'TEAM', 'OPP',
        ],
        'stats': [
            'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS',
            'PASSING Y/A', 'PASSING TD', 'PASSING INT', 'PASSING SACKS',
            'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK',
        ],
    },
    'rb': {
        'drop': [
            'POS RANK', 'POS', 'MISC G', 'MISC ROST', 'MISC FPTS/G',
            'PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS',
            'PASSING Y/A', 'PASSING TD', 'PASSING INT',
            'RECEIVING LG', 'RECEIVING 20+',
            'PASSING SACKS', 'YEAR', 'WEIGHT', 'DATE',
        ],
        'stats': [
            'RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
            'RECEIVING TD', 'RUSHING Y/A', 'RUSHING LG', 'RUSHING 20+',
            'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK',
        ],
    },
    'wr': receiver_columns,
    'te': receiver_columns,
}

# Read the splits once; each position filters its own rows from these frames.
train_raw = pd.read_csv("datasets/train.csv")
test_raw = pd.read_csv("datasets/test.csv")

# to_csv does not create missing directories.
Path("predictions").mkdir(parents=True, exist_ok=True)

for pos in positions:
    drop_cols = position_columns[pos]['drop']
    var_list = position_columns[pos]['stats']

    # Preprocessing: keep this position's rows and remove the unused columns
    # from the frames that are actually modelled. (The drops used to be applied
    # to a separate frame, leaving text columns in train/test.)
    train = train_raw[train_raw['POS'] == pos].drop(columns=drop_cols)
    test = test_raw[test_raw['POS'] == pos].drop(columns=drop_cols)
    if train.empty:
        print(f"[{pos}] no training rows; skipping.")
        continue

    # One indicator column per player. No level is dropped: the model has no
    # intercept, so dropping one would pin that player's baseline to zero and
    # leave them without a prediction row.
    train = pd.get_dummies(train, columns=['PLAYER'], prefix='PLAYER',
                           prefix_sep='_', dtype=float)
    test = pd.get_dummies(test, columns=['PLAYER'], prefix='PLAYER',
                          prefix_sep='_', dtype=float)
    # Give test exactly the training columns: players seen only in training get
    # an all-zero indicator, players seen only in test are discarded.
    test = test.reindex(columns=train.columns, fill_value=0.0)

    # Impute missing values with column means learned from the training data
    # only; re-fitting on the test split would leak test statistics.
    imputer = SimpleImputer(strategy='mean')
    train = pd.DataFrame(imputer.fit_transform(train), columns=train.columns)
    if not test.empty:
        test = pd.DataFrame(imputer.transform(test), columns=test.columns)

    # Separate features (X) from the target (y). Stat columns that are absent
    # from the data are reported instead of raising.
    missing_stats = [col for col in var_list if col not in train.columns]
    if missing_stats:
        print(f"[{pos}] stat columns not found, ignored: {missing_stats}")
    present_stats = [col for col in var_list if col in train.columns]
    XTrain = train.drop(columns=present_stats)
    XTest = test.drop(columns=present_stats)
    yTrain = train[target_col]
    yTest = test[target_col]

    # Plain least squares without an intercept: with a full set of player
    # indicators each player's coefficient acts as that player's own baseline.
    lr_model = LinearRegression(fit_intercept=False)
    lr_model.fit(XTrain, yTrain)

    print(f"[{pos}] features: {XTrain.shape[1]}, "
          f"train R^2: {lr_model.score(XTrain, yTrain):.3f}")
    if not XTest.empty:
        print(f"[{pos}] test R^2: {lr_model.score(XTest, yTest):.3f}")

    # One prediction row per player: that player's indicator is 1, everything
    # else is 0.
    # NOTE(review): any non-player features left in XTrain are held at 0 here,
    # as in the original loop - confirm that is the intended baseline.
    player_cols = [col for col in XTrain.columns if col.startswith(player_prefix)]
    player_rows = pd.DataFrame(0.0, index=player_cols, columns=XTrain.columns)
    for col in player_cols:
        player_rows.at[col, col] = 1.0

    results_df = pd.DataFrame({
        'Player': [col[len(player_prefix):] for col in player_cols],
        'MISC FPTS': lr_model.predict(player_rows),
    })
    results_df = results_df.sort_values(by='MISC FPTS', ascending=False)

    # Save the ranked predictions; num_week comes from the prompt at the top of
    # this cell.
    file_name = f"predictions/LRweek{num_week}{pos}unweighted.csv"
    results_df.to_csv(file_name, index=False)