## Linear Regression

In [4]:
import warnings

warnings.filterwarnings('ignore')

# Get user input for the week number to predict
# num_week = int(input("Enter the week to predict: "))

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

positions = ["qb", "rb", "wr", "te"]
# positions = ["qb"]


for pos in positions:
    # Load your dataset
    data = pd.read_csv("datasets/weekly_scoring.csv")

    # Preprocessing
    data = data[data['POS'] == pos]
    weights = data['WEIGHT']
    
    if(pos == 'qb'):
        # Define the list of variables to predict
        var_list = ['PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 'PASSING Y/A', 'PASSING TD', 'PASSING INT',
        'PASSING SACKS', 'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'WEEK', 'AVG_FPTS', 'MAX_FPTS', 'MIN_FPTS', 'VAR_FPTS']
    if(pos == 'rb'):
        var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
        'RECEIVING TD', 'RUSHING Y/A', 'RUSHING LG',
        'RUSHING 20+', 'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK', 'AVG_FPTS', 'MAX_FPTS', 'MIN_FPTS', 'VAR_FPTS']
    if(pos == 'wr'):
        var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
        'RECEIVING TD', 'RECEIVING LG', 'RECEIVING 20+',
        'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK', 'AVG_FPTS', 'MAX_FPTS', 'MIN_FPTS', 'VAR_FPTS']
    if(pos == 'te'):
        var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
        'RECEIVING TD', 'RECEIVING LG', 'RECEIVING 20+',
        'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK', 'AVG_FPTS', 'MAX_FPTS', 'MIN_FPTS', 'VAR_FPTS']

# Without Player
    # # Extracting features and target variable
    # X = data[var_list]  # Ensure 'MISC FPTS' is not in var_list
    # y = data['MISC FPTS']

    # # Splitting the dataset into training and testing sets
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # # Creating and training the linear regression model
    # model = LinearRegression()
    # model.fit(X_train, y_train)

    # # Predicting on the test set
    # y_pred = model.predict(X_test)

    # # Evaluating the model (optional)
    # mse = mean_squared_error(y_test, y_pred)
    # print(f'Mean Squared Error: {mse}')

    # # Printing the coefficients
    # coefficients = pd.DataFrame({'Variable': X.columns, 'Coefficient': model.coef_})
    # print(coefficients)

# With Player
    # Extracting features and target variable (split X and Y)
    X = data[var_list]  # Ensure 'MISC FPTS' is not in var_list
    X['PLAYER'] = data['PLAYER']  # Include 'Player' as a feature

    y = data['MISC FPTS']

    # Handling missing values in numeric columns using SimpleImputer
    numeric_cols = X.select_dtypes(include='number').columns
    imputer = SimpleImputer(strategy='mean')  # You can change the strategy as needed
    X[numeric_cols] = imputer.fit_transform(X[numeric_cols])
    
    # Target encoding Player --> replaces the Player value with the mean FPTS for each player
    encoder = ce.TargetEncoder(cols=['PLAYER'])
    X = encoder.fit_transform(X, y)

    # One-hot encoding Player
    # X = pd.get_dummies(X, columns=['PLAYER'], drop_first=True)

    # Splitting the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standard Scale
    scaler = StandardScaler()
    # change this to only use the train data
    scaler.fit_transform(X_train)
    scaler.transform(X_test)
    
    # PCA
    pca = PCA(n_components = 0.95)
    pca.fit(X_train)
    # print(f'Explained variance ratios: {pca.explained_variance_ratio_}')
    # print(f'Components... {pca.components_}')

    pca_X_train = pca.transform(X_train)
    pca_X_test = pca.transform(X_test)

    # Creating and training the linear regression model
    model, model_pca = LinearRegression(), LinearRegression()
    model.fit(X_train, y_train)
    model_pca.fit(pca_X_train, y_train)

    # Predicting on the test set
    y_pred = model.predict(X_test)
    y_pred_pca = model_pca.predict(pca_X_test)

    # Evaluating the model (optional)
    mse = mean_squared_error(y_test, y_pred)
    print(y_pred[:5])
    print(y_test[:5])
    mse_pca = mean_squared_error(y_test, y_pred_pca)
    print(f'Mean Squared Error {pos}: {mse}')
    print(f'Mean Squared Error with PCA {pos}: {mse_pca}')

    # Printing the coefficients
    # coefficients = pd.DataFrame({'Variable': X.columns, 'Coefficient': model.coef_})
    # print(coefficients)

## Create result dataset
    # import re

    # data2 = pd.read_csv("datasets/weekly_scoring.csv")

    # # Predicting on the entire dataset
    # data['Predicted_FPTS'] = model.predict(X)

    # # Organizing the results into a new DataFrame
    # result_df = data[['PLAYER', 'Predicted_FPTS']].copy()

    # # Grouping by 'PLAYER' and calculating the average predicted FPTS
    # result_df = result_df.groupby('PLAYER').mean().reset_index()
    # result_df = result_df.sort_values(by='Predicted_FPTS', ascending=False)

    # pattern = r'\((.*?)\)'
    # result_df['TEAM'] = result_df['PLAYER'].apply(lambda x: re.search(pattern, x).group(1) if re.search(pattern, x) else pd.NA)

    # # Keep only the first unique occurrence of any value in the 'Team' column
    # result_df = result_df.drop_duplicates(subset='TEAM')

    # # Remove any player with the team equal to 'FA'
    # result_df = result_df.query("TEAM != 'FA'")

    # # Save the results to a CSV file
    # file_name = f"predictions/LR{pos}.csv"
    # result_df.to_csv(file_name, index=False)


[10.88480365 15.32510854 11.41096141 20.38903563 13.6830667 ]
2476    10.7
889     17.2
1177    11.3
485     20.3
647     13.5
Name: MISC FPTS, dtype: float64
Mean Squared Error qb: 0.20118776911889338
Mean Squared Error with PCA qb: 15.801407819839113
[8.91765975e-14 3.10000000e+00 8.69964325e-14 8.51003797e-14
 6.60000000e+00]
11891    0.0
16093    3.1
15761    0.0
15374    0.0
14514    6.6
Name: MISC FPTS, dtype: float64
Mean Squared Error rb: 1.5693954121561797e-26
Mean Squared Error with PCA rb: 6.700588099498068
[ 1.10000000e+00  1.83000000e+01 -4.90641724e-14 -4.98403091e-14
  2.70000000e+00]
8932      1.1
6977     18.3
6963      0.0
11062     0.0
5173      2.7
Name: MISC FPTS, dtype: float64
Mean Squared Error wr: 5.053308908087753e-27
Mean Squared Error with PCA wr: 5.145663596872624
[2.00000000e+00 2.31463216e-14 2.00000000e+00 2.75400365e-14
 2.00000000e+00]
17677    2.0
18692    0.0
19298    2.0
19556    0.0
16905    2.0
Name: MISC FPTS, dtype: float64
Mean Squared Error te