In [5]:
# Get user input for the week number to predict
# num_week = int(input("Enter the week to predict: "))

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

positions = ["qb", "rb", "wr", "te"]
# positions = ["qb"]


for pos in positions:
    # Load your dataset
    data = pd.read_csv("datasets/weekly_scoring.csv")

    # Preprocessing
    data = data[data['POS'] == pos]
    weights = data['WEIGHT']
    
    if(pos == 'qb'):
        # Define the list of variables to predict
        var_list = ['PASSING CMP', 'PASSING ATT', 'PASSING PCT', 'PASSING YDS', 'PASSING Y/A', 'PASSING TD', 'PASSING INT',
        'PASSING SACKS', 'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'WEEK', 'AVG_FPTS', 'MAX_FPTS', 'MIN_FPTS', 'VAR_FPTS']
    if(pos == 'rb'):
        var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
        'RECEIVING TD', 'RUSHING Y/A', 'RUSHING LG',
        'RUSHING 20+', 'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK', 'AVG_FPTS', 'MAX_FPTS', 'MIN_FPTS', 'VAR_FPTS']
    if(pos == 'wr'):
        var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
        'RECEIVING TD', 'RECEIVING LG', 'RECEIVING 20+',
        'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK', 'AVG_FPTS', 'MAX_FPTS', 'MIN_FPTS', 'VAR_FPTS']
    if(pos == 'te'):
        var_list = ['RECEIVING REC', 'RECEIVING TGT', 'RECEIVING YDS', 'RECEIVING Y/R',
        'RECEIVING TD', 'RECEIVING LG', 'RECEIVING 20+',
        'RUSHING ATT', 'RUSHING YDS', 'RUSHING TD', 'MISC FL', 'MISC FPTS', 'WEEK', 'AVG_FPTS', 'MAX_FPTS', 'MIN_FPTS', 'VAR_FPTS']

# Without Player
    # # Extracting features and target variable
    # X = data[var_list]  # Ensure 'MISC FPTS' is not in var_list
    # y = data['MISC FPTS']

    # # Splitting the dataset into training and testing sets
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # # Creating and training the linear regression model
    # model = LinearRegression()
    # model.fit(X_train, y_train)

    # # Predicting on the test set
    # y_pred = model.predict(X_test)

    # # Evaluating the model (optional)
    # mse = mean_squared_error(y_test, y_pred)
    # print(f'Mean Squared Error: {mse}')

    # # Printing the coefficients
    # coefficients = pd.DataFrame({'Variable': X.columns, 'Coefficient': model.coef_})
    # print(coefficients)

# With Player
    # Extracting features and target variable (split X and Y)
    X = data[var_list]  # Ensure 'MISC FPTS' is not in var_list
    X['PLAYER'] = data['PLAYER']  # Include 'Player' as a feature

    y = data['MISC FPTS']

    # Handling missing values in numeric columns using SimpleImputer
    numeric_cols = X.select_dtypes(include='number').columns
    imputer = SimpleImputer(strategy='mean')  # You can change the strategy as needed
    X[numeric_cols] = imputer.fit_transform(X[numeric_cols])
    
    # Target encoding Player --> replaces the Player value with the mean FPTS for each player
    encoder = ce.TargetEncoder(cols=['PLAYER'])
    X = encoder.fit_transform(X, y)

    # One-hot encoding Player
    # X = pd.get_dummies(X, columns=['PLAYER'], drop_first=True)

    # Splitting the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Hyperparameter Tuning
    # Define the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    
    # Creating and training the Random Forest model with GridSearchCV
    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Getting the best parameters from the grid search
    best_params = grid_search.best_params_
    print(f'Best Hyperparameters: {best_params}')

    # Creating a new Random Forest model with the best parameters
    best_model = RandomForestRegressor(**best_params, random_state=42)
    best_model.fit(X_train, y_train)

    # Predicting on the test set
    y_pred = best_model.predict(X_test)

# Evaluating the model
    # MSE
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error {pos}: {mse}')

    # # R Squared
    from sklearn.metrics import r2_score

    r2 = r2_score(y_test, y_pred)
    print(f'R-squared Score: {r2}')

    # # Feature Importance
    # feature_importances = model.feature_importances_
    # feature_names = X_train.columns

    # # Print feature importances
    # for feature, importance in zip(feature_names, feature_importances):
    #     print(f'{feature}: {importance}')

    # # Residual Analysis
    # import matplotlib.pyplot as plt

    # plt.scatter(y_test, y_pred)
    # plt.xlabel('Actual Values')
    # plt.ylabel('Predicted Values')
    # plt.title('Actual vs. Predicted Values')
    # plt.show()

    # # Cross Validation
    # from sklearn.model_selection import cross_val_score

    # # Perform cross-validation
    # cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    # print(f'Cross-Validation Mean Squared Error: {abs(cv_scores.mean())}')

    ## Create result dataset
    import re

    data2 = pd.read_csv("datasets/weekly_scoring.csv")

    # Predicting on the entire dataset
    data['Predicted_FPTS'] = best_model.predict(X)

    # Organizing the results into a new DataFrame
    result_df = data[['PLAYER', 'Predicted_FPTS']].copy()

    # Grouping by 'PLAYER' and calculating the average predicted FPTS
    result_df = result_df.groupby('PLAYER').mean().reset_index()
    result_df = result_df.sort_values(by='Predicted_FPTS', ascending=False)

    pattern = r'\((.*?)\)'
    result_df['TEAM'] = result_df['PLAYER'].apply(lambda x: re.search(pattern, x).group(1) if re.search(pattern, x) else pd.NA)

    # Keep only the first unique occurrence of any value in the 'Team' column
    result_df = result_df.drop_duplicates(subset='TEAM')

    # Remove any player with the team equal to 'FA'
    result_df = result_df.query("TEAM != 'FA'")

    # Save the results to a CSV file
    file_name = f"predictions/RF{pos}.csv"
    result_df.to_csv(file_name, index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['PLAYER'] = data['PLAYER']  # Include 'Player' as a feature
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = imputer.fit_transform(X[numeric_cols])


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Mean Squared Error qb: 1.7044157933419806
R-squared Score: 0.981548536100729


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['PLAYER'] = data['PLAYER']  # Include 'Player' as a feature
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = imputer.fit_transform(X[numeric_cols])


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error rb: 0.003954045256745086
R-squared Score: 0.9999073389571544


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['PLAYER'] = data['PLAYER']  # Include 'Player' as a feature
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = imputer.fit_transform(X[numeric_cols])


Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Mean Squared Error wr: 0.002641629643258929
R-squared Score: 0.9999045823950067


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['PLAYER'] = data['PLAYER']  # Include 'Player' as a feature
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = imputer.fit_transform(X[numeric_cols])


Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150}
Mean Squared Error te: 0.0018100202249517103
R-squared Score: 0.9998657955133161
