Designing the Algorithm

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [25]:
# Absolute path of the oy_flex.csv dataset that the rest of the notebook reads
flex_data_path = '/Users/itayakad/Desktop/nfl_data/oy_flex.csv'
flex_data = pd.read_csv(flex_data_path)

# Correlation is only defined for numeric data, so keep just the numeric dtypes
numeric_columns = flex_data.select_dtypes(include='number')

# Build the full pairwise correlation matrix, then take its 'ppr_ppg' column
# and list every stat's correlation with ppr_ppg from highest to lowest
correlation = numeric_columns.corr()
ppr_ppg_correlation = correlation.loc[:, 'ppr_ppg'].sort_values(ascending=False)
print(ppr_ppg_correlation)

ppr_ppg                        1.000000
ypg                            0.952677
fantasy_points_ppr             0.908767
total_yards                    0.889715
total_tds                      0.834190
receiving_yards_after_catch    0.818325
receptions                     0.805404
rec_ypg                        0.784015
targets                        0.772262
target_share                   0.770296
receiving_yards                0.762716
receiving_first_downs          0.747934
offense_snaps                  0.733822
offense_pct                    0.717520
touches                        0.696904
receiving_tds                  0.677990
teams_offense_snaps            0.598212
games                          0.595047
receiving_air_yards            0.563552
air_yards_share                0.479453
rushing_tds                    0.463643
rushing_first_downs            0.463389
rushing_yards                  0.460348
carries                        0.443148
rush_ypg                       0.439123


In [26]:
# Predictors: five of the columns that correlate most strongly with ppr_ppg
top_features = ['ypg', 'total_yards', 'total_tds', 'receiving_yards_after_catch', 'receptions']
X = numeric_columns.loc[:, top_features]
y = numeric_columns.loc[:, 'ppr_ppg']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear regression on the training rows, then score the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the held-out error and goodness of fit
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")



Mean Squared Error: 0.8582478914043316
R-squared: 0.9666406095226697


Testing the Algorithm

In [27]:
def predict_next_year(train_year, top_n=20):
    """Print the players of one season ranked by a fitted ``ppr_ppg`` model.

    Fits a linear regression of ``ppr_ppg`` on five stat columns using only
    the rows of the global ``flex_data`` whose ``season`` equals
    ``train_year``, scores those same rows with the fitted model, and prints
    the ``top_n`` highest-scoring players under a
    "Projected <train_year + 1> leaders" heading.

    NOTE(review): the model is fitted and evaluated on the same
    ``train_year`` rows, so the printed values are in-sample fitted values for
    that season rather than out-of-sample forecasts of the following season.
    Confirm this is the intended meaning of "projected".

    Args:
        train_year: Season to select from ``flex_data['season']``.
        top_n: Maximum number of players to list (default 20). Fewer rows are
            listed when the season contains fewer players.

    Returns:
        None. The ranking table is printed to stdout.

    Raises:
        ValueError: If ``top_n`` is less than 1, or if ``flex_data`` has no
            rows for ``train_year``.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # Filter data for the specified training season
    data_train = flex_data[flex_data['season'] == train_year]
    if data_train.empty:
        # Fail early with a clear message instead of an opaque error from fit()
        raise ValueError(f"No rows found in flex_data for season {train_year}")

    # Select only numeric columns for the training data
    numeric_columns_train = data_train.select_dtypes(include=['number'])

    # Select the top correlated features for training data
    top_features = ['ypg', 'total_yards', 'total_tds', 'receiving_yards_after_catch', 'receptions']
    X_train = numeric_columns_train[top_features]
    y_train = numeric_columns_train['ppr_ppg']

    # Train the model using the training season data
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the same players the model was fitted on (copy so the slice of
    # flex_data is never written to)
    data_predict = data_train.copy()
    data_predict['predicted_ppr_ppg'] = model.predict(X_train)

    # Keep at most top_n players, best predicted value first
    top_predicted = (
        data_predict[['name', 'predicted_ppr_ppg']]
        .sort_values(by='predicted_ppr_ppg', ascending=False)
        .head(top_n)
        .reset_index(drop=True)
    )

    # Size the rank column from the rows actually kept, so a season with fewer
    # than top_n players no longer triggers a length-mismatch error
    combined_results = pd.DataFrame({
        'Rank': range(1, len(top_predicted) + 1),
        'Name (proj)': top_predicted['name'],
        'Proj PPR PPG': top_predicted['predicted_ppr_ppg']
    })

    # Display the results
    print(f"Projected {train_year + 1} leaders based on {train_year}'s stats:")
    print(combined_results)

# Rank the projected 2015 leaders using the 2014 season's stats
predict_next_year(2014)


Projected 2015 leaders based on 2014's stats:
    Rank       Name (proj)  Proj PPR PPG
0      1     Antonio Brown     24.376727
1      2      Le'Veon Bell     22.080346
2      3    DeMarco Murray     21.304719
3      4        Matt Forte     21.180766
4      5  Demaryius Thomas     21.167179
5      6      Jordy Nelson     20.742358
6      7        Dez Bryant     20.611707
7      8    Marshawn Lynch     19.246066
8      9       Julio Jones     19.193651
9     10      Randall Cobb     18.941958
10    11  Emmanuel Sanders     18.502543
11    12    Rob Gronkowski     18.191705
12    13     Jeremy Maclin     17.437500
13    14    Jamaal Charles     17.303793
14    15        Eddie Lacy     17.106559
15    16    Calvin Johnson     17.030563
16    17        A.J. Green     16.921561
17    18       T.Y. Hilton     16.878328
18    19   Adrian Peterson     16.775572
19    20    Alshon Jeffery     16.730377
