### First need to collect data from Kaggle

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("philiphyde1/nfl-stats-1999-2022")

print("Path to dataset files:", path)

### Now need to remove unneeded columns

In [None]:
import pandas as pd 
path = path.replace("\\", "/")  # For Windows compatibility
df = pd.read_csv(f"{path}\\yearly_player_stats_offense.csv")

print(df.head())
column_names_list = df.columns.tolist()
print("\nColumn names (Python list):")
print(column_names_list)



### isolate positions

In [None]:
#need to have players that are only qb,wr,rb 
df_QB = df[df['position'].isin(['QB'])]
df_WR = df[df['position'].isin(['WR'])]
df_RB = df[df['position'].isin(['RB'])] 
print(len(df_QB))
print(len(df_WR))
print(len(df_RB))   

print(df_QB.head())

In [None]:
df_QB['fantasy_points_standard'].isna().sum()
df_RB['fantasy_points_ppr'].isna().sum()
df_WR['fantasy_points_ppr'].isna().sum()
target_columns = ['fantasy_points_standard', 'fantasy_points_ppr', 'fantasy_points_half_ppr']


In [None]:
#remove if games_missed > 7

df_QB = df_QB[df_QB['games_missed'] <= 7]
df_RB = df_RB[df_RB['games_missed'] <= 7]
df_WR = df_WR[df_WR['games_missed'] <= 7]
df_QB = df_QB[df_QB['games_missed'].notna()]
df_RB = df_RB[df_RB['games_missed'].notna()]
df_WR = df_WR[df_WR['games_missed'].notna()]




In [None]:
import pandas as pd

# Example: start with your existing df
# df = pd.read_csv("player_stats.csv")

# If not already present:
df["games_played"] = 17 - df["games_missed"]

# Avoid divide-by-zero
df["games_played"] = df["games_played"].replace(0, pd.NA)

# Features to normalize (totals)
to_normalize = [
    "offense_snaps", "touches", "targets", "receptions",
    "rushing_yards", "receiving_yards", "yards_after_catch", "total_yards",
    "rush_touchdown", "receiving_touchdown", "total_tds",
    "rush_attempts", "rush_attempts_redzone", "targets_redzone",
    "touches", "total_yards", "rush_touchdown",
    "receiving_yards", "offense_snaps"
]

# Create new per-game versions
for col in to_normalize:
    df[f"{col}_pg"] = df[col] / df["games_played"]

# Keep static / rate-based features as-is
static_features = [
    "position", "age", "years_exp", "height", "weight",
    "team", "depth_team",
    "offense_pct", "team_offense_snaps",
    "yptarget", "yptouch", "ypc", "ypr",
    "rec_td_pct", "rush_td_pct", "td_pct",
    "team_targets_share", "team_rush_attempts_share", "team_receiving_yards_share",
    "games_missed",'fantasy_points_standard','fantasy_points_ppr',
]

# Combine everything into a new averaged feature DataFrame
avg_features = df[[*static_features, *[f"{c}_pg" for c in to_normalize]]].copy()

# Optionally inspect
print(avg_features.head())


In [None]:
#now time to use these features to predict fantasy points for qbs, wrs, rbs
target_columns = ['fantasy_points_standard', 'fantasy_points_ppr',]   

df_QB_targets = df_QB[target_columns]
df_RB_targets = df_RB[target_columns]       
df_WR_targets = df_WR[target_columns]

#need to target each position separately 

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Example split by position
df_QB = avg_features[avg_features['position'] == 'QB'].copy()
df_WR = avg_features[avg_features['position'] == 'WR'].copy()
df_RB = avg_features[avg_features['position'] == 'RB'].copy()

# Define your target
target = 'fantasy_points_standard'  # Replace this with your actual target column

# Define features — exclude target and identifiers
features = [col for col in avg_features.columns if col not in ['position', 'team', 'depth_team', target,'fantasy_points_ppr', 'fantasy_points_half_ppr']]

# Function to train model for a given position
def train_position_model(df_position, position_name):
    # Drop rows missing the target
    df_position = df_position.dropna(subset=[target])
    
    X = df_position[features]
    y = df_position[target]
    
    # Handle categorical variables
    X = pd.get_dummies(X, drop_first=True)
    
    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Normalize
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Model (Random Forest for example)
    model = RandomForestRegressor(n_estimators=200, random_state=42)
    model.fit(X_train_scaled, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{position_name} model performance:")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.2f}\n")
    
    return model, scaler

# Train all three
QB_model, QB_scaler = train_position_model(df_QB, "Quarterback")
WR_model, WR_scaler = train_position_model(df_WR, "Wide Receiver")
RB_model, RB_scaler = train_position_model(df_RB, "Running Back")


Wide Receiver model performance:
MAE: 4.66
R²: 0.98

Running Back model performance:
MAE: 5.48
R²: 0.97



In [None]:
def graph_feature_importance(features, positionmodel,position_scaler, positionDF,position):
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns
    importances = positionmodel.feature_importances_
    feature_names = position_scaler.get_feature_names_out(input_features=positionDF[features].columns)
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
    plt.title(f"Top 7 Feature Importances for {position} Model")
    plt.tight_layout()
    plt.show()
    

In [None]:
graph_feature_importance(features,RB_model,RB_scaler,df_RB,'RB')
graph_feature_importance(features,WR_model,WR_scaler,df_WR,'WR')
graph_feature_importance(features,QB_model,QB_scaler,df_QB,'QB')

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

def train_position_model_varied(df_position, position_name, model_type="lasso", alpha=1.0):
    # Drop rows missing the target
    df_position = df_position.dropna(subset=[target])
    
    X = df_position[features]
    y = df_position[target]
    
    # Handle categorical variables
    X = pd.get_dummies(X, drop_first=True)
    X = X.fillna(X.mean())
    
    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Normalize
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Select model type
    if model_type == "linear":
        model = LinearRegression()
    elif model_type == "lasso":
        model = Lasso(alpha=alpha, random_state=42, max_iter=10000)
    elif model_type == "ridge":
        model = Ridge(alpha=alpha, random_state=42)
    elif model_type == "rf":
        model = RandomForestRegressor(n_estimators=200, random_state=42)
    else:
        raise ValueError("Invalid model_type. Choose 'linear', 'lasso', 'ridge', or 'rf'.")
    
    # Train
    model.fit(X_train_scaled, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{position_name} - {model_type.upper()} Model Performance:")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.2f}\n")
    
    return model, scaler


In [49]:
train_position_model_varied(df_QB, "Quarterback", model_type="rf", alpha=1)
train_position_model_varied(df_RB, "Running Back", model_type="rf", alpha=1.0)
train_position_model_varied(df_WR, "Wide Receiver", model_type="rf",alpha=1.0)

Quarterback - RF Model Performance:
MAE: 9.06
R²: 0.99

Running Back - RF Model Performance:
MAE: 5.54
R²: 0.98

Wide Receiver - RF Model Performance:
MAE: 4.51
R²: 0.98



(RandomForestRegressor(n_estimators=200, random_state=42), StandardScaler())

In [39]:
print(df_QB.head())

   position   age  years_exp  height  weight team  depth_team  offense_pct  \
0        QB  37.0       14.0    74.0   216.0  PIT         3.0     1.000000   
4        QB  36.0       14.0    76.0   235.0  TEN         2.0     0.774510   
5        QB  36.0       14.0    77.0   230.0  DEN         1.0     1.000000   
6        QB  36.0       14.0    77.0   230.0  DEN         1.0     0.971129   
11       QB  35.0       12.0    76.0   225.0   NE         1.0     1.000000   

    team_offense_snaps  yptarget  ...  receiving_touchdown_pg  total_tds_pg  \
0                129.0       NaN  ...                     0.0      0.333333   
4                510.0       NaN  ...                     0.0      0.777778   
5                 94.0       NaN  ...                     0.0      0.200000   
6               1143.0       NaN  ...                     0.0      2.176471   
11               150.0       NaN  ...                     0.0      0.250000   

    rush_attempts_pg  rush_attempts_redzone_pg  targets_

In [40]:
# i want to get the years the player was on a team 


