Our preprocessing approach is designed to transform raw weekly player statistics into meaningful, position-specific performance metrics. The main goal is to identify players with consistent and high production for predictive modeling.

1. Aggregation of Weekly Data

We start by parsing the weekly offense dataset.

For each player, we compute average performance metrics per week within their respective position (RB, WR, TE, QB).

This allows us to rank players by typical production levels for each position.

In [2]:
import pandas as pd
import numpy as np

In [5]:
weekly_offense = pd.read_csv('Data/weekly_player_stats_offense.csv')
## check how many record are in the weekly player stats 
print(f"Total records in weekly_offense: {len(weekly_offense):,}")
## check how many columns are there print out there names 
print(weekly_offense.columns.tolist())

Total records in weekly_offense: 58,629
['season', 'week', 'offense_snaps', 'offense_pct', 'team_offense_snaps', 'player_id', 'birth_year', 'draft_year', 'draft_round', 'draft_pick', 'draft_ovr', 'height', 'weight', 'college', 'season_type', 'player_name', 'position', 'depth_team', 'conference', 'division', 'team', 'shotgun', 'no_huddle', 'qb_dropback', 'qb_scramble', 'pass_attempts', 'complete_pass', 'incomplete_pass', 'passing_yards', 'receiving_yards', 'yards_after_catch', 'rush_attempts', 'rushing_yards', 'tackled_for_loss', 'first_down_pass', 'first_down_rush', 'third_down_converted', 'third_down_failed', 'fourth_down_converted', 'fourth_down_failed', 'rush_touchdown', 'pass_touchdown', 'safety', 'interception', 'fumble', 'fumble_lost', 'fumble_forced', 'fumble_not_forced', 'fumble_out_of_bounds', 'receptions', 'targets', 'passing_air_yards', 'receiving_air_yards', 'receiving_touchdown', 'pass_attempts_redzone', 'complete_pass_redzone', 'pass_touchdown_redzone', 'pass_attempts_gtg

In [10]:
## check all the positions that are available 
print(weekly_offense['position'].value_counts())
## make a dict with actual column names 
performance_metrics = {
    'QB': ['passing_yards', 'pass_touchdown', 'interception', 'rushing_yards', 'rush_touchdown', 'fantasy_points_ppr'],
    'RB': ['rushing_yards', 'rush_touchdown', 'receiving_yards', 'receiving_touchdown', 'receptions', 'fumble', 'fantasy_points_ppr'],
    'WR': ['receiving_yards', 'receiving_touchdown', 'receptions', 'targets', 'fumble', 'fantasy_points_ppr'],
    'TE': ['receiving_yards', 'receiving_touchdown', 'receptions', 'targets', 'fumble', 'fantasy_points_ppr']
}

position
WR     23085
RB     15009
TE     11862
QB      7670
FB       730
P         84
CB        54
SS        38
FS        33
ILB       18
DE        11
DT        11
OLB        7
K          6
NT         5
MLB        3
LS         2
T          1
Name: count, dtype: int64


In [13]:
# Filter for fantasy football positions only
fantasy_positions = ['QB', 'RB', 'WR', 'TE']
weekly_offense_filtered = weekly_offense[weekly_offense['position'].isin(fantasy_positions)].copy()
## check how many record are in the new weekly player stats 
print(f"Total records in weekly_offense: {len(weekly_offense):,}")
## check our offense position distribution
print(weekly_offense_filtered['position'].value_counts())


Total records in weekly_offense: 58,629
position
WR    23085
RB    15009
TE    11862
QB     7670
Name: count, dtype: int64


In [17]:
# Calculate weekly averages for each player by position
position_aggregations = {}

for position in fantasy_positions:
    #Get only players from this position
    position_data = weekly_offense_filtered[weekly_offense_filtered['position'] == position].copy()
    print(f"Found {len(position_data):,} weekly records for {position} players")

    # get the stats we care about for this position
    stats_we_want = performance_metrics.get(position, [])

    # Set up what we want to calculate
    calculations = {}

    # Add each stat we want to average
    for stat in stats_we_want:
        calculations[stat] = 'mean'
    # Also count how many games each player played
    calculations['week'] = 'count'

    # Group players and calculate averages
    player_groups = ['player_id', 'player_name']
    player_averages = position_data.groupby(player_groups).agg(calculations)
    player_averages = player_averages.round(2)
    
    #Clean up the column names
    player_averages = player_averages.rename(columns={'week': 'games_played'})
    player_averages['position'] = position

    # Sort by fantasy points (best first)
    player_averages = player_averages.sort_values('fantasy_points_ppr', ascending=False)

    #Save results
    position_aggregations[position] = player_averages

    # Show top 5 players for this position
    print(f"Top 5 {position} players:")
    top_5 = player_averages[['fantasy_points_ppr', 'games_played']].head(5)
    
    for i, (player_info, stats) in enumerate(top_5.iterrows(), 1):
        player_name = player_info[1]
        fantasy_avg = stats['fantasy_points_ppr']
        games = stats['games_played']
        print(f"   {i}. {player_name}: {fantasy_avg} pts/game ({games} games)")


    


Found 7,670 weekly records for QB players
Top 5 QB players:
   1. Patrick Mahomes: 26.0 pts/game (131.0 games)
   2. Josh Allen: 25.43 pts/game (117.0 games)
   3. Drew Brees: 23.9 pts/game (137.0 games)
   4. Jayden Daniels: 23.89 pts/game (19.0 games)
   5. Lamar Jackson: 23.36 pts/game (93.0 games)
Found 15,009 weekly records for RB players
Top 5 RB players:
   1. Christian McCaffrey: 22.0 pts/game (93.0 games)
   2. Alvin Kamara: 19.15 pts/game (118.0 games)
   3. Jahmyr Gibbs: 18.73 pts/game (35.0 games)
   4. Saquon Barkley: 18.37 pts/game (94.0 games)
   5. Le'Veon Bell: 17.88 pts/game (90.0 games)
Found 23,085 weekly records for WR players
Top 5 WR players:
   1. Antonio Brown: 19.4 pts/game (122.0 games)
   2. Justin Jefferson: 19.01 pts/game (74.0 games)
   3. Ja'Marr Chase: 18.81 pts/game (66.0 games)
   4. Calvin Johnson: 18.37 pts/game (60.0 games)
   5. Malik Nabers: 17.84 pts/game (15.0 games)
Found 11,862 weekly records for TE players
Top 5 TE players:
   1. Travis Kelc

In [18]:
# Summary for each position
for position, data in position_aggregations.items():
    num_players = len(data)
    avg_fantasy = data['fantasy_points_ppr'].mean()
    
    # Get best player info
    best_player_info = data.index[0]
    best_player_name = best_player_info[1]
    best_player_score = data['fantasy_points_ppr'].iloc[0]
    
    print(f"\n{position} Summary:")
    print(f"   Players: {num_players}")
    print(f"   Average fantasy pts: {avg_fantasy:.1f}/game")
    print(f"   Best player: {best_player_name} ({best_player_score:.1f} pts/game)")

# Save all results to file
if position_aggregations:
    print(f"\nSaving results...")
    
    # Combine all positions
    all_positions = []
    for position_data in position_aggregations.values():
        all_positions.append(position_data)
    
    all_players = pd.concat(all_positions, ignore_index=False)
    all_players = all_players.reset_index()
    
    # Save to CSV
    output_file = 'Data/player_weekly_averages.csv'
    all_players.to_csv(output_file, index=False)
    print(f"Saved {len(all_players)} players to: {output_file}")
    
    # Show top 10 overall
    print(f"\nTOP 10 FANTASY PLAYERS OVERALL:")
    top_10 = all_players.nlargest(10, 'fantasy_points_ppr')
    
    for i, player in top_10.iterrows():
        name = player['player_name']
        pos = player['position']
        pts = player['fantasy_points_ppr']
        games = player['games_played']
        print(f"   {i+1:2d}. {name} ({pos}): {pts:.1f} pts/game ({games} games)")


QB Summary:
   Players: 182
   Average fantasy pts: 11.0/game
   Best player: Patrick Mahomes (26.0 pts/game)

RB Summary:
   Players: 397
   Average fantasy pts: 6.8/game
   Best player: Christian McCaffrey (22.0 pts/game)

WR Summary:
   Players: 573
   Average fantasy pts: 6.3/game
   Best player: Antonio Brown (19.4 pts/game)

TE Summary:
   Players: 285
   Average fantasy pts: 4.9/game
   Best player: Travis Kelce (15.6 pts/game)

Saving results...
Saved 1437 players to: Data/player_weekly_averages.csv

TOP 10 FANTASY PLAYERS OVERALL:
    1. Patrick Mahomes (QB): 26.0 pts/game (131 games)
    2. Josh Allen (QB): 25.4 pts/game (117 games)
    3. Drew Brees (QB): 23.9 pts/game (137 games)
    4. Jayden Daniels (QB): 23.9 pts/game (19 games)
    5. Lamar Jackson (QB): 23.4 pts/game (93 games)
    6. Aaron Rodgers (QB): 23.2 pts/game (187 games)
    7. Joe Burrow (QB): 23.1 pts/game (74 games)
    8. Deshaun Watson (QB): 23.1 pts/game (73 games)
    9. Tom Brady (QB): 22.8 pts/game (

2. Reliability Index

Using data from the previous season, we calculate the standard deviation of each player’s weekly scores.

Players with low standard deviations and high average performance are considered reliable performers.

This creates a reliability index for each player, which helps quantify consistency in scoring.

In [21]:
# Dictionary to store reliability metrics for each position
position_reliability = {}

for position in fantasy_positions:
    # Get only players from this position
    position_data = weekly_offense_filtered[weekly_offense_filtered['position'] == position].copy()
    
    # Group by player and calculate standard deviation of fantasy points
    player_groups = ['player_id', 'player_name']
    
    # Calculate both mean and std for fantasy points
    reliability_stats = position_data.groupby(player_groups).agg({
        'fantasy_points_ppr': ['mean', 'std', 'count']}).round(2)
    
    # Flatten column names
    reliability_stats.columns = ['avg_fantasy_pts', 'std_fantasy_pts', 'games_played']
    
    # Filter players with at least 4 games for meaningful analysis
    min_games = 4
    reliable_players = reliability_stats[reliability_stats['games_played'] >= min_games].copy()
    
    position_reliability[position] = reliable_players
    print(f"Found {len(reliable_players)} {position} players with {min_games}+ games")
## Example Output 
# Tom Brady:    avg_fantasy_pts=24.5, std_fantasy_pts=3.2, games_played=16
# Patrick Mahomes: avg_fantasy_pts=26.1, std_fantasy_pts=8.7, games_played=17

## How we can interpret it 
# Tom Brady: Averages 24.5 points with low variance (3.2) = consistent performer
# Patrick Mahomes: Averages 26.1 points with high variance (8.7) = boom-or-bust



Found 156 QB players with 4+ games
Found 362 RB players with 4+ games
Found 511 WR players with 4+ games
Found 258 TE players with 4+ games


In [24]:
# Calculate reliability index and show top performers
for position in position_reliability.keys():
    reliable_players = position_reliability[position]
    
    if len(reliable_players) == 0:
        continue
    
    # Calculate Reliability Index
    # Formula: Average Points / (Standard Deviation + 1)
    # Higher index = better combination of performance and consistency
    reliable_players['reliability_index'] = (
        reliable_players['avg_fantasy_pts'] / (reliable_players['std_fantasy_pts'] + 1)
    ).round(2)
    
    # Add position info
    reliable_players['position'] = position
    
    # Sort by reliability index (highest = most reliable)
    reliable_players = reliable_players.sort_values('reliability_index', ascending=False)
    
    # Update stored data
    position_reliability[position] = reliable_players
    
    # Show top 5 most reliable players for this position
    print(f"\nTop 5 most reliable {position} players:")
    top_reliable = reliable_players.head(5)
    
    for i, (player_info, stats) in enumerate(top_reliable.iterrows(), 1):
        player_name = player_info[1]
        avg_pts = stats['avg_fantasy_pts']
        std_pts = stats['std_fantasy_pts']
        reliability = stats['reliability_index']
        games = stats['games_played']
        
        print(f"   {i}. {player_name}")
        print(f"      Avg: {avg_pts} pts/game | Std: {std_pts} | Reliability: {reliability}")
        print(f"      Games played: {games}")


Top 5 most reliable QB players:
   1. Patrick Mahomes
      Avg: 26.0 pts/game | Std: 9.85 | Reliability: 2.4
      Games played: 131
   2. Andrew Luck
      Avg: 22.73 pts/game | Std: 9.18 | Reliability: 2.23
      Games played: 91
   3. Josh Allen
      Avg: 25.43 pts/game | Std: 10.81 | Reliability: 2.15
      Games played: 117
   4. Kyler Murray
      Avg: 22.12 pts/game | Std: 9.34 | Reliability: 2.14
      Games played: 81
   5. Tom Brady
      Avg: 22.84 pts/game | Std: 9.88 | Reliability: 2.1
      Games played: 195

Top 5 most reliable RB players:
   1. Christian McCaffrey
      Avg: 22.0 pts/game | Std: 9.88 | Reliability: 2.02
      Games played: 93
   2. Bijan Robinson
      Avg: 16.97 pts/game | Std: 7.78 | Reliability: 1.93
      Games played: 34
   3. DeMarco Murray
      Avg: 16.34 pts/game | Std: 7.55 | Reliability: 1.91
      Games played: 87
   4. Chris Carson
      Avg: 14.24 pts/game | Std: 6.76 | Reliability: 1.84
      Games played: 48
   5. Jahmyr Gibbs
      A

In [26]:
# Combine all positions
all_reliable_players = pd.concat(position_reliability.values(), ignore_index=False)
all_reliable_players = all_reliable_players.reset_index()
# Save to CSV
reliability_file = 'Data/player_reliability_index.csv'
all_reliable_players.to_csv(reliability_file, index=False)
# Show top 10 most reliable players overall
print(f"\nTOP 10 MOST RELIABLE PLAYERS OVERALL:")
top_overall = all_reliable_players.nlargest(10, 'reliability_index')

for i, player in top_overall.iterrows():
    name = player['player_name']
    pos = player['position']
    avg_pts = player['avg_fantasy_pts']
    reliability = player['reliability_index']
    
    print(f"   {i+1:2d}. {name} ({pos})")
    print(f"       Reliability Index: {reliability} | Avg: {avg_pts} pts/game")


TOP 10 MOST RELIABLE PLAYERS OVERALL:
    1. Patrick Mahomes (QB)
       Reliability Index: 2.4 | Avg: 26.0 pts/game
    2. Andrew Luck (QB)
       Reliability Index: 2.23 | Avg: 22.73 pts/game
    3. Josh Allen (QB)
       Reliability Index: 2.15 | Avg: 25.43 pts/game
    4. Kyler Murray (QB)
       Reliability Index: 2.14 | Avg: 22.12 pts/game
    5. Tom Brady (QB)
       Reliability Index: 2.1 | Avg: 22.84 pts/game
    6. Carson Palmer (QB)
       Reliability Index: 2.1 | Avg: 18.99 pts/game
    7. Philip Rivers (QB)
       Reliability Index: 2.1 | Avg: 19.6 pts/game
    8. Justin Herbert (QB)
       Reliability Index: 2.09 | Avg: 22.21 pts/game
    9. Drew Brees (QB)
       Reliability Index: 2.08 | Avg: 23.9 pts/game
   10. Bo Nix (QB)
       Reliability Index: 2.06 | Avg: 22.07 pts/game


3. Position-Specific Normalization

Each position has unique performance metrics (e.g., rushing yards for RBs, receiving yards for WRs/TEs, passing yards for QBs).

Metrics are normalized per position to account for natural differences in production scales and ensure fair comparison across players.

Z-Score tells you: </br>
How many standard deviations above/below average </br>
+2.0 z-score = Elite (top ~2% statistically) </br>
0.0 z-score = Exactly average </br>
-1.0 z-score = Below average </br>

Percentile tells you: </br>
What percentage of players you're better than </br>
90th percentile = Better than 90% of position players </br>
50th percentile = Better than 50% (median) </br>

In [29]:
normalized_players = {}

for position in fantasy_positions:
    if position not in position_aggregations:
        continue
        
    # Get position data
    pos_data = position_aggregations[position].copy()
    
    # Calculate position benchmarks
    mean_fantasy = pos_data['fantasy_points_ppr'].mean()
    std_fantasy = pos_data['fantasy_points_ppr'].std()

    print(f"\n{position} Benchmarks:")
    print(f"   Mean: {mean_fantasy:.2f}")
    print(f"   Std Dev: {std_fantasy:.2f}")
    
    # Calculate z-scores (standard deviations above/below average)
    pos_data['fantasy_zscore'] = (
        (pos_data['fantasy_points_ppr'] - mean_fantasy) / std_fantasy
    ).round(2)
    
    # Calculate percentile ranks (0-100 scale within position)
    pos_data['fantasy_percentile'] = (
        pos_data['fantasy_points_ppr'].rank(pct=True) * 100
    ).round(1)
    
    # Store normalized data
    normalized_players[position] = pos_data
    
    print(f"   Top 3 {position} (normalized):")
    top_3 = pos_data.head(3)
    for i, (player_info, stats) in enumerate(top_3.iterrows(), 1):
        name = player_info[1]
        raw_pts = stats['fantasy_points_ppr']
        zscore = stats['fantasy_zscore']
        percentile = stats['fantasy_percentile']
        
        print(f"      {i}. {name}")
        print(f"         Raw: {raw_pts} pts | Z-score: {zscore} | {percentile}th percentile")

## 


QB Benchmarks:
   Mean: 11.01
   Std Dev: 6.94
   Top 3 QB (normalized):
      1. Patrick Mahomes
         Raw: 26.0 pts | Z-score: 2.16 | 100.0th percentile
      2. Josh Allen
         Raw: 25.43 pts | Z-score: 2.08 | 99.5th percentile
      3. Drew Brees
         Raw: 23.9 pts | Z-score: 1.86 | 98.9th percentile

RB Benchmarks:
   Mean: 6.77
   Std Dev: 4.22
   Top 3 RB (normalized):
      1. Christian McCaffrey
         Raw: 22.0 pts | Z-score: 3.61 | 100.0th percentile
      2. Alvin Kamara
         Raw: 19.15 pts | Z-score: 2.93 | 99.7th percentile
      3. Jahmyr Gibbs
         Raw: 18.73 pts | Z-score: 2.83 | 99.5th percentile

WR Benchmarks:
   Mean: 6.29
   Std Dev: 4.04
   Top 3 WR (normalized):
      1. Antonio Brown
         Raw: 19.4 pts | Z-score: 3.25 | 100.0th percentile
      2. Justin Jefferson
         Raw: 19.01 pts | Z-score: 3.15 | 99.8th percentile
      3. Ja'Marr Chase
         Raw: 18.81 pts | Z-score: 3.1 | 99.7th percentile

TE Benchmarks:
   Mean: 4.89
  

4. Additional Factors

Player age, seasons played, and general position trends (e.g., typical weekly output) are incorporated.

This helps adjust rankings for players who may be improving, aging, or trending differently relative to their peers.

In [31]:
all_normalized_players = []

for position, pos_data in normalized_players.items():
    pos_df = pos_data.reset_index()
    all_normalized_players.append(pos_df)

# Combine all normalized data
final_player_analysis = pd.concat(all_normalized_players, ignore_index=True)

# Save normalized results
normalized_file = 'Data/player_normalized_stats.csv'
final_player_analysis.to_csv(normalized_file, index=False)


In [38]:
comprehensive_players = {}

for position in fantasy_positions:
    if position in normalized_players and position in position_reliability:
        # Get normalized stats
        norm_data = normalized_players[position].copy()
        norm_data = norm_data.reset_index()
        
        # Get reliability stats  
        reliability_data = position_reliability[position].copy()
        reliability_data = reliability_data.reset_index()
        
        # Merge on player info
        merged = pd.merge(
            norm_data, 
            reliability_data[['player_id', 'player_name', 'reliability_index']], 
            on=['player_id', 'player_name'], 
            how='left'
        )
        
        # fill missing scores with penality
        merged['reliability_index'] = merged['reliability_index'].fillna(0.5)  
        
        # Calculate reliability z-score within position 
        reliability_mean = merged['reliability_index'].mean()
        reliability_std = merged['reliability_index'].std()
        merged['reliability_zscore'] = (
            (merged['reliability_index'] - reliability_mean) / reliability_std
        ).round(2)

        merged['value_score'] = (
            (merged['fantasy_zscore'] * 0.6) +          # 60% performance vs position  
            (merged['reliability_zscore'] * 0.4)        # 40% consistency vs position
        ).round(2)
        
        # Sort by comprehensive value
        merged = merged.sort_values('value_score', ascending=False)
        
        comprehensive_players[position] = merged
        
        print(f"\n{position} - Top 5 Most Valuable Players:")
        top_valuable = merged.head(5)# Create a player value score combining all metrics

        for i, player in top_valuable.iterrows():
            name = player['player_name']
            raw_pts = player['fantasy_points_ppr']
            zscore = player['fantasy_zscore']
            percentile = player['fantasy_percentile']
            reliability = player['reliability_index']
            value = player['value_score']
            
            print(f"   {i+1}. {name}")
            print(f"      Value Score: {value} | Raw: {raw_pts} pts | Z-score: {zscore}")
            print(f"      Percentile: {percentile}% | Reliability: {reliability}")



QB - Top 5 Most Valuable Players:
   1. Patrick Mahomes
      Value Score: 2.22 | Raw: 26.0 pts | Z-score: 2.16
      Percentile: 100.0% | Reliability: 2.4
   2. Josh Allen
      Value Score: 1.99 | Raw: 25.43 pts | Z-score: 2.08
      Percentile: 99.5% | Reliability: 2.15
   11. Andrew Luck
      Value Score: 1.81 | Raw: 22.73 pts | Z-score: 1.69
      Percentile: 94.5% | Reliability: 2.23
   3. Drew Brees
      Value Score: 1.8 | Raw: 23.9 pts | Z-score: 1.86
      Percentile: 98.9% | Reliability: 2.08
   4. Jayden Daniels
      Value Score: 1.76 | Raw: 23.89 pts | Z-score: 1.86
      Percentile: 98.4% | Reliability: 2.02

RB - Top 5 Most Valuable Players:
   1. Christian McCaffrey
      Value Score: 3.36 | Raw: 22.0 pts | Z-score: 3.61
      Percentile: 100.0% | Reliability: 2.02
   3. Jahmyr Gibbs
      Value Score: 2.68 | Raw: 18.73 pts | Z-score: 2.83
      Percentile: 99.5% | Reliability: 1.83
   2. Alvin Kamara
      Value Score: 2.66 | Raw: 19.15 pts | Z-score: 2.93
      Per