# Data Cleaning

This notebook handles data cleaning for the integrated dataset.

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load Dataset

In [24]:
# Read the integrated dataset
df = pd.read_csv('integrated_df.csv')

print(f"Dataset shape: {df.shape}")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Dataset shape: (27222, 16)
Number of rows: 27222
Number of columns: 16


## Initial Data Inspection

In [25]:
# Display first few rows
df.head()

Unnamed: 0,name,status,chance_of_playing_next_round,chance_of_playing_this_round,now_cost,event_points,ep_next,ep_this,gw,position,team,xP,minutes,starts,total_points,value
0,Karl Hein,u,0.0,0.0,4.0,0,0.0,0.0,23,GK,Arsenal,0.0,0,0,0,40
1,Jurriën Timber,a,100.0,100.0,5.6,5,2.0,2.5,23,DEF,Arsenal,2.3,90,1,5,56
2,Jorge Luiz Frello Filho,a,100.0,100.0,4.7,0,0.5,1.0,23,MID,Arsenal,1.1,0,0,0,47
3,Jakub Kiwior,a,,,4.8,0,0.0,0.5,23,DEF,Arsenal,0.5,0,0,0,48
4,Gabriel Martinelli Silva,a,100.0,100.0,6.7,3,3.0,3.5,23,MID,Arsenal,4.7,86,1,3,67


In [26]:
# Display dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27222 entries, 0 to 27221
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   name                          27222 non-null  object 
 1   status                        27222 non-null  object 
 2   chance_of_playing_next_round  17130 non-null  float64
 3   chance_of_playing_this_round  16823 non-null  float64
 4   now_cost                      27222 non-null  float64
 5   event_points                  27222 non-null  int64  
 6   ep_next                       27219 non-null  float64
 7   ep_this                       27222 non-null  float64
 8   gw                            27222 non-null  int64  
 9   position                      27222 non-null  object 
 10  team                          27222 non-null  object 
 11  xP                            27222 non-null  float64
 12  minutes                       27222 non-null  int64  
 13  s

In [27]:
# Check for missing values
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing_Count': missing_values,
    'Missing_Percentage': missing_percentage
})

# Display only columns with missing values
missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

Unnamed: 0,Missing_Count,Missing_Percentage
chance_of_playing_this_round,10399,38.20072
chance_of_playing_next_round,10092,37.072956
ep_next,3,0.01102


In [28]:
# Display summary statistics
df.describe()

Unnamed: 0,chance_of_playing_next_round,chance_of_playing_this_round,now_cost,event_points,ep_next,ep_this,gw,xP,minutes,starts,total_points,value
count,17130.0,16823.0,27222.0,27222.0,27219.0,27222.0,27222.0,27222.0,27222.0,27222.0,27222.0,27222.0
mean,54.441039,53.588837,4.857149,1.216994,1.148907,1.183521,20.251378,1.122651,27.133201,0.303872,1.216994,48.593086
std,48.487861,48.612877,1.111858,2.466298,1.897932,1.932179,10.925046,2.086232,37.924507,0.459936,2.466298,11.116539
min,0.0,0.0,0.5,-5.0,-2.2,-1.7,1.0,-2.2,0.0,0.0,-5.0,5.0
25%,0.0,0.0,4.3,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,44.0
50%,100.0,100.0,4.5,0.0,0.3,0.3,21.0,0.0,0.0,0.0,0.0,45.0
75%,100.0,100.0,5.1,1.0,1.8,1.9,30.0,1.7,71.0,1.0,1.0,51.0
max,100.0,100.0,15.4,29.0,22.0,26.4,38.0,52.8,90.0,1.0,29.0,154.0


## Compare now_cost vs value

Check if `now_cost * 10` equals `value` (these measure the same thing from different sources)

In [29]:
# Create comparison column
df['cost_match'] = (df['now_cost'] * 10) == df['value']

# Calculate statistics
total_rows = len(df)
matching_rows = df['cost_match'].sum()
mismatching_rows = total_rows - matching_rows

print("=" * 60)
print("COMPARISON: now_cost * 10 vs value")
print("=" * 60)
print(f"Total rows: {total_rows}")
print(f"Matching rows: {matching_rows} ({matching_rows/total_rows*100:.2f}%)")
print(f"Mismatching rows: {mismatching_rows} ({mismatching_rows/total_rows*100:.2f}%)")
print()

# If there are mismatches, show the differences
if mismatching_rows > 0:
    df['difference'] = df['value'] - (df['now_cost'] * 10)
    
    print("Difference Statistics (value - now_cost*10):")
    print(df['difference'].describe())
    print()
    
    # Distribution of mismatches by game week
    print("=" * 60)
    print("DISTRIBUTION OF MISMATCHES BY GAME WEEK (GW)")
    print("=" * 60)
    
    mismatch_by_gw = df[~df['cost_match']].groupby('gw').size()
    total_by_gw = df.groupby('gw').size()
    
    gw_distribution = pd.DataFrame({
        'Total_Rows': total_by_gw,
        'Mismatches': mismatch_by_gw,
        'Mismatch_Percentage': (mismatch_by_gw / total_by_gw * 100).round(2)
    })
    
    print(gw_distribution)
    print()
    
    print("Summary by GW:")
    print(f"GWs with mismatches: {len(mismatch_by_gw)}")
    print(f"GW with most mismatches: GW{mismatch_by_gw.idxmax()} ({mismatch_by_gw.max()} mismatches)")
    print()
    
    print("Sample of mismatching rows:")
    print(df[~df['cost_match']][['gw', 'now_cost', 'value', 'difference']].head(10))
else:
    print("✓ All values match perfectly!")

COMPARISON: now_cost * 10 vs value
Total rows: 27222
Matching rows: 25700 (94.41%)
Mismatching rows: 1522 (5.59%)

Difference Statistics (value - now_cost*10):
count    27222.000000
mean         0.021600
std          0.240105
min         -2.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          2.000000
Name: difference, dtype: float64

DISTRIBUTION OF MISMATCHES BY GAME WEEK (GW)
    Total_Rows  Mismatches  Mismatch_Percentage
gw                                             
1          616         1.0                 0.16
2          627        70.0                11.16
3          648        58.0                 8.95
4          659        28.0                 4.25
5          661        54.0                 8.17
6          664        54.0                 8.13
7          666        21.0                 3.15
8          667        52.0                 7.80
9          670        33.0                 4.93
10         674        49.0                 7.27
11       

### Create Reconciled Cost Column

Create `cost_reconciled` as the maximum of `now_cost * 10` and `value`

In [30]:
# Create reconciled cost column using maximum of now_cost*10 and value
df['cost_reconciled'] = np.maximum(df['now_cost'] * 10, df['value'])

print("Created 'cost_reconciled' column")
print(f"\nSample of cost_reconciled values:")
print(df[['now_cost', 'value', 'cost_reconciled']].head(10))

Created 'cost_reconciled' column

Sample of cost_reconciled values:
   now_cost  value  cost_reconciled
0       4.0     40             40.0
1       5.6     56             56.0
2       4.7     47             47.0
3       4.8     48             48.0
4       6.7     67             67.0
5       4.5     45             45.0
6       8.2     83             83.0
7       5.5     56             56.0
8       6.2     62             62.0
9      10.2    102            102.0


## Compare event_points vs total_points

Check if `event_points` equals `total_points` (these should be identical from different sources)

In [31]:
# Create comparison column
df['points_match'] = df['event_points'] == df['total_points']

# Calculate statistics
total_rows = len(df)
matching_rows = df['points_match'].sum()
mismatching_rows = total_rows - matching_rows

print("=" * 60)
print("COMPARISON: event_points vs total_points")
print("=" * 60)
print(f"Total rows: {total_rows}")
print(f"Matching rows: {matching_rows} ({matching_rows/total_rows*100:.2f}%)")
print(f"Mismatching rows: {mismatching_rows} ({mismatching_rows/total_rows*100:.2f}%)")
print()

# If there are mismatches, show the differences
if mismatching_rows > 0:
    df['points_difference'] = df['total_points'] - df['event_points']
    
    print("Difference Statistics (total_points - event_points):")
    print(df['points_difference'].describe())
    print()
    
    # Distribution of mismatches by game week
    print("=" * 60)
    print("DISTRIBUTION OF MISMATCHES BY GAME WEEK (GW)")
    print("=" * 60)
    
    mismatch_by_gw = df[~df['points_match']].groupby('gw').size()
    total_by_gw = df.groupby('gw').size()
    
    gw_distribution = pd.DataFrame({
        'Total_Rows': total_by_gw,
        'Mismatches': mismatch_by_gw,
        'Mismatch_Percentage': (mismatch_by_gw / total_by_gw * 100).round(2)
    })
    
    print(gw_distribution)
    print()
    
    print("Summary by GW:")
    print(f"GWs with mismatches: {len(mismatch_by_gw)}")
    print(f"GW with most mismatches: GW{mismatch_by_gw.idxmax()} ({mismatch_by_gw.max()} mismatches)")
    print()
    
    print("Sample of mismatching rows:")
    print(df[~df['points_match']][['gw', 'event_points', 'total_points', 'points_difference']].head(10))
else:
    print("✓ All values match perfectly!")

COMPARISON: event_points vs total_points
Total rows: 27222
Matching rows: 27222 (100.00%)
Mismatching rows: 0 (0.00%)

✓ All values match perfectly!


## Compare ep_this vs xP

Check if `ep_this` equals `xP` (both represent expected points from different sources)

In [32]:
# Create comparison column
df['ep_match'] = df['ep_this'] == df['xP']

# Calculate statistics
total_rows = len(df)
matching_rows = df['ep_match'].sum()
mismatching_rows = total_rows - matching_rows

print("=" * 60)
print("COMPARISON: ep_this vs xP")
print("=" * 60)
print(f"Total rows: {total_rows}")
print(f"Matching rows: {matching_rows} ({matching_rows/total_rows*100:.2f}%)")
print(f"Mismatching rows: {mismatching_rows} ({mismatching_rows/total_rows*100:.2f}%)")
print()

# If there are mismatches, show the differences
if mismatching_rows > 0:
    df['ep_difference'] = df['xP'] - df['ep_this']
    
    print("Difference Statistics (xP - ep_this):")
    print(df['ep_difference'].describe())
    print()
    
    # Check for game weeks where xP is all zeros
    print("=" * 60)
    print("GAME WEEKS WHERE xP IS ALL ZEROS")
    print("=" * 60)
    
    zero_xP_by_gw = df[df['xP'] == 0].groupby('gw').size()
    total_by_gw = df.groupby('gw').size()
    
    gws_all_zero = zero_xP_by_gw[zero_xP_by_gw == total_by_gw].index.tolist()
    if gws_all_zero:
        print(f"GWs with all xP = 0: {gws_all_zero}")
    else:
        print("No GWs with all xP = 0")
    print()
    
    # Distribution of mismatches by game week
    print("=" * 60)
    print("DISTRIBUTION OF MISMATCHES BY GAME WEEK (GW)")
    print("=" * 60)
    
    mismatch_by_gw = df[~df['ep_match']].groupby('gw').size()
    total_by_gw = df.groupby('gw').size()
    
    gw_distribution = pd.DataFrame({
        'Total_Rows': total_by_gw,
        'Mismatches': mismatch_by_gw,
        'Mismatch_Percentage': (mismatch_by_gw / total_by_gw * 100).round(2)
    })
    
    # Fill NaN with 0 (GWs with no mismatches)
    gw_distribution = gw_distribution.fillna(0)
    # Convert to int for cleaner display
    gw_distribution['Mismatches'] = gw_distribution['Mismatches'].astype(int)
    
    print(gw_distribution)
    print()
    print("Note: Mismatches = 0 means ep_this == xP for all rows in that GW")
    print()
    
    print("Summary by GW:")
    gws_with_mismatches = gw_distribution[gw_distribution['Mismatches'] > 0]
    print(f"GWs with mismatches: {len(gws_with_mismatches)}")
    if len(gws_with_mismatches) > 0:
        max_mismatch_gw = gws_with_mismatches['Mismatches'].idxmax()
        print(f"GW with most mismatches: GW{max_mismatch_gw} ({gws_with_mismatches.loc[max_mismatch_gw, 'Mismatches']} mismatches)")
    print()
    
    print("Sample of mismatching rows:")
    print(df[~df['ep_match']][['gw', 'ep_this', 'xP', 'ep_difference', 'total_points']].head(10))
else:
    print("✓ All values match perfectly!")

COMPARISON: ep_this vs xP
Total rows: 27222
Matching rows: 23515 (86.38%)
Mismatching rows: 3707 (13.62%)

Difference Statistics (xP - ep_this):
count    27222.00000
mean        -0.06087
std          0.90677
min        -22.00000
25%          0.00000
50%          0.00000
75%          0.00000
max         26.40000
Name: ep_difference, dtype: float64

GAME WEEKS WHERE xP IS ALL ZEROS
GWs with all xP = 0: [22, 32, 34]

DISTRIBUTION OF MISMATCHES BY GAME WEEK (GW)
    Total_Rows  Mismatches  Mismatch_Percentage
gw                                             
1          616           0                 0.00
2          627           0                 0.00
3          648           0                 0.00
4          659           0                 0.00
5          661           0                 0.00
6          664           0                 0.00
7          666           0                 0.00
8          667           0                 0.00
9          670           0                 0.00
10       

### Create Reconciled Expected Points Column

Create `ep_reconciled` by choosing the value (ep_this or xP) that is closest to total_points

In [33]:
# Calculate absolute differences from total_points for both columns
diff_ep_this = np.abs(df['ep_this'] - df['total_points'])
diff_xP = np.abs(df['xP'] - df['total_points'])

# Choose the value with smaller difference (argmin logic)
df['ep_reconciled'] = np.where(diff_ep_this <= diff_xP, df['ep_this'], df['xP'])

# Create indicator showing which column was chosen
df['ep_source'] = np.where(diff_ep_this <= diff_xP, 'ep_this', 'xP')

print("Created 'ep_reconciled' column")
print(f"\nSource distribution:")
print(df['ep_source'].value_counts())
print(f"\nSource distribution by GW:")
print(df.groupby('gw')['ep_source'].value_counts().unstack(fill_value=0))
print(f"\nSample of reconciled values:")
print(df[['gw', 'ep_this', 'xP', 'total_points', 'ep_reconciled', 'ep_source']].head(10))

Created 'ep_reconciled' column

Source distribution:
ep_source
ep_this    25487
xP          1735
Name: count, dtype: int64

Source distribution by GW:
ep_source  ep_this   xP
gw                     
1              616    0
2              627    0
3              648    0
4              659    0
5              661    0
6              664    0
7              666    0
8              667    0
9              670    0
10             674    0
11             505  173
12             684    0
13             566  124
14             693    0
15             630    0
16             701    0
17             568  133
18             705    0
19             588  121
20             595  116
21             724    0
22             479  250
23             741   14
24             690   71
25             766   12
26             666  116
27             774    9
28             767   21
29             559   79
30             790    0
31             791    0
32             516  282
33             782   17
34       

## Mean Absolute Error (MAE) Analysis

Evaluate prediction accuracy of xP, ep_this, and ep_reconciled against actual total_points

In [34]:
# Calculate MAE for each prediction method
mae_xP = np.abs(df['xP'] - df['total_points']).mean()
mae_ep_this = np.abs(df['ep_this'] - df['total_points']).mean()
mae_ep_reconciled = np.abs(df['ep_reconciled'] - df['total_points']).mean()

print("=" * 60)
print("OVERALL MEAN ABSOLUTE ERROR (MAE)")
print("=" * 60)
print(f"MAE (xP vs total_points):           {mae_xP:.4f}")
print(f"MAE (ep_this vs total_points):      {mae_ep_this:.4f}")
print(f"MAE (ep_reconciled vs total_points): {mae_ep_reconciled:.4f}")
print()

# Calculate improvement
best_individual = min(mae_xP, mae_ep_this)
improvement = ((best_individual - mae_ep_reconciled) / best_individual) * 100
print(f"Improvement from reconciliation: {improvement:.2f}%")
print()

# Calculate MAE by game week
mae_by_gw = df.groupby('gw').apply(lambda x: pd.Series({
    'MAE_xP': np.abs(x['xP'] - x['total_points']).mean(),
    'MAE_ep_this': np.abs(x['ep_this'] - x['total_points']).mean(),
    'MAE_ep_reconciled': np.abs(x['ep_reconciled'] - x['total_points']).mean()
}))

print("MAE by Game Week:")
print(mae_by_gw)

OVERALL MEAN ABSOLUTE ERROR (MAE)
MAE (xP vs total_points):           0.9497
MAE (ep_this vs total_points):      0.8884
MAE (ep_reconciled vs total_points): 0.8452

Improvement from reconciliation: 4.86%

MAE by Game Week:
      MAE_xP  MAE_ep_this  MAE_ep_reconciled
gw                                          
1   1.311364     1.311364           1.311364
2   0.821053     0.821053           0.821053
3   0.806944     0.806944           0.806944
4   0.947496     0.947496           0.947496
5   0.772163     0.772163           0.772163
6   0.903614     0.903614           0.903614
7   0.978529     0.978529           0.978529
8   0.853823     0.853823           0.853823
9   0.838507     0.838507           0.838507
10  0.797626     0.797626           0.797626
11  0.832891     0.885988           0.754425
12  0.888889     0.888889           0.888889
13  0.851304     0.858696           0.779855
14  1.064358     1.064358           1.064358
15  0.885714     0.885714           0.885714
16  0.869187

  mae_by_gw = df.groupby('gw').apply(lambda x: pd.Series({


## (Dated) Data Validation: chance_of_playing_this_round vs minutes

Check if there are violations where `chance_of_playing_this_round = 0` but `minutes > 0`

In [36]:
# Find rows where chance_of_playing_this_round = 0
zero_chance_rows = df[df['chance_of_playing_this_round'] == 0]

# Among those, find violations where minutes > 0
violations = zero_chance_rows[zero_chance_rows['minutes'] > 0]

print("=" * 60)
print("DATA VALIDATION: chance_of_playing_this_round vs minutes")
print("=" * 60)
print(f"Total rows where chance_of_playing_this_round = 0: {len(zero_chance_rows)}")
print(f"Violations (minutes > 0 despite 0% chance): {len(violations)}")

if len(violations) > 0:
    violation_pct = (len(violations) / len(zero_chance_rows)) * 100
    print(f"Violation rate: {violation_pct:.2f}%")
    print()
    
    print("Minutes distribution in violations:")
    print(violations['minutes'].describe())
    print()
    
    # Distribution by game week
    print("=" * 60)
    print("VIOLATIONS BY GAME WEEK")
    print("=" * 60)
    
    violations_by_gw = violations.groupby('gw').size()
    zero_chance_by_gw = zero_chance_rows.groupby('gw').size()
    
    gw_validation = pd.DataFrame({
        'Total_0%_Chance': zero_chance_by_gw,
        'Violations': violations_by_gw,
        'Violation_Percentage': (violations_by_gw / zero_chance_by_gw * 100).round(2)
    })
    
    # Fill NaN with 0 (GWs with no violations)
    gw_validation = gw_validation.fillna(0)
    gw_validation['Violations'] = gw_validation['Violations'].astype(int)
    
    print(gw_validation)
    print()
    
    print("Sample of violation rows:")
    print(violations[['gw', 'name', 'chance_of_playing_this_round', 'minutes', 'total_points']].head(20))
else:
    print("\n✓ No violations found! All rows with 0% chance have 0 minutes.")

DATA VALIDATION: chance_of_playing_this_round vs minutes
Total rows where chance_of_playing_this_round = 0: 7354
Violations (minutes > 0 despite 0% chance): 18
Violation rate: 0.24%

Minutes distribution in violations:
count    18.000000
mean     43.388889
std      33.011832
min       1.000000
25%      15.500000
50%      30.500000
75%      77.500000
max      90.000000
Name: minutes, dtype: float64

VIOLATIONS BY GAME WEEK
    Total_0%_Chance  Violations  Violation_Percentage
gw                                                   
1                78           0                  0.00
2                92           0                  0.00
3               145           0                  0.00
4               164           0                  0.00
5               168           0                  0.00
6               173           1                  0.58
7               171           0                  0.00
8               173           0                  0.00
9               177           0   

## (Dated) 
Check if there are violations where `chance_of_playing_this_round = 100` but `minutes = 0`

In [37]:
# Find rows where chance_of_playing_this_round = 100
full_chance_rows = df[df['chance_of_playing_this_round'] == 100]
violations_100 = full_chance_rows[full_chance_rows['minutes'] == 0]

print("=" * 60)
print("DATA VALIDATION: chance_of_playing_this_round = 100 vs minutes")
print("=" * 60)
print(f"Total rows where chance_of_playing_this_round = 100: {len(full_chance_rows)}")
print(f"Violations (minutes = 0 despite 100% chance): {len(violations_100)}")

if len(violations_100) > 0:
    violation_pct = (len(violations_100) / len(full_chance_rows)) * 100
    print(f"Violation rate: {violation_pct:.2f}%")
    print()
    
    violations_by_gw = violations_100.groupby('gw').size()
    full_chance_by_gw = full_chance_rows.groupby('gw').size()
    
    gw_validation = pd.DataFrame({
        'Total_100%_Chance': full_chance_by_gw,
        'Violations': violations_by_gw,
        'Violation_Percentage': (violations_by_gw / full_chance_by_gw * 100).round(2)
    }).fillna(0)
    gw_validation['Violations'] = gw_validation['Violations'].astype(int)
    
    print(gw_validation)
    print("\nSample of violation rows:")
    print(violations_100[['gw', 'name', 'chance_of_playing_this_round', 'minutes', 'total_points']].head(20))
else:
    print("\n✓ No violations found! All rows with 100% chance have minutes > 0.")


DATA VALIDATION: chance_of_playing_this_round = 100 vs minutes
Total rows where chance_of_playing_this_round = 100: 8432
Violations (minutes = 0 despite 100% chance): 2073
Violation rate: 24.58%

    Total_100%_Chance  Violations  Violation_Percentage
gw                                                     
1                   4           2                 50.00
2                   8           1                 12.50
3                  32          11                 34.38
4                  56          15                 26.79
5                  75          20                 26.67
6                  86          21                 24.42
7                 100          23                 23.00
8                 121          31                 25.62
9                 143          36                 25.17
10                149          37                 24.83
11                167          49                 29.34
12                162          43                 26.54
13                20

## (Dated) 

In [38]:
# Function to add noise with bounds checking
def add_noise(minutes, sigma=3, seed=42):
    """Add Gaussian noise to minutes, keeping values in [0, 90]."""
    np.random.seed(seed)
    noisy_minutes = np.zeros(len(minutes))
    
    for i, m in enumerate(minutes):
        while True:
            noise = np.random.normal(0, sigma)
            noisy = m + noise
            if 0 <= noisy <= 90:
                noisy_minutes[i] = noisy
                break
    
    return noisy_minutes

# Step 1: Add noise to minutes
print("Adding noise to minutes...")
df['minutes_noisy'] = add_noise(df['minutes'].values, sigma=3)

# Step 2: Apply sigmoid transformation
print("Applying sigmoid transformation...")
x = (df['minutes_noisy'] / 90 - 0.5) * 12  # center and scale
df['chance_proxy'] = 1 / (1 + np.exp(-x))

# Display results
print("\n" + "=" * 60)
print("CHANCE_PROXY CREATED")
print("=" * 60)
print(f"Range: [{df['chance_proxy'].min():.4f}, {df['chance_proxy'].max():.4f}]")
print(f"Mean: {df['chance_proxy'].mean():.4f}")
print(f"Std: {df['chance_proxy'].std():.4f}")
print(f"\nSample comparison:")
print(df[['minutes', 'minutes_noisy', 'chance_proxy']].head(10))


Adding noise to minutes...
Applying sigmoid transformation...

CHANCE_PROXY CREATED
Range: [0.0025, 0.9975]
Mean: 0.3033
Std: 0.4425

Sample comparison:
   minutes  minutes_noisy  chance_proxy
0        0       1.490142      0.003014
1       90      89.585207      0.997387
2        0       1.943066      0.003202
3        0       4.569090      0.004538
4       86      85.297540      0.995381
5       45      44.297589      0.476603
6        0       4.737638      0.004640
7       90      88.591577      0.997018
8       90      88.609747      0.997025
9        0       0.725887      0.002723


## (Dated) 

In [40]:
# Select and rename columns
output_df = df[['gw', 'name', 'cost_reconciled', 'event_points', 'position', 'team', 'minutes', 'ep_reconciled', 'chance_proxy']].copy()

# Rename columns
output_df = output_df.rename(columns={
    'cost_reconciled': 'cost',
    'event_points': 'points',
    'ep_reconciled': 'eP',
    'chance_proxy': 'prob_showup'
})

# Sort by gw in ascending order
output_df = output_df.sort_values('gw', ascending=True)

# Export to CSV
output_df.to_csv('cleaned_data.csv', index=False)

print(f"CSV created with shape: {output_df.shape}")
print(f"\nColumn names in output:")
print(output_df.columns.tolist())
print(f"\nFirst few rows (sorted by gw):")
print(output_df.head(10))
print(f"\nLast few rows:")
print(output_df.tail(10))


CSV created with shape: (27222, 9)

Column names in output:
['gw', 'name', 'cost', 'points', 'position', 'team', 'minutes', 'eP', 'prob_showup']

First few rows (sorted by gw):
       gw                               name  cost  points position  \
18760   1                          Joe Aribo  50.0       2      MID   
18778   1                       Paul Onuachu  50.0       0      FWD   
18745   1                     Jonathan Panzo  40.0       0      DEF   
18744   1                 Andrew Omobamidele  45.0       0      DEF   
18743   1                      Lewis O'Brien  50.0       0      MID   
18742   1                      Omar Richards  45.0       0      DEF   
18741   1                      Neco Williams  45.0       1      DEF   
18740   1  Murillo Santiago Costa dos Santos  45.0       2      DEF   
18786   1                       Flynn Downes  50.0       2      MID   
18772   1                  Juan Larios López  40.0       0      DEF   

                team  minutes   eP  prob_

## Player Set Issue

In [41]:
# Count distribution of player appearances across GWs
player_gw_counts = output_df.groupby('name').size()
distribution = player_gw_counts.value_counts().sort_index(ascending=False)

print("Player Distribution by GW Appearances:")
print("=" * 40)
for gw_count in range(38, 0, -1):
    num_players = distribution.get(gw_count, 0)
    print(f"{num_players:3d} players present in {gw_count:2d} GWs")


Player Distribution by GW Appearances:
417 players present in 38 GWs
116 players present in 37 GWs
105 players present in 36 GWs
  9 players present in 35 GWs
 11 players present in 34 GWs
  4 players present in 33 GWs
  2 players present in 32 GWs
  2 players present in 31 GWs
  3 players present in 30 GWs
  2 players present in 29 GWs
  4 players present in 28 GWs
  7 players present in 27 GWs
  6 players present in 26 GWs
  3 players present in 25 GWs
  1 players present in 24 GWs
  8 players present in 23 GWs
  1 players present in 22 GWs
  2 players present in 21 GWs
  5 players present in 20 GWs
  2 players present in 19 GWs
 13 players present in 18 GWs
  5 players present in 17 GWs
 14 players present in 16 GWs
 15 players present in 15 GWs
 19 players present in 14 GWs
  5 players present in 13 GWs
  2 players present in 12 GWs
  3 players present in 11 GWs
  2 players present in 10 GWs
  1 players present in  9 GWs
  2 players present in  8 GWs
  6 players present in  7 GWs
 

In [42]:
# Find which GWs are missing for players in 37 GWs
player_gw_counts = output_df.groupby('name').size()
players_37_gws = player_gw_counts[player_gw_counts == 37].index

missing_gw_distribution_37 = {}
for player in players_37_gws:
    player_gws = set(output_df[output_df['name'] == player]['gw'])
    missing_gw = list(set(range(1, 39)) - player_gws)[0]  # Only 1 missing
    missing_gw_distribution_37[missing_gw] = missing_gw_distribution_37.get(missing_gw, 0) + 1

print("Players in 37 GWs - Missing GW Distribution:")
print("=" * 40)
for gw in sorted(missing_gw_distribution_37.keys()):
    print(f"GW {gw:2d}: {missing_gw_distribution_37[gw]:3d} players")
print()

# Find which GWs are missing for players in 36 GWs
players_36_gws = player_gw_counts[player_gw_counts == 36].index

missing_gw_pairs_36 = {}
for player in players_36_gws:
    player_gws = set(output_df[output_df['name'] == player]['gw'])
    missing_gws = tuple(sorted(set(range(1, 39)) - player_gws))  # 2 missing GWs
    missing_gw_pairs_36[missing_gws] = missing_gw_pairs_36.get(missing_gws, 0) + 1

print("Players in 36 GWs - Missing GW Pair Distribution:")
print("=" * 40)
for gw_pair in sorted(missing_gw_pairs_36.keys()):
    print(f"GWs {gw_pair}: {missing_gw_pairs_36[gw_pair]:3d} players")


Players in 37 GWs - Missing GW Distribution:
GW  1:   7 players
GW 15:  26 players
GW 29:  33 players
GW 34:  50 players

Players in 36 GWs - Missing GW Pair Distribution:
GWs (1, 2):  11 players
GWs (1, 15):   2 players
GWs (1, 34):   2 players
GWs (15, 29):  28 players
GWs (29, 34):  62 players


In [43]:
# Get players in at least 36 GWs
player_gw_counts = output_df.groupby('name').size()
players_at_least_36 = player_gw_counts[player_gw_counts >= 36].index

# For each target GW, find which of these players are missing
target_gws = [1, 15, 29, 34]

print("Players (in AT LEAST 36 GWs) Missing in Target GWs:")
print("=" * 60)

for target_gw in target_gws:
    # Players present in this GW
    players_in_gw = set(output_df[output_df['gw'] == target_gw]['name'])
    
    # Players (at least 36 GWs) missing in this GW
    missing_players = sorted(set(players_at_least_36) - players_in_gw)
    
    print(f"\nGW {target_gw:2d}: {len(missing_players)} players missing")
    for player in missing_players:
        print(f"  - {player}")


Players (in AT LEAST 36 GWs) Missing in Target GWs:

GW  1: 22 players missing
  - André Trindade da Costa Neto
  - Asmir Begovic
  - Bastien Meupiyou
  - Bilal El Khannouss
  - Chiedozie Ogbene
  - Dara O'Shea
  - Ethan Wheatley
  - Ferdi Kadioglu
  - Francisco Evanilson de Lima Barbosa
  - Georginio Rutter
  - Gustavo Nunes Fernandes Gomes
  - Ilkay Gündogan
  - Issa Kaboré
  - Jack Clarke
  - Jens Cajuste
  - João Félix Sequeira
  - Kaelan Casey
  - Mateus Gonçalo Espanha Fernandes
  - Matt O'Riley
  - Ramón Sosa
  - Roman Dixon
  - Sander Berge

GW 15: 56 players missing
  - Abdoulaye Doucouré
  - Alexis Mac Allister
  - Alisson Ramses Becker
  - Andrew Robertson
  - Armando Broja
  - Ashley Young
  - Asmir Begovic
  - Ben Doak
  - Bobby Clark
  - Caoimhin Kelleher
  - Cody Gakpo
  - Conor Bradley
  - Curtis Jones
  - Darwin Núñez Ribeiro
  - Diogo Teixeira da Silva
  - Dominic Calvert-Lewin
  - Dominik Szoboszlai
  - Dwight McNeil
  - Endo Wataru
  - Harrison Armstrong
  - Harvey 

## Player Set Standardization Strategy

### Problem
Not every game week has the same players available, making optimization difficult. Some players appear in all 38 GWs, while others appear in fewer.

### Solution: Focus on Consistent Players (≥36 GWs)

We standardize the dataset by **including only players who appeared in at least 36 GWs** (~638 players), ensuring every GW has the same player roster.

#### Why ≥36 GWs?
- Recovers **~200 additional high-quality players** beyond the 417 who played all 38 GWs
- **Key insight**: Missing GWs are **not consecutive** - they concentrate in specific game weeks (GW 1, 15, 29, 34)
- Players with <36 appearances are too sporadic to be reliable options

#### Imputation for Missing GWs
For the 221 players (116+105) missing 1-2 GWs, we impute missing data with:

**Cost Interpolation Rules:**
- **Missing GW1**: Use GW2 cost if available, otherwise use GW3 cost
- **Missing GW2**: Use GW3 cost  
- **Missing GW3-38**: Linear interpolation `(cost_{i-1} + cost_{i+1}) / 2`, rounded to nearest integer

**Other Stats:**
- Points, eP, prob_showup, minutes: All set to 0
- Unavailable flag: Set to 1

**Why this works**: Non-consecutive missing pattern means neighboring GWs provide reliable cost estimates with minimal error. Hardcoded edge cases for GW1/GW2 handle the rare cases where early-season data is missing.

### Outcome
- Dataset contains **~638 players** who played nearly full season
- Every GW has **identical player count**
- `unavailable` column distinguishes real vs imputed data
- Cleaner dataset focusing on realistic player options


In [45]:
## Standardize Player Set - Focus on Players in ≥36 GWs Only

print("=" * 60)
print("STANDARDIZING PLAYER SET (≥36 GWs)")
print("=" * 60)

# Get player appearance counts
player_gw_counts = output_df.groupby('name').size()

# Keep only players in at least 36 GWs
players_to_keep = player_gw_counts[player_gw_counts >= 36].index.tolist()

print(f"Players in ≥36 GWs: {len(players_to_keep)}")
print(f"Players excluded (<36 GWs): {len(player_gw_counts) - len(players_to_keep)}")
print()

# Filter to keep only these players
filtered_df = output_df[output_df['name'].isin(players_to_keep)].copy()

# Add unavailable column (0 for existing data)
filtered_df['unavailable'] = 0

# Create list to store all data
all_data = filtered_df.to_dict('records')

print("=" * 60)
print("IMPUTING MISSING GWs")
print("=" * 60)

imputed_count = 0

for player in players_to_keep:
    player_data = filtered_df[filtered_df['name'] == player]
    existing_gws = set(player_data['gw'])
    missing_gws = sorted(set(range(1, 39)) - existing_gws)
    
    if len(missing_gws) == 0:
        continue
    
    # Get player metadata
    first_row = player_data.iloc[0]
    position = first_row['position']
    team = first_row['team']
    
    for missing_gw in missing_gws:
        # Hardcoded interpolation logic
        if missing_gw == 1:
            # Use GW3 if both GW1 and GW2 are missing
            gw2_data = player_data[player_data['gw'] == 2]
            if len(gw2_data) > 0:
                cost_interpolated = gw2_data['cost'].values[0]
            else:
                # GW2 also missing, use GW3
                cost_interpolated = player_data[player_data['gw'] == 3]['cost'].values[0]
        elif missing_gw == 2:
            # Use GW3 if GW2 missing
            cost_interpolated = player_data[player_data['gw'] == 3]['cost'].values[0]
        else:
            # Standard interpolation for all other GWs
            prev_gw_data = player_data[player_data['gw'] == missing_gw - 1]
            next_gw_data = player_data[player_data['gw'] == missing_gw + 1]
            
            cost_prev = prev_gw_data['cost'].values[0]
            cost_next = next_gw_data['cost'].values[0]
            cost_interpolated = round((cost_prev + cost_next) / 2)
        
        # Create imputed row
        imputed_row = {
            'gw': missing_gw,
            'name': player,
            'position': position,
            'team': team,
            'cost': cost_interpolated,
            'points': 0,
            'eP': 0,
            'prob_showup': 0,
            'minutes': 0,
            'unavailable': 1
        }
        all_data.append(imputed_row)
        imputed_count += 1

print(f"Imputed {imputed_count} missing GWs")
print()

# Create final DataFrame
final_df = pd.DataFrame(all_data)

# Create player_id mapping
all_unique_players = sorted(final_df['name'].unique())
player_id_map = {name: idx for idx, name in enumerate(all_unique_players, start=1)}
final_df['player_id'] = final_df['name'].map(player_id_map)

# Reorder columns
column_order = ['player_id', 'gw', 'name', 'position', 'team', 'cost', 
                'points', 'eP', 'prob_showup', 'minutes', 'unavailable']
final_df = final_df[column_order]

# Sort by gw, then player_id
final_df = final_df.sort_values(['gw', 'player_id']).reset_index(drop=True)

print("=" * 60)
print("FINAL DATASET STATISTICS")
print("=" * 60)
print(f"Total rows: {len(final_df)}")
print(f"Unique players: {final_df['player_id'].nunique()}")
print(f"Rows per GW: {len(final_df) // 38}")
print(f"Unavailable=1 rows (imputed): {(final_df['unavailable'] == 1).sum()}")
print()

# Verify consistency
gw_counts = final_df.groupby('gw').size()
print(f"All GWs have same player count: {gw_counts.nunique() == 1}")
print()

# Save to CSV
final_df.to_csv('cleaned_data.csv', index=False)
print("✓ Data saved to cleaned_data.csv")
print(f"  Shape: {final_df.shape}")


STANDARDIZING PLAYER SET (≥36 GWs)
Players in ≥36 GWs: 638
Players excluded (<36 GWs): 166

IMPUTING MISSING GWs
Imputed 326 missing GWs

FINAL DATASET STATISTICS
Total rows: 24244
Unique players: 638
Rows per GW: 638
Unavailable=1 rows (imputed): 326

All GWs have same player count: True

✓ Data saved to cleaned_data.csv
  Shape: (24244, 11)
