# Premier League Data Preprocessing

This notebook cleans and preprocesses the Premier League dataset for machine learning.

In [None]:
import pandas as pd
import os

In [None]:
def preprocess(infile='Premier_League.csv', outfile='premier_league_cleaned.csv'):
    """
    Clean and preprocess Premier League dataset.

    Steps: load the CSV, drop rows missing any core column, strip thousands
    separators from ``attendance``, coerce the core columns to numeric, drop
    rows whose values could not be parsed, and write the result to ``outfile``.

    Parameters:
    - infile: input CSV filename
    - outfile: output CSV filename

    Returns:
    - cleaned DataFrame
    """
    # Load raw data
    df = pd.read_csv(infile)
    print(f"Original dataset shape: {df.shape}")

    # Columns that must be present and numeric in every retained row
    numeric_cols = ['Goals Home', 'Away Goals', 'home_possessions',
                    'away_possessions', 'home_shots', 'away_shots']
    core_cols = ['Goals Home', 'Away Goals', 'attendance',
                 'home_possessions', 'away_possessions', 'home_shots', 'away_shots']

    print("Missing values before cleaning:")
    print(df[core_cols].isnull().sum())

    # Copy so the column assignments below act on an independent frame
    # rather than on a possible view of the raw data.
    df = df.dropna(subset=core_cols).copy()

    # Clean attendance: remove thousands separators and convert to numeric.
    # Cast to str first: when the file has no commas, read_csv already parses
    # the column as numbers and the .str accessor would raise AttributeError.
    attendance_text = df['attendance'].astype(str).str.replace(',', '', regex=False)
    df['attendance'] = pd.to_numeric(attendance_text, errors='coerce')

    # Ensure numeric types for stats; unparseable entries become NaN
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop rows again if coercion introduced NaNs
    df = df.dropna(subset=numeric_cols + ['attendance'])

    print(f"Cleaned dataset shape: {df.shape}")

    # Save cleaned CSV
    df.to_csv(outfile, index=False)
    print(f"Saved cleaned data to {outfile}")

    return df

In [None]:
# Run the preprocessing with the default filenames
# ('Premier_League.csv' -> 'premier_league_cleaned.csv') and keep the
# returned cleaned DataFrame for the inspection cells that follow.
clean_df = preprocess()

In [None]:
# Quick look at the cleaned data: first rows, then overall shape and column names
print("Preview of cleaned data:", clean_df.head(), sep="\n")
print(
    "\nDataset info:",
    f"Shape: {clean_df.shape}",
    f"Columns: {clean_df.columns.tolist()}",
    sep="\n",
)

In [None]:
# Report each column's dtype, then the summary statistics from describe()
print("Data types:", clean_df.dtypes, sep="\n")
print("\nBasic statistics:", clean_df.describe(), sep="\n")

In [None]:
# Final verification: per-column count of missing values still present
print("Missing values in cleaned dataset:")
print(clean_df.isnull().sum())

# Use pandas' dtype predicate instead of matching two dtype names, so any
# numeric dtype (e.g. int32 or a nullable integer) is reported correctly.
print(f"\nAll attendance values are numeric: {pd.api.types.is_numeric_dtype(clean_df['attendance'])}")