# Preprocessing the IPL Deliveries Dataset
This notebook performs basic cleaning, renaming, and feature engineering on `deliveries.csv` so it’s ready for loading into MySQL.

In [1]:
# 1. Import libraries and load raw data
from pathlib import Path

import pandas as pd

# Read the raw deliveries CSV into a DataFrame and preview its first five rows
raw_csv_path = 'dataset/deliveries.csv'
deliveries_df = pd.read_csv(raw_csv_path)
deliveries_df.head(5)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


In [2]:
# 2. Rename columns for SQL readiness
# Mapping of raw CSV header -> column name used in the cleaned output.
column_renames = {
    'batter': 'batsman',
    'batsman_runs': 'runs_batsman',
    'extra_runs': 'runs_extra',
    'total_runs': 'runs_total',
    'extras_type': 'extra_type',
}
# Rebind to the renamed frame instead of mutating in place
deliveries_df = deliveries_df.rename(columns=column_renames)

# Verify new column names
print(list(deliveries_df.columns))

['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batsman', 'bowler', 'non_striker', 'runs_batsman', 'runs_extra', 'runs_total', 'extra_type', 'is_wicket', 'player_dismissed', 'dismissal_kind', 'fielder']


In [3]:
# 3. Fill missing values in text-based columns with "None"
# Missing entries in these four columns are replaced with the literal string
# "None"; every other column is left untouched.
# NOTE(review): pandas.read_csv treats the text "None" as missing by default -
# confirm that any consumer reading the cleaned CSV back expects that.
sentinel_cols = ['player_dismissed', 'dismissal_kind', 'fielder', 'extra_type']
deliveries_df = deliveries_df.fillna(dict.fromkeys(sentinel_cols, 'None'))

# Confirm no nulls remain in those columns
print(deliveries_df[sentinel_cols].isna().sum())

player_dismissed    0
dismissal_kind      0
fielder             0
extra_type          0
dtype: int64


In [4]:
# 4. Data type conversions: ensure numeric columns are integers
int_columns = [
    'match_id', 'inning', 'over', 'ball',
    'runs_batsman', 'runs_extra', 'runs_total', 'is_wicket',
]
# Cast all of them in one call. `int` resolves to the platform's default
# integer width (the recorded run below reports int32).
deliveries_df = deliveries_df.astype({name: int for name in int_columns})

# Confirm dtypes
print(deliveries_df[int_columns].dtypes)

match_id        int32
inning          int32
over            int32
ball            int32
runs_batsman    int32
runs_extra      int32
runs_total      int32
is_wicket       int32
dtype: object


In [5]:
# 5. Feature engineering: add 'ball_number' and 'is_extra'
# 'ball_number' = 1-based position of the delivery within its innings.
# `ball` counts every delivery of an over, wides included (in the preview the
# wide is ball 3 and the next delivery is ball 4), so an over can run past
# ball 6. The previous `over * 6 + ball` formula therefore assigned the same
# number to different deliveries (over 0 ball 7 and over 1 ball 1 are both 7).
# Numbering deliveries sequentially per (match_id, inning) keeps the value
# unique and strictly increasing through the innings.
innings_keys = ['match_id', 'inning']
deliveries_df['ball_number'] = (
    deliveries_df
    .sort_values(innings_keys + ['over', 'ball'])  # play order, independent of file order
    .groupby(innings_keys, sort=False)
    .cumcount()  # 0-based counter inside each innings
    + 1
)  # assignment aligns on the row index, so the original row order is kept

# 'is_extra' flag: 1 if runs_extra > 0, else 0
deliveries_df['is_extra'] = deliveries_df['runs_extra'].gt(0).astype(int)

# Show a few rows to confirm
print(deliveries_df[['over','ball','ball_number','runs_extra','is_extra']].head())

   over  ball  ball_number  runs_extra  is_extra
0     0     1            1           1         1
1     0     2            2           0         0
2     0     3            3           1         1
3     0     4            4           0         0
4     0     5            5           0         0


In [6]:
# 6. Reorder columns (bring primary/foreign keys first for clarity)
# The final layout is assembled from four groups, left to right.
delivery_keys = ['match_id', 'inning', 'over', 'ball', 'ball_number']
participants = ['batting_team', 'bowling_team', 'batsman', 'bowler', 'non_striker']
run_and_flag_cols = ['runs_batsman', 'runs_extra', 'runs_total', 'is_extra', 'is_wicket']
dismissal_and_extra = ['player_dismissed', 'dismissal_kind', 'fielder', 'extra_type']
cols_order = delivery_keys + participants + run_and_flag_cols + dismissal_and_extra

# Selecting with the full list reorders the frame and keeps exactly these columns
deliveries_df = deliveries_df.loc[:, cols_order]

# Verify final column order
deliveries_df.head()

Unnamed: 0,match_id,inning,over,ball,ball_number,batting_team,bowling_team,batsman,bowler,non_striker,runs_batsman,runs_extra,runs_total,is_extra,is_wicket,player_dismissed,dismissal_kind,fielder,extra_type
0,335982,1,0,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,BB McCullum,0,1,1,1,0,,,,legbyes
1,335982,1,0,2,2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,,,,
2,335982,1,0,3,3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,SC Ganguly,0,1,1,1,0,,,,wides
3,335982,1,0,4,4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,,,,
4,335982,1,0,5,5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,SC Ganguly,0,0,0,0,0,,,,


In [8]:
# 7. Save the cleaned DataFrame to a new CSV
cleaned_csv_path = Path('preprocessed_data/deliveries_cleaned.csv')
# to_csv does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails on this cell.
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame's row index out of the file
deliveries_df.to_csv(cleaned_csv_path, index=False)

# Display confirmation
print("deliveries_cleaned.csv has been written. Row count:", len(deliveries_df))

deliveries_cleaned.csv has been written. Row count: 260920


End of preprocessing steps for `deliveries.csv`. The cleaned file, `preprocessed_data/deliveries_cleaned.csv`, is now ready for import into MySQL.