In [11]:
# Feature Engineering & Preprocessing (Week 3–4)

This notebook performs:
- Player-match level aggregation
- Feature engineering (form, venue, opponent, career stats)
- Label creation for next-match prediction
- Time-series aware train-test split
- Preprocessing pipeline saving


SyntaxError: invalid syntax (2082733551.py, line 3)

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


In [None]:
df = pd.read_csv("ipl_cleaned_data.csv")
df.head()


Unnamed: 0,city,outcome,overs,season,venue,inning,over,batter,bowler,non_striker,...,wicket_player_out,year,month,day,match_number,winner,team_A,team_B,decision,winteam
0,Chandigarh,"{'by': {'wickets': 6}, 'winner': 'Kings XI Pun...",20,2016.0,Mohali,Rising Pune Supergiants,0,AM Rahane,Sandeep Sharma,F du Plessis,...,playing,2016.0,4.0,17.0,10.0,1,1,12,bat,0
1,Chandigarh,"{'by': {'wickets': 6}, 'winner': 'Kings XI Pun...",20,2016.0,Mohali,Rising Pune Supergiants,0,AM Rahane,Sandeep Sharma,F du Plessis,...,playing,2016.0,4.0,17.0,10.0,1,1,12,bat,0
2,Chandigarh,"{'by': {'wickets': 6}, 'winner': 'Kings XI Pun...",20,2016.0,Mohali,Rising Pune Supergiants,0,AM Rahane,Sandeep Sharma,F du Plessis,...,playing,2016.0,4.0,17.0,10.0,1,1,12,bat,0
3,Chandigarh,"{'by': {'wickets': 6}, 'winner': 'Kings XI Pun...",20,2016.0,Mohali,Rising Pune Supergiants,0,AM Rahane,Sandeep Sharma,F du Plessis,...,playing,2016.0,4.0,17.0,10.0,1,1,12,bat,0
4,Chandigarh,"{'by': {'wickets': 6}, 'winner': 'Kings XI Pun...",20,2016.0,Mohali,Rising Pune Supergiants,0,AM Rahane,Sandeep Sharma,F du Plessis,...,playing,2016.0,4.0,17.0,10.0,1,1,12,bat,0


In [None]:
df['wickets'] = df['wicket_player_out'].notna().astype(int)


In [None]:
player_match = df.groupby(
    ['match_number', 'batter', 'team_A', 'team_B', 'venue']
).agg(
    runs_batter=('runs_batter', 'sum'),
    balls_faced=('over', 'count')
).reset_index()

bowling_stats = df.groupby(
    ['match_number', 'bowler', 'team_A', 'team_B', 'venue']
).agg(
    wickets=('wickets', 'sum'),
    runs_conceded=('runs_total', 'sum')
).reset_index()

player_match = pd.merge(
    player_match,
    bowling_stats,
    left_on=['match_number','batter','team_A','team_B','venue'],
    right_on=['match_number','bowler','team_A','team_B','venue'],
    how='outer'
)

player_match.fillna(0, inplace=True)
player_match.head()


Unnamed: 0,match_number,batter,team_A,team_B,venue,runs_batter,balls_faced,bowler,wickets,runs_conceded
0,0.0,A Badoni,8,6,Chennai,1.0,7.0,0,0.0,0.0
1,0.0,0,5,3,RSA,0.0,0.0,A Kumble,24.0,31.0
2,0.0,0,5,4,Mumbai,0.0,0.0,A Kumble,21.0,16.0
3,0.0,A Kumble,5,4,RSA,1.0,1.0,A Kumble,24.0,18.0
4,0.0,A Kumble,5,8,Mumbai,1.0,3.0,A Kumble,26.0,32.0


In [None]:
player_match['runs_at_venue'] = player_match.groupby(
    ['batter','venue']
)['runs_batter'].transform('mean')

player_match['wickets_at_venue'] = player_match.groupby(
    ['bowler','venue']
)['wickets'].transform('mean')

player_match['runs_vs_opponent'] = player_match.groupby(
    ['batter','team_B']
)['runs_batter'].transform('mean')

player_match['wickets_vs_opponent'] = player_match.groupby(
    ['bowler','team_B']
)['wickets'].transform('mean')

player_match.fillna(0, inplace=True)


In [None]:
player_match = player_match.sort_values(['batter','match_number'])

player_match['career_runs'] = (
    player_match.groupby('batter')['runs_batter'].cumsum()
    - player_match['runs_batter']
)

player_match['career_wickets'] = (
    player_match.groupby('bowler')['wickets'].cumsum()
    - player_match['wickets']
)

player_match['career_runs_conceded'] = (
    player_match.groupby('bowler')['runs_conceded'].cumsum()
    - player_match['runs_conceded']
)

player_match.fillna(0, inplace=True)


In [None]:
player_match['batting_form'] = (
    player_match.groupby('batter')['runs_batter']
    .rolling(3, min_periods=1).mean()
    .reset_index(level=0, drop=True)
)

player_match['bowling_form'] = (
    player_match.groupby('bowler')['wickets']
    .rolling(3, min_periods=1).mean()
    .reset_index(level=0, drop=True)
)


In [None]:
player_match['next_match_runs'] = (
    player_match.groupby('batter')['runs_batter'].shift(-1)
)

player_match['next_match_wickets'] = (
    player_match.groupby('bowler')['wickets'].shift(-1)
)

player_match.dropna(
    subset=['next_match_runs','next_match_wickets'],
    inplace=True
)


In [None]:
player_match.to_csv("../data/dataset.csv", index=False)


In [None]:
feature_cols = [
    'runs_batter','balls_faced','wickets','runs_conceded',
    'runs_at_venue','wickets_at_venue',
    'runs_vs_opponent','wickets_vs_opponent',
    'career_runs','career_wickets','career_runs_conceded',
    'batting_form','bowling_form'
]

X = player_match[feature_cols]
y = player_match['next_match_runs']

split = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), feature_cols)
])

preprocessor.fit(X_train)

joblib.dump(preprocessor, "../models/feature_pipeline.pkl")


['../models/feature_pipeline.pkl']