In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

matches = pd.read_csv("C:\\Users\\Bhavani\\Downloads\\matches.csv")
deliveries = pd.read_csv("C:\\Users\\Bhavani\\Downloads\\deliveries.csv")
matches.head(), deliveries.head()


(   match_id   season        city       date match_type player_of_match  \
 0    335982  2007/08   Bangalore  4/18/2008     League     BB McCullum   
 1    335983  2007/08  Chandigarh  4/19/2008     League      MEK Hussey   
 2    335984  2007/08       Delhi  4/19/2008     League     MF Maharoof   
 3    335985  2007/08      Mumbai  4/20/2008     League      MV Boucher   
 4    335986  2007/08     Kolkata  4/20/2008     League       DJ Hussey   
 
                                         venue                        team1  \
 0                       M Chinnaswamy Stadium  Royal Challengers Bangalore   
 1  Punjab Cricket Association Stadium, Mohali              Kings XI Punjab   
 2                            Feroz Shah Kotla             Delhi Daredevils   
 3                            Wankhede Stadium               Mumbai Indians   
 4                                Eden Gardens        Kolkata Knight Riders   
 
                          team2                  toss_winner toss_decisi

In [10]:
matches_selected = matches[[
    "match_id", "date", "team1", "team2", "toss_winner", "toss_decision",
    "winner", "result", "result_margin"
]]

deliveries_selected = deliveries[[
    "match_id", "inning", "batting_team", "bowling_team", "over", "ball",
    "total_runs", "is_wicket"
]]

merged_df = deliveries_selected.merge(matches_selected, on="match_id", how="left")

merged_df["cumulative_runs"] = merged_df.groupby(["match_id", "inning"])["total_runs"].cumsum()
merged_df["cumulative_wickets"] = merged_df.groupby(["match_id", "inning"])["is_wicket"].cumsum()
merged_df["overs_completed"] = merged_df["over"] + (merged_df["ball"] / 6)
merged_df["current_run_rate"] = merged_df["cumulative_runs"] / merged_df["overs_completed"].replace(0, 1)

merged_df['target_score'] = merged_df.groupby('match_id')['cumulative_runs'].transform('max') + 1

total_overs = 50
merged_df['remaining_overs'] = total_overs - merged_df['overs_completed']

merged_df['remaining_overs'] = merged_df['remaining_overs'].replace(0, 1)

merged_df['required_run_rate'] = (merged_df['target_score'] - merged_df['cumulative_runs']) / merged_df['remaining_overs']


final_df = merged_df.copy()

final_df = final_df.rename(columns={
    "team1": "home_team",
    "team2": "away_team"
})
team_mapping = {
    "Royal Challengers Bengaluru": "Royal Challengers Bangalore",
    "Rising Pune Supergiant": "Rising Pune Supergiants",
    "Delhi Daredevils": "Delhi Capitals",
    "Kings XI Punjab": "Punjab Kings"
}

final_df['winner'] = final_df['winner'].replace(team_mapping)
final_df['home_team'] = final_df['home_team'].replace(team_mapping)
final_df['away_team'] = final_df['away_team'].replace(team_mapping)

final_df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,total_runs,is_wicket,date,home_team,...,winner,result,result_margin,cumulative_runs,cumulative_wickets,overs_completed,current_run_rate,target_score,remaining_overs,required_run_rate
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,1,0,4/18/2008,Royal Challengers Bangalore,...,Kolkata Knight Riders,runs,140.0,1,0,0.166667,6.0,223,49.833333,4.454849
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,0,0,4/18/2008,Royal Challengers Bangalore,...,Kolkata Knight Riders,runs,140.0,1,0,0.333333,3.0,223,49.666667,4.469799
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,1,0,4/18/2008,Royal Challengers Bangalore,...,Kolkata Knight Riders,runs,140.0,2,0,0.5,4.0,223,49.5,4.464646
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,0,0,4/18/2008,Royal Challengers Bangalore,...,Kolkata Knight Riders,runs,140.0,2,0,0.666667,3.0,223,49.333333,4.47973
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,0,0,4/18/2008,Royal Challengers Bangalore,...,Kolkata Knight Riders,runs,140.0,2,0,0.833333,2.4,223,49.166667,4.494915


In [11]:
categorical_cols = [
    "home_team", "away_team", "toss_winner", "toss_decision", "winner",
    "result", "batting_team", "bowling_team"
]

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    final_df[col] = le.fit_transform(final_df[col])
    label_encoders[col] = le

numerical_cols = [
    "result_margin", "over", "ball", "total_runs", "is_wicket", 
    "cumulative_runs", "cumulative_wickets", "overs_completed", 
    "current_run_rate", "required_run_rate"
]

scaler = StandardScaler()
final_df[numerical_cols] = scaler.fit_transform(final_df[numerical_cols])
X = final_df.drop(columns=["winner"])
y = final_df["winner"]

final_df.to_csv("processed_dataset.csv", index=False)

In [12]:
final_df.info()
final_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260920 entries, 0 to 260919
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   match_id            260920 non-null  int64  
 1   inning              260920 non-null  int64  
 2   batting_team        260920 non-null  int32  
 3   bowling_team        260920 non-null  int32  
 4   over                260920 non-null  float64
 5   ball                260920 non-null  float64
 6   total_runs          260920 non-null  float64
 7   is_wicket           260920 non-null  float64
 8   date                260920 non-null  object 
 9   home_team           260920 non-null  int32  
 10  away_team           260920 non-null  int32  
 11  toss_winner         260920 non-null  int32  
 12  toss_decision       260920 non-null  int32  
 13  winner              260920 non-null  int32  
 14  result              260920 non-null  int32  
 15  result_margin       256796 non-nul

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
total_runs               0
is_wicket                0
date                     0
home_team                0
away_team                0
toss_winner              0
toss_decision            0
winner                   0
result                   0
result_margin         4124
cumulative_runs          0
cumulative_wickets       0
overs_completed          0
current_run_rate         0
target_score             0
remaining_overs          0
required_run_rate        0
dtype: int64

In [13]:
final_df["result_margin"].fillna(0, inplace=True)

final_df.to_csv("processed_dataset_cleaned.csv", index=False)

print("✅ Null values in 'result_margin' have been replaced with 0.")
print(final_df.isnull().sum())

✅ Null values in 'result_margin' have been replaced with 0.
match_id              0
inning                0
batting_team          0
bowling_team          0
over                  0
ball                  0
total_runs            0
is_wicket             0
date                  0
home_team             0
away_team             0
toss_winner           0
toss_decision         0
winner                0
result                0
result_margin         0
cumulative_runs       0
cumulative_wickets    0
overs_completed       0
current_run_rate      0
target_score          0
remaining_overs       0
required_run_rate     0
dtype: int64


In [14]:
duplicates = final_df.duplicated()
print("✅ Number of duplicate rows:", duplicates.sum())

✅ Number of duplicate rows: 0


In [15]:
print("✅ Dataset processed successfully!")
print("Features shape:", X.shape)
print("Target shape:", y.shape)
batting_team_mapping = dict(zip(label_encoders['batting_team'].classes_, label_encoders['batting_team'].transform(label_encoders['batting_team'].classes_)))
bowling_team_mapping = dict(zip(label_encoders['bowling_team'].classes_, label_encoders['bowling_team'].transform(label_encoders['bowling_team'].classes_)))
print("\nBatting Team Encoding:", batting_team_mapping)
print("Bowling Team Encoding:", bowling_team_mapping)

✅ Dataset processed successfully!
Features shape: (260920, 22)
Target shape: (260920,)

Batting Team Encoding: {'Chennai Super Kings': 0, 'Deccan Chargers': 1, 'Delhi Capitals': 2, 'Delhi Daredevils': 3, 'Gujarat Lions': 4, 'Gujarat Titans': 5, 'Kings XI Punjab': 6, 'Kochi Tuskers Kerala': 7, 'Kolkata Knight Riders': 8, 'Lucknow Super Giants': 9, 'Mumbai Indians': 10, 'Pune Warriors': 11, 'Punjab Kings': 12, 'Rajasthan Royals': 13, 'Rising Pune Supergiant': 14, 'Rising Pune Supergiants': 15, 'Royal Challengers Bangalore': 16, 'Royal Challengers Bengaluru': 17, 'Sunrisers Hyderabad': 18}
Bowling Team Encoding: {'Chennai Super Kings': 0, 'Deccan Chargers': 1, 'Delhi Capitals': 2, 'Delhi Daredevils': 3, 'Gujarat Lions': 4, 'Gujarat Titans': 5, 'Kings XI Punjab': 6, 'Kochi Tuskers Kerala': 7, 'Kolkata Knight Riders': 8, 'Lucknow Super Giants': 9, 'Mumbai Indians': 10, 'Pune Warriors': 11, 'Punjab Kings': 12, 'Rajasthan Royals': 13, 'Rising Pune Supergiant': 14, 'Rising Pune Supergiants': 1