# MyModel

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
def get_initial_training_dataframes(training_data):
    ball_by_ball, matches_result = training_data
    
    ball_by_ball = ball_by_ball.rename(columns={
        'ID': 'match_id',
        'ballnumber': 'ball_number',
        'non-striker': 'non_striker',
        'BattingTeam': 'batting_team',
    }).loc[:, [
        'match_id',
        'innings',
        'batting_team',
        'overs',
        'ball_number',
        'batter',
        'bowler',
        'total_run',
    ]]
    
    matches_result = matches_result.rename(columns={
        'ID': 'match_id',
        'Team1': 'team_1',
        'Team2': 'team_2',
        'Venue': 'venue',
    }).loc[:, [
        'match_id',
        'team_1',
        'team_2',
        'venue',
    ]]

    return ball_by_ball, matches_result

In [3]:
venue_mapping = {
 'Arun Jaitley Stadium, Delhi': 'Arun Jaitley Stadium',
 'Arun Jaitley Stadium': 'Arun Jaitley Stadium',
 'Brabourne Stadium, Mumbai': 'Brabourne Stadium',
 'Brabourne Stadium': 'Brabourne Stadium',
 'Dr DY Patil Sports Academy, Mumbai': 'Dr DY Patil Sports Academy',
 'Dr DY Patil Sports Academy': 'Dr DY Patil Sports Academy',
 'Eden Gardens, Kolkata': 'Eden Gardens',
 'Eden Gardens': 'Eden Gardens',
 'M Chinnaswamy Stadium': 'M.Chinnaswamy Stadium',
 'M.Chinnaswamy Stadium': 'M.Chinnaswamy Stadium',
 'Maharashtra Cricket Association Stadium, Pune': 'Maharashtra Cricket Association Stadium',
 'Maharashtra Cricket Association Stadium': 'Maharashtra Cricket Association Stadium',
 'Narendra Modi Stadium, Ahmedabad': 'Narendra Modi Stadium',
 'Narendra Modi Stadium': 'Narendra Modi Stadium',
 'Rajiv Gandhi International Stadium, Uppal': 'Rajiv Gandhi International Stadium',
 'Rajiv Gandhi International Stadium': 'Rajiv Gandhi International Stadium',
 'Wankhede Stadium, Mumbai': 'Wankhede Stadium',
 'Wankhede Stadium': 'Wankhede Stadium',
 'Himachal Pradesh Cricket Association Stadium': 'Himachal Pradesh Cricket Association Stadium',
 'Sawai Mansingh Stadium': 'Sawai Mansingh Stadium',
 'MA Chidambaram Stadium, Chepauk': 'MA Chidambaram Stadium',
 'MA Chidambaram Stadium, Chepauk, Chennai': 'MA Chidambaram Stadium',
 'MA Chidambaram Stadium': 'MA Chidambaram Stadium',
 'Punjab Cricket Association IS Bindra Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
 'Punjab Cricket Association Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
 'Punjab Cricket Association IS Bindra Stadium': 'Punjab Cricket Association IS Bindra Stadium'
}

venue_mapping.update({
    'Wankhede Stadium , Mumbai': 'Wankhede Stadium',
    'Rajiv Gandhi International Stadium, Hyderabad': 'Rajiv Gandhi International Stadium',
    'Sawai Mansingh Stadium, Jaipur': 'Sawai Mansingh Stadium',
    'Punjab Cricket Association IS Bindra Stadium,Chandigarh': 'Punjab Cricket Association IS Bindra Stadium',
    'M Chinnaswamy Stadium, Bangalore': 'M.Chinnaswamy Stadium',
    'M.Chinnaswamy Stadium, Bangalore': 'M.Chinnaswamy Stadium',
    'M Chinnaswamy Stadium, Bengaluru': 'M.Chinnaswamy Stadium',
    'MA Chidambaram Stadium, Chennai': 'MA Chidambaram Stadium',
    ' Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow': 
})

In [4]:
team_mapping = {
 'Rajasthan Royals': 'Rajasthan Royals',
 'Gujarat Titans': 'Gujarat Titans',
 'Royal Challengers Bangalore': 'Royal Challengers Bangalore',
 'Lucknow Super Giants': 'Lucknow Super Giants',
 'Sunrisers Hyderabad': 'Sunrisers Hyderabad',
 'Mumbai Indians': 'Mumbai Indians',
 'Chennai Super Kings': 'Chennai Super Kings',
 'Kolkata Knight Riders': 'Kolkata Knight Riders',
 'Kings XI Punjab': 'Punjab Kings',
 'Punjab Kings': 'Punjab Kings',
 'Delhi Daredevils': 'Delhi Capitals',
 'Delhi Capitals': 'Delhi Capitals'
}

In [5]:
def do_venue_mapping(df):
    df.venue = df.venue.map(venue_mapping)
    return df

In [6]:
def do_team_mapping(ball_by_ball, matches_result):
    matches_result.team_1 = matches_result.team_1.map(team_mapping)
    matches_result.team_2 = matches_result.team_2.map(team_mapping)
    ball_by_ball.batting_team = ball_by_ball.batting_team.map(team_mapping)
    return ball_by_ball, matches_result

In [7]:
def remove_unnecessary_rows(ball_by_ball, matches_result):
    ball_by_ball = ball_by_ball.dropna(subset=['batting_team'])
    matches_result = matches_result.dropna(subset=['team_1', 'team_2', 'venue'])
    return ball_by_ball, matches_result

In [8]:
def select_innings_and_overs(ball_by_ball):
    ball_by_ball = ball_by_ball.loc[(ball_by_ball.overs <= 5) & (ball_by_ball.innings <= 2)]
    ball_by_ball.innings = ball_by_ball.innings.replace({1: 0, 2: 1})
    return ball_by_ball

In [9]:
def get_final_training_dataframe(ball_by_ball, matches_result):
    gb = ball_by_ball.groupby(['match_id', 'innings', 'batting_team'])
    
    total_runs = gb['total_run'].sum()
    batsmen = gb['batter'].unique()
    bowlers = gb['bowler'].unique()
    
    total_runs = total_runs.to_frame(name = 'total_runs').reset_index()
    batsmen = batsmen.to_frame(name = 'batsmen').reset_index()
    bowlers = bowlers.to_frame(name = 'bowlers').reset_index()
    
    data = total_runs.merge(batsmen, how='right', on=['match_id','innings','batting_team'])
    data = data.merge(bowlers, how='right', on=['match_id','innings','batting_team'])
    data = data.merge(matches_result, on=['match_id'])
    
    mask = data['batting_team'] == data['team_1']
    data.loc[mask, 'bowling_team'] = data['team_2']
    data.loc[~mask, 'bowling_team'] = data['team_1']
    
    # match_id == 829763, data for one innings is missing
    # match_id == 829813, total_runs for one innings is 2 (probably a mistake in data entry)
    data = data.drop(data[(data['match_id'] == 829763) | (data['match_id'] == 829813)].index)

    data['count_batsmen'] = [len(x) for x in data['batsmen']]
    data['count_bowlers'] = [len(x) for x in data['bowlers']]

    data = data.drop(columns=['match_id', 'batsmen', 'bowlers', 'team_1', 'team_2'])
    data = data[['venue', 'innings', 'batting_team', 'bowling_team', 'count_batsmen', 'count_bowlers', 'total_runs']]
    
    return data

In [10]:
def preprocess(training_data):
    ball_by_ball, matches_result = get_initial_training_dataframes(training_data)
    matches_result = do_venue_mapping(matches_result)
    ball_by_ball, matches_result = do_team_mapping(ball_by_ball, matches_result)
    ball_by_ball, matches_result = remove_unnecessary_rows(ball_by_ball, matches_result)
    ball_by_ball = select_innings_and_overs(ball_by_ball)
    data = get_final_training_dataframe(ball_by_ball, matches_result)
    return data

In [11]:
class MyModel:
    def __init__(self):
        pass

In [12]:
def get_trained_model(X_train, y_train):
    from sklearn.linear_model import LinearRegression
    return LinearRegression().fit(X_train, y_train)

In [13]:
def MyModel_fit(self, training_data):
    data = preprocess(training_data)
    
    X = data.iloc[:, :-1]
    y = data["total_runs"]

    self.ct = ColumnTransformer(transformers = [
        ('ohe', OneHotEncoder(categories = "auto", drop='first', sparse_output=False), ['venue', 'batting_team', 'bowling_team'])
    ], remainder = 'passthrough')

    self.scaler = StandardScaler()

    X_ohe = pd.DataFrame(self.ct.fit_transform(X))
    X_std = self.scaler.fit_transform(X_ohe)
    
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.2)
    self.model = get_trained_model(X_train, y_train)
    
    self.debug = {
        "data": data, 
        "X_ohe": X_ohe, 
        "X_std": X_std, 
        "X_train": X_train, 
        "X_test": X_test, 
        "y_train": y_train, 
        "y_test": y_test
    }

In [14]:
def MyModel_predict(self, test_data, dev=False):
    if (dev == False):
        test_data = test_data.iloc[:, 1:]
        test_data['count_batsmen'] = [len(x) for x in test_data['batsmen']]
        test_data['count_bowlers'] = [len(x) for x in test_data['bowlers']]
        test_data = do_venue_mapping(test_data)

    test_data_ohe = self.ct.transform(test_data)
    test_data_std = self.scaler.transform(test_data_ohe)
    return self.model.predict(test_data_std)

In [15]:
MyModel.fit = MyModel_fit
MyModel.predict = MyModel_predict

# Main.py

In [16]:
ball_by_ball = pd.read_csv('./Data/IPL_Ball_by_Ball_2008_2022.csv')
matches_result = pd.read_csv('./Data/IPL_Matches_Result_2008_2022.csv')

In [17]:
a_model = MyModel()

In [18]:
a_model.fit([ball_by_ball, matches_result])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ball_by_ball.innings = ball_by_ball.innings.replace({1: 0, 2: 1})


In [19]:
from sklearn.metrics import mean_absolute_error
sample = a_model.debug['data'].sample(frac=.5)
X = sample.iloc[:, 0:-1]
y = sample.iloc[:, -1]
y_pred = a_model.predict(X, dev=True)
mean_absolute_error(y, y_pred)

8.280540344552167

# FilesUsed

In [20]:
import os

In [21]:
files = os.listdir('./FilesUsed')

In [31]:
for file in files:
    if 'test_file_matchid' in file:
        X_file_name = './FilesUsed/' + file
        y_file_name = './FilesUsed/' + 'test_file_labels_matchid_' + file[-6:]
        
        print(f'X_file_name: {X_file_name}')
        
        X = pd.read_csv(X_file_name)
        y = pd.read_csv(y_file_name)['actual_runs']
        
        print(f"'{X['venue'][0]}'")
        print(f"{X['venue'][0] in venue_mapping}")
        
        y_pred = a_model.predict(X)
        print(*y)
        print(*y_pred)
        print(mean_absolute_error(y, y_pred), '\n')

X_file_name: ./FilesUsed/test_file_matchid_12.csv
'Wankhede Stadium , Mumbai'
True
61.0 68.0
155.7364196858824 202.8119920394721
114.77420586267725 

X_file_name: ./FilesUsed/test_file_matchid_13.csv
'Narendra Modi Stadium, Ahmedabad'
True
54.0 43.0
165.50119569745095 36.92874466147033
58.78622551799031 

X_file_name: ./FilesUsed/test_file_matchid_14.csv
'Rajiv Gandhi International Stadium, Hyderabad'
True
41.0 34.0
-28.67676118408346 180.74676569517428
108.21176343962887 

X_file_name: ./FilesUsed/test_file_matchid_15.csv
'M.Chinnaswamy Stadium, Bangalore'
True
56.0 37.0
170.30055647582594 24.884633908191685
63.20796128381713 

X_file_name: ./FilesUsed/test_file_matchid_16.csv
'Arun Jaitley Stadium, Delhi'
True
51.0 68.0
257.72018484872314 272.5678169811059
205.64400091491453 

X_file_name: ./FilesUsed/test_file_matchid_17.csv
'MA Chidambaram Stadium, Chennai'
True
57.0 45.0
107.88245080625184 180.36576003389644
93.12410542007413 

X_file_name: ./FilesUsed/test_file_matchid_18.csv
'Pu

ValueError: Found unknown categories [nan] in column 0 during transform