# MyModel

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
def to_kebab_case(string):
    return '-'.join(
        string.replace(",", "").replace(".", "").split()
    ).lower()

In [3]:
np.random.seed(2)

In [4]:
def prepare_input_dataframes(ball_by_ball, matches_result):
    ball_by_ball = ball_by_ball.rename(columns={
        'ID': 'match_id',
        'ballnumber': 'ball_number',
        'non-striker': 'non_striker',
        'BattingTeam': 'batting_team',
    }).loc[:, [
        'match_id',
        'innings',
        'batting_team',
        'overs',
        'ball_number',
        'batter',
        'bowler',
        'total_run',
    ]]
    
    matches_result = matches_result.rename(columns={
        'ID': 'match_id',
        'Team1': 'team_1',
        'Team2': 'team_2',
        'Venue': 'venue',
    }).loc[:, [
        'match_id',
        'team_1',
        'team_2',
        'venue',
    ]]

    return ball_by_ball, matches_result

In [5]:
venue_mapping_normal = {
  "Arun Jaitley Stadium": "Arun Jaitley Stadium",
  "Arun Jaitley Stadium, Delhi": "Arun Jaitley Stadium",
  "Feroz Shah Kotla": "Arun Jaitley Stadium",
  "Barsapara Cricket Stadium": "Barsapara Cricket Stadium",
  "Barsapara Cricket Stadium, Guwahati": "Barsapara Cricket Stadium",
  "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "Eden Gardens": "Eden Gardens",
  "Eden Gardens, Kolkata": "Eden Gardens",
  "Himachal Pradesh Cricket Association Stadium": "Himachal Pradesh Cricket Association Stadium",
  "Himachal Pradesh Cricket Association Stadium, Dharamsala": "Himachal Pradesh Cricket Association Stadium",
  "M Chinnaswamy Stadium": "M Chinnaswamy Stadium",
  "M Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",
  "M Chinnaswamy Stadium, Bangalore": "M Chinnaswamy Stadium",
  "M.Chinnaswamy Stadium": "M Chinnaswamy Stadium",
  "M.Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",
  "M.Chinnaswamy Stadium, Bangalore": "M Chinnaswamy Stadium",
  "MA Chidambaram Stadium": "MA Chidambaram Stadium",
  "MA Chidambaram Stadium, Chennai": "MA Chidambaram Stadium",
  "MA Chidambaram Stadium, Chepauk": "MA Chidambaram Stadium",
  "MA Chidambaram Stadium, Chepauk, Chennai": "MA Chidambaram Stadium",
  "Narendra Modi Stadium": "Narendra Modi Stadium",
  "Narendra Modi Stadium, Ahmedabad": "Narendra Modi Stadium",
  "Punjab Cricket Association IS Bindra Stadium": "Punjab Cricket Association IS Bindra Stadium",
  "Punjab Cricket Association IS Bindra Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
  "Punjab Cricket Association Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
  "Rajiv Gandhi International Stadium": "Rajiv Gandhi International Stadium",
  "Rajiv Gandhi International Stadium, Hyderabad": "Rajiv Gandhi International Stadium",
  "Rajiv Gandhi International Stadium, Uppal": "Rajiv Gandhi International Stadium",
  "Sawai Mansingh Stadium": "Sawai Mansingh Stadium",
  "Sawai Mansingh Stadium, Jaipur": "Sawai Mansingh Stadium",
  "Wankhede Stadium": "Wankhede Stadium",
  "Wankhede Stadium, Mumbai": "Wankhede Stadium"
}

In [6]:
venue_mapping_kebab = {
  "arun-jaitley-stadium": "Arun Jaitley Stadium",
  "arun-jaitley-stadium-delhi": "Arun Jaitley Stadium",
  "feroz-shah-kotla": "Arun Jaitley Stadium",
  "barsapara-cricket-stadium": "Barsapara Cricket Stadium",
  "barsapara-cricket-stadium-guwahati": "Barsapara Cricket Stadium",
  "bharat-ratna-shri-atal-bihari-vajpayee-ekana-cricket-stadium": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "bharat-ratna-shri-atal-bihari-vajpayee-ekana-cricket-stadium-lucknow": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "eden-gardens": "Eden Gardens",
  "eden-gardens-kolkata": "Eden Gardens",
  "himachal-pradesh-cricket-association-stadium": "Himachal Pradesh Cricket Association Stadium",
  "himachal-pradesh-cricket-association-stadium-dharamsala": "Himachal Pradesh Cricket Association Stadium",
  "m-chinnaswamy-stadium": "M Chinnaswamy Stadium",
  "m-chinnaswamy-stadium-bengaluru": "M Chinnaswamy Stadium",
  "m-chinnaswamy-stadium-bangalore": "M Chinnaswamy Stadium",
  "mchinnaswamy-stadium": "M Chinnaswamy Stadium",
  "mchinnaswamy-stadium-bengaluru": "M Chinnaswamy Stadium",
  "mchinnaswamy-stadium-bangalore": "M Chinnaswamy Stadium",
  "ma-chidambaram-stadium": "MA Chidambaram Stadium",
  "ma-chidambaram-stadium-chennai": "MA Chidambaram Stadium",
  "ma-chidambaram-stadium-chepauk": "MA Chidambaram Stadium",
  "ma-chidambaram-stadium-chepauk-chennai": "MA Chidambaram Stadium",
  "narendra-modi-stadium": "Narendra Modi Stadium",
  "narendra-modi-stadium-ahmedabad": "Narendra Modi Stadium",
  "punjab-cricket-association-is-bindra-stadium": "Punjab Cricket Association IS Bindra Stadium",
  "punjab-cricket-association-is-bindra-stadium-mohali": "Punjab Cricket Association IS Bindra Stadium",
  "punjab-cricket-association-stadium-mohali": "Punjab Cricket Association IS Bindra Stadium",
  "rajiv-gandhi-international-stadium": "Rajiv Gandhi International Stadium",
  "rajiv-gandhi-international-stadium-hyderabad": "Rajiv Gandhi International Stadium",
  "rajiv-gandhi-international-stadium-uppal": "Rajiv Gandhi International Stadium",
  "sawai-mansingh-stadium": "Sawai Mansingh Stadium",
  "sawai-mansingh-stadium-jaipur": "Sawai Mansingh Stadium",
  "wankhede-stadium": "Wankhede Stadium",
  "wankhede-stadium-mumbai": "Wankhede Stadium"
}

In [7]:
venue_mapping_tags = {
  "delhi": "Arun Jaitley Stadium",
  "arun jaitley": "Arun Jaitley Stadium",
  "guwahati": "Barsapara Cricket Stadium",
  "barsapara": "Barsapara Cricket Stadium",
  "bhupen hazarika": "Barsapara Cricket Stadium",
  "assam cricket association stadium": "Barsapara Cricket Stadium",
  "lucknow": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "ekana": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "atal bihari": "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium",
  "kolkata": "Eden Gardens",
  "eden gardens": "Eden Gardens",
  "dharamsala": "Himachal Pradesh Cricket Association Stadium",
  "himachal pradesh": "Himachal Pradesh Cricket Association Stadium",
  "bengaluru": "M Chinnaswamy Stadium",
  "bengalore": "M Chinnaswamy Stadium",
  "chinnaswamy": "M Chinnaswamy Stadium",
  "chennai": "MA Chidambaram Stadium",
  "chepauk": "MA Chidambaram Stadium",
  "chidambaram": "MA Chidambaram Stadium",
  "ahmedabad": "Narendra Modi Stadium",
  "narendra modi": "Narendra Modi Stadium",
  "mohali": "Punjab Cricket Association IS Bindra Stadium",
  "punjab cricket association": "Punjab Cricket Association IS Bindra Stadium",
  "is bindra": "Punjab Cricket Association IS Bindra Stadium",
  "hyderabad": "Rajiv Gandhi International Stadium",
  "rajiv gandhi": "Rajiv Gandhi International Stadium",
  "jaipur": "Sawai Mansingh Stadium",
  "sawai mansingh": "Sawai Mansingh Stadium",
  "mumbai": "Wankhede Stadium",
  "wankhede": "Wankhede Stadium"
}

In [8]:
team_mapping = { # 10 teams
 'Rajasthan Royals': 'Rajasthan Royals',
 'Gujarat Titans': 'Gujarat Titans',
 'Royal Challengers Bangalore': 'Royal Challengers Bangalore',
 'Lucknow Super Giants': 'Lucknow Super Giants',
 'Sunrisers Hyderabad': 'Sunrisers Hyderabad',
 'Mumbai Indians': 'Mumbai Indians',
 'Chennai Super Kings': 'Chennai Super Kings',
 'Kolkata Knight Riders': 'Kolkata Knight Riders',
    
 'Kings XI Punjab': 'Punjab Kings',
 'Punjab Kings': 'Punjab Kings',
    
 'Delhi Daredevils': 'Delhi Capitals',
 'Delhi Capitals': 'Delhi Capitals',
}

In [9]:
def do_mapping(ball_by_ball, matches_result):
    matches_result.venue = matches_result.venue.map(venue_mapping_normal).fillna('Other')

    matches_result.team_1 = matches_result.team_1.map(team_mapping).fillna('Other')
    matches_result.team_2 = matches_result.team_2.map(team_mapping).fillna('Other')

    ball_by_ball.batting_team = ball_by_ball.batting_team.map(team_mapping).fillna('Other')
    return ball_by_ball, matches_result

In [10]:
def select_innings_and_overs(ball_by_ball):
    ball_by_ball = ball_by_ball.loc[(ball_by_ball.overs <= 5) & (ball_by_ball.innings <= 2)]
    ball_by_ball.innings = ball_by_ball.innings.replace({1: 0, 2: 1})
    return ball_by_ball

In [11]:
def prepare_final_training_dataframe(ball_by_ball, matches_result):
    ball_by_ball_gb = ball_by_ball.groupby(['match_id', 'innings', 'batting_team'])
    
    total_runs = ball_by_ball_gb['total_run'].sum()
    batsmen = ball_by_ball_gb['batter'].unique()
    bowlers = ball_by_ball_gb['bowler'].unique()

    total_runs = total_runs.to_frame(name = 'total_runs').reset_index()
    batsmen = batsmen.to_frame(name = 'batsmen').reset_index()
    bowlers = bowlers.to_frame(name = 'bowlers').reset_index()

    data = total_runs.merge(batsmen, how='right', on=['match_id','innings','batting_team'])
    data = data.merge(bowlers, how='right', on=['match_id','innings','batting_team'])
    data = data.merge(matches_result, on=['match_id'])

    mask = data['batting_team'] == data['team_1']
    data.loc[mask, 'bowling_team'] = data['team_2']
    data.loc[~mask, 'bowling_team'] = data['team_1']

    # match_id == 829763, data for one innings is missing
    # match_id == 829813, total_runs for one innings is 2 (probably a mistake in data entry)
    data = data.drop(data[(data['match_id'] == 829763) | (data['match_id'] == 829813)].index)

    # get count of batsmen & bowlers for each innings 
    data['count_batsmen'] = [len(x) for x in data['batsmen']]
    data['count_bowlers'] = [len(x) for x in data['bowlers']]

    data = data[
        ['venue', 'innings', 'batting_team', 'bowling_team', 'count_batsmen', 'count_bowlers', 'total_runs']
    ]
    
    return data

In [12]:
def prepare_training_data(input_dataframes):
    ball_by_ball, matches_result = input_dataframes
    ball_by_ball, matches_result = prepare_input_dataframes(ball_by_ball, matches_result)
    ball_by_ball, matches_result = do_mapping(ball_by_ball, matches_result)
    ball_by_ball = select_innings_and_overs(ball_by_ball)
    return prepare_final_training_dataframe(ball_by_ball, matches_result)

In [13]:
class MyModel:
    def __init__(self):
        pass

In [14]:
def train_model(X_train, y_train):
    from sklearn.linear_model import LinearRegression
    return LinearRegression().fit(X_train, y_train)

In [15]:
def MyModel__fit(self, input_dataframes):
    data = prepare_training_data(input_dataframes)
    
    X = data.iloc[:, :-1]
    y = data["total_runs"]

    self.preprocessor = ColumnTransformer([
        ("onehot", OneHotEncoder(sparse_output=False), ["venue", "batting_team", "bowling_team"]),
        ("scaler", StandardScaler(), ["count_batsmen", "count_bowlers"])
    ], remainder='passthrough')

    X_preprocessed = self.preprocessor.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size = 0.2)
    self.model = train_model(X_train, y_train)

In [16]:
def MyModel__predict(self, X_IPL23):
    X_IPL23.innings = X_IPL23.innings.replace({1: 0, 2: 1})

    # get count of batsmen & bowlers for each innings 
    X_IPL23['count_batsmen'] = [len(x.split(",")) for x in X_IPL23['batsmen']]
    X_IPL23['count_bowlers'] = [len(x.split(",")) for x in X_IPL23['bowlers']]
    X_IPL23 = X_IPL23.drop(columns=['batsmen', 'bowlers'])[
        ['venue', 'innings', 'batting_team', 'bowling_team', 'count_batsmen', 'count_bowlers']
    ]
    
    ambiguous_venues = np.setdiff1d(X_IPL23.venue.unique(), list(venue_mapping_normal.keys()))
    ambiguous_venues_mapping = {}
    for venue in ambiguous_venues:
        venue_kebab_case = to_kebab_case(venue)
        if venue_kebab_case in venue_mapping_kebab:
            ambiguous_venues_mapping[venue] = venue_mapping_kebab[venue_kebab_case]
        else:
            venue_lower = venue.lower()
            for tag in venue_mapping_tags:
                if tag in venue_lower: ambiguous_venues_mapping[venue] = venue_mapping_tags[tag]

    venue_mapping_final = {**venue_mapping_normal, **ambiguous_venues_mapping}
    
    X_IPL23.venue = X_IPL23.venue.map(venue_mapping_final).fillna('Other').replace({
    'Barsapara Cricket Stadium': 'Other',
    'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium': 'Other'
    })
    
    X_IPL23_preprocessed = self.preprocessor.transform(X_IPL23)
    
    return np.round(
        self.model.predict(X_IPL23_preprocessed)
    ).astype(int)

In [17]:
MyModel.fit = MyModel__fit
MyModel.predict = MyModel__predict

# Main.py

In [18]:
ball_by_ball = pd.read_csv('./Data/IPL_Ball_by_Ball_2008_2022.csv')
matches_result = pd.read_csv('./Data/IPL_Matches_Result_2008_2022.csv')

In [19]:
a_model = MyModel()

# FilesUsed

In [20]:
import os

In [21]:
def evaluate():
    files = os.listdir('./FilesUsed')
    total_error = 0
    for file in files:
        if 'test_file_matchid' in file:
            match_no = file[-6:-4]

            if int(match_no) < 20: continue

            X_file_name = './FilesUsed/' + file
            y_file_name = './FilesUsed/' + 'test_file_labels_matchid_' + match_no + '.csv'

            X = pd.read_csv(X_file_name).drop(columns=['Unnamed: 0'])
            y = pd.read_csv(y_file_name)['actual_runs']

            print(match_no, X_file_name, y_file_name)

            y_pred = a_model.predict(X)
            y_real = y.to_numpy().astype(int)

            error = np.abs(y_real - y_pred).sum()
            total_error += error

            print(y_real, y_pred, error, '\n')
            print(pd.DataFrame(list(zip(y_real, y_pred)), columns=['Actual', 'Predicted']).to_markdown())

    print('total_error:', total_error)
    # return total_error

In [22]:
a_model.fit([ball_by_ball, matches_result])
evaluate()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ball_by_ball.innings = ball_by_ball.innings.replace({1: 0, 2: 1})


20 ./FilesUsed/test_file_matchid_20.csv ./FilesUsed/test_file_labels_matchid_20.csv
[47 32] [52 44] 17 

|    |   Actual |   Predicted |
|---:|---------:|------------:|
|  0 |       47 |          52 |
|  1 |       32 |          44 |
21 ./FilesUsed/test_file_matchid_21.csv ./FilesUsed/test_file_labels_matchid_21.csv
[49 45] [52 44] 4 

|    |   Actual |   Predicted |
|---:|---------:|------------:|
|  0 |       49 |          52 |
|  1 |       45 |          44 |
22 ./FilesUsed/test_file_matchid_22.csv ./FilesUsed/test_file_labels_matchid_22.csv
[57 72] [41 49] 39 

|    |   Actual |   Predicted |
|---:|---------:|------------:|
|  0 |       57 |          41 |
|  1 |       72 |          49 |
23 ./FilesUsed/test_file_matchid_23.csv ./FilesUsed/test_file_labels_matchid_23.csv
[42 26] [44 39] 15 

|    |   Actual |   Predicted |
|---:|---------:|------------:|
|  0 |       42 |          44 |
|  1 |       26 |          39 |
24 ./FilesUsed/test_file_matchid_24.csv ./FilesUsed/test_file_labels_

In [23]:
# best_n = None
# least_total_error = float('inf')

# for n in np.random.randint(low=0, high=100, size=20):
# for n in range(100):
#     random_state = n
    
#     # Set the random seed for NumPy
#     np.random.seed(n)
    
#     a_model.fit([ball_by_ball, matches_result])
#     total_error = evaluate()
    
#     print(n, total_error)
    
#     if total_error < least_total_error:
#         least_total_error = total_error
#         best_n = n
        
# print('best_n, least_total_error:', best_n, least_total_error)