## Importing useful packages and the dataset

In [170]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import joblib
X_raw=pd.read_csv('all_matches.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## Data Preprocessing

In [171]:
# Selecting useful features
X_train=pd.DataFrame()
X_train=X_raw.loc[:,['match_id','venue','innings','batting_team','bowling_team','ball','runs_off_bat','extras','striker','non_striker','bowler']]
X_train

# Kings XI Punjab to Punjab Kings and Delhi Daredevils to Delhi Capitals
mapping = {
    'Kings XI Punjab' : 'Punjab Kings',
    'Delhi Daredevils' : 'Delhi Capitals',
    'Kolkata Knight Riders' : 'Kolkata Knight Riders',
    "Delhi Capitals" : "Delhi Capitals",
    "Mumbai Indians" : "Mumbai Indians",
    "Chennai Super Kings":"Chennai Super Kings",
    "Rajasthan Royals":"Rajasthan Royals",
    "Royal Challengers Bangalore":"Royal Challengers Bangalore",
    "Sunrisers Hyderabad":"Sunrisers Hyderabad"
}
X_train.batting_team=X_train.batting_team.map(mapping)
X_train.bowling_team=X_train.bowling_team.map(mapping)

# Mapping venues
mapping2 = {
    "Wankhede Stadium":"Wankhede Stadium",
    "Wankhede Stadium, Mumbai":"Wankhede Stadium",
    "MA Chidambaram Stadium, Chepauk":"MA Chidambaram Stadium",
    "MA Chidambaram Stadium":"MA Chidambaram Stadium",
    "MA Chidambaram Stadium, Chepauk, Chennai":"MA Chidambaram Stadium",
    "M Chinnaswamy Stadium":"M.Chinnaswamy Stadium",
    "M.Chinnaswamy Stadium":"M.Chinnaswamy Stadium",
    "Eden Gardens":"Eden Gardens",
    "Feroz Shah Kotla":"Arun Jaitley Stadium",
    "Arun Jaitley Stadium":"Arun Jaitley Stadium",
    "Sardar Patel Stadium, Motera":"Narendra Modi Stadium"
}
X_train.venue=X_train.venue.map(mapping2)

# Adding new feature: Totalruns
X_train['Totalruns'] = X_train['runs_off_bat'] + X_train['extras']

# Selecting first six overs and removing superover innings
X_train1=X_train.loc[(X_train.ball < 6.1) & (X_train.innings.isin([1,2]))]

# Removing old teams and unwanted venues
teams=["Delhi Capitals","Kolkata Knight Riders","Mumbai Indians","Chennai Super Kings","Sunrisers Hyderabad","Rajasthan Royals","Punjab Kings","Royal Challengers Bangalore"]
venues=["Wankhede Stadium","MA Chidambaram Stadium","M.Chinnaswamy Stadium","Eden Gardens","Arun Jaitley Stadium","Narendra Modi Stadium"]
X_train1=X_train1.loc[(X_train1.batting_team.isin(teams))]
X_train1=X_train1.loc[(X_train1.bowling_team.isin(teams))]
X_train1=X_train1.loc[(X_train1.venue.isin(venues))]

# Grouping by matches,innings,etc and creating dataframe for storing totalruns, batsmen list and bowler list
df = X_train1.groupby(['match_id','innings','venue','batting_team','bowling_team'])

c1=df.Totalruns.sum()
c2=df["striker"].unique()
c3=df["bowler"].unique()

df1=c1.to_frame(name = 'Totalruns').reset_index()
df2=c2.to_frame(name = 'batsmen').reset_index()
df3=c3.to_frame(name = 'bowlers').reset_index()

dff=df1.merge(df2, how='right', on=['match_id','venue','innings','batting_team','bowling_team'])
data=dff.merge(df3, how='right', on=['match_id','venue','innings','batting_team','bowling_team'])

#Removing this row as its a mistake
index_names = data[ data['Totalruns'] == 2 ].index
data.drop(index_names, inplace = True)

# Removing match_id as it is just an index
del data["match_id"]
data.reset_index(drop=True,inplace=True)

# Shifting Totalruns to rightmost column and interchanging pos of innings and venue: required output as per convention
data = data[["venue", "innings", "batting_team","bowling_team","batsmen","bowlers","Totalruns"]]






#Additional Processing for this level
# Taking number of batsmen and no of bowlers as new columns
data["NumBatsmen"]=data["batsmen"].apply(lambda x: len(x))
data["NumBowlers"]=data["bowlers"].apply(lambda x: len(x))

del data["batsmen"]
del data["bowlers"]
data.reset_index(drop=True,inplace=True)

# Shifting Totalruns to rightmost column and interchanging pos of innings and venue: required output as per convention
data = data[["venue", "innings", "batting_team","bowling_team","NumBatsmen","NumBowlers","Totalruns"]]

# FINAL DATAFRAME
data

Unnamed: 0,venue,innings,batting_team,bowling_team,NumBatsmen,NumBowlers,Totalruns
0,M.Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,3,3,61
1,M.Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,6,3,26
2,Arun Jaitley Stadium,1,Rajasthan Royals,Delhi Capitals,4,3,40
3,Arun Jaitley Stadium,2,Delhi Capitals,Rajasthan Royals,3,3,55
4,Wankhede Stadium,1,Mumbai Indians,Royal Challengers Bangalore,5,3,47
...,...,...,...,...,...,...,...
663,Wankhede Stadium,2,Rajasthan Royals,Kolkata Knight Riders,4,4,50
664,Wankhede Stadium,1,Chennai Super Kings,Royal Challengers Bangalore,2,4,51
665,Wankhede Stadium,2,Royal Challengers Bangalore,Chennai Super Kings,4,3,65
666,MA Chidambaram Stadium,1,Delhi Capitals,Sunrisers Hyderabad,2,4,51


## Encoding of categorical inputs and feature scaling

In [172]:
X=data.iloc[:,:-1]
y=data["Totalruns"]

In [173]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(categories="auto"), [0,1,2,3])], remainder='passthrough')
Xenc = pd.DataFrame(ct.fit_transform(X).toarray())
sc = StandardScaler()
Xenc = sc.fit_transform(Xenc)

In [174]:
Xenc.shape

(668, 26)

## Train-test split

In [175]:
X = Xenc
y = y

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Helper function

In [176]:
def evaluate(X,y):
    y_pred = np.round(regressor.predict(X))
    np.set_printoptions(precision=2)
    rmse = np.sqrt(np.mean((y-y_pred)**2))
    print("RMSE: ",rmse)

## Linear Regression

In [177]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print("Train: ",end=" ")
evaluate(X_train,y_train)
print("Test: ",end=" ")
evaluate(X_test,y_test)

Train:  RMSE:  10.570877243298396
Test:  RMSE:  10.220698927527618


In [178]:
model_enc=[regressor,ct,sc]
joblib.dump(model_enc, 'model1.pkl')

['model1.pkl']

## Decision Tree

In [179]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
print("Train: ",end=" ")
evaluate(X_train,y_train)
print("Test: ",end=" ")
evaluate(X_test,y_test)

Train:  RMSE:  3.5439984612909976
Test:  RMSE:  13.726812198552452


In [180]:
model_enc=[regressor,ct,sc]
joblib.dump(model_enc, 'model2.pkl')

['model2.pkl']

## Random Forest Regression

In [181]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)
print("Train: ",end=" ")
evaluate(X_train,y_train)
print("Test: ",end=" ")
evaluate(X_test,y_test)

Train:  RMSE:  5.362513804841542
Test:  RMSE:  11.517993710038118


In [182]:
model_enc=[regressor,ct,sc]
joblib.dump(model_enc, 'model3.pkl')

['model3.pkl']

## KNN

In [183]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor()
regressor.fit(X_train, y_train)
print("Train: ",end=" ")
evaluate(X_train,y_train)
print("Test: ",end=" ")
evaluate(X_test,y_test)

Train:  RMSE:  10.096352291199777
Test:  RMSE:  11.855472947376251


In [184]:
model_enc=[regressor,ct,sc]
joblib.dump(model_enc, 'model4.pkl')

['model4.pkl']

## SVR

In [189]:
from sklearn.svm import SVR
regressor = SVR()
regressor.fit(X_train, y_train)
print("Train: ",end=" ")
evaluate(X_train,y_train)
print("Test: ",end=" ")
evaluate(X_test,y_test)

Train:  RMSE:  10.910943549417565
Test:  RMSE:  10.73041429977315


In [190]:
model_enc=[regressor,ct,sc]
joblib.dump(model_enc, 'model5.pkl')

['model5.pkl']

## XGBoost

In [191]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Han\AppData\Local\Programs\Python\Python39\pythonw.exe -m pip install --upgrade pip' command.


In [192]:
import xgboost as xgb
regressor = xgb.XGBRegressor()
regressor.fit(X_train, y_train)
print("Train: ",end=" ")
evaluate(X_train,y_train)
print("Test: ",end=" ")
evaluate(X_test,y_test)

Train:  RMSE:  3.89684401258595
Test:  RMSE:  13.073557681356716


In [193]:
model_enc=[regressor,ct,sc]
joblib.dump(model_enc, 'model6.pkl')

['model6.pkl']