In [91]:
import pickle
import pandas as pd
# Load the ball-by-ball dataset into a DataFrame.
# NOTE(review): read_pickle unpickles arbitrary Python objects, so only load
# 'Dataset.pkl' when it comes from a trusted source.
df=pd.read_pickle('Dataset.pkl')

In [92]:
# Preview the loaded frame (the notebook display truncates it to the first and last rows).
df

Unnamed: 0,over,score,wickets,batting,bowling,inning,Last5oversRuns,Final_score
0,10.1,75,0,Australia,Sri Lanka,1,30,168
1,10.2,76,0,Australia,Sri Lanka,1,31,168
2,10.3,76,1,Australia,Sri Lanka,1,31,168
3,10.4,78,1,Australia,Sri Lanka,1,32,168
4,10.5,78,1,Australia,Sri Lanka,1,31,168
...,...,...,...,...,...,...,...,...
103180,17.1,121,6,Australia,Sri Lanka,2,18,130
103181,17.2,124,6,Australia,Sri Lanka,2,21,130
103182,17.3,124,6,Australia,Sri Lanka,2,21,130
103183,17.4,124,6,Australia,Sri Lanka,2,21,130


In [93]:
# Summarise column dtypes and non-null counts to confirm there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103185 entries, 0 to 103184
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   over            103185 non-null  float64
 1   score           103185 non-null  int64  
 2   wickets         103185 non-null  int64  
 3   batting         103185 non-null  object 
 4   bowling         103185 non-null  object 
 5   inning          103185 non-null  int64  
 6   Last5oversRuns  103185 non-null  int64  
 7   Final_score     103185 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 6.3+ MB


#### Train test split

In [101]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictor columns.
y = df['Final_score']
x = df.drop(columns=['Final_score'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [102]:
# Inspect the training target (Final_score values for the rows selected into the training split).
y_train

59457    141
14439    180
51742    134
16062    139
73939    158
        ... 
50057    165
98047    127
5192     113
77708    168
98539    171
Name: Final_score, Length: 82548, dtype: int64

In [125]:
# Distinct bowling teams, ordered by how often each one appears (value_counts sorts descending).
bowling_counts = df['bowling'].value_counts()
classes_list = list(bowling_counts.index)
classes_list

['Pakistan',
 'India',
 'Australia',
 'New Zealand',
 'England',
 'West Indies',
 'Sri Lanka',
 'South Africa',
 'Bangladesh',
 'Ireland',
 'Afghanistan',
 'Zimbabwe',
 'Netherlands',
 'Scotland',
 'United Arab Emirates',
 'Hong Kong',
 'Oman',
 'Nepal',
 'Kenya',
 'Canada',
 'Malaysia',
 'Papua New Guinea',
 'Namibia',
 'Singapore',
 'Bermuda',
 'Jersey',
 'Vanuatu',
 'United States of America',
 'Germany',
 'Spain',
 'Kuwait',
 'Botswana',
 'Qatar',
 'Maldives',
 'Denmark',
 'Thailand',
 'Nigeria',
 'Guernsey',
 'ICC World XI',
 'Italy',
 'Uganda',
 'Philippines',
 'Belgium',
 'Cayman Islands',
 'Norway',
 'Ghana',
 'Portugal',
 'Romania',
 'Bahrain',
 'Luxembourg',
 'Gibraltar',
 'Bhutan',
 'Bulgaria',
 'Czech Republic',
 'Saudi Arabia',
 'Iran',
 'Isle of Man']

#### The object-type columns (`batting`, `bowling`) must be encoded numerically. One-hot encoding is used here because the team names have no inherent order.

In [104]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error

In [105]:
# One-hot encode the two team columns and pass every other column through unchanged.
# drop='first' removes one indicator per column to avoid redundant features.
# handle_unknown='ignore' keeps transform() from raising when a team that was not
# present during fit shows up later (e.g. a rare team that only landed in the test
# split, or a new team at prediction time); such a team is encoded as all zeros,
# i.e. the same encoding as the dropped first category.
trf = ColumnTransformer(
    [
        (
            'trf',
            OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
            ['batting', 'bowling'],
        )
    ],
    remainder='passthrough',
)


### We will build two pipelines: one that uses a regression model (XGBoost) and another that uses a neural network (NN) model

#### Regression model

In [106]:
# Gradient-boosted tree regressor used as the final estimator of the first pipeline.
xgb_regressor = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.2,
    max_depth=12,
    random_state=1,
)

# Encode the team columns, standardise all features, then fit the regressor.
pipe = Pipeline(
    steps=[
        ('step1', trf),
        ('step2', StandardScaler()),
        ('step3', xgb_regressor),
    ]
)

In [107]:
# Train the XGBoost pipeline and score it on the held-out split.
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

# Report R^2 first, then mean absolute error.
xgb_r2 = r2_score(y_test, y_pred)
xgb_mae = mean_absolute_error(y_test, y_pred)
print(xgb_r2)
print(xgb_mae)

0.9721651140701852
3.2855147001164164


#### Neural network model

In [108]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error


In [115]:
def create_model(input_dim):
    """Build and compile a small fully connected network for regression.

    Parameters
    ----------
    input_dim : int
        Number of input features expected by the first Dense layer.

    Returns
    -------
    Sequential
        A compiled model with two hidden ReLU layers of 10 units each and a
        single linear output unit, trained with Adam on mean squared error.
    """
    model = Sequential()
    model.add(Dense(10, input_dim=input_dim, activation='relu'))
    model.add(Dense(10, activation='relu'))
    # Linear output: the target is a continuous value.
    model.add(Dense(1, activation='linear'))
    # 'accuracy' is a classification metric and carries no meaning for a
    # continuous target, so mean absolute error is tracked instead.
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [116]:
# Number of features the network receives = width of the encoded training matrix.
# Derived from the transformer instead of being hard-coded, so it stays correct
# if the split, the dataset, or the encoder configuration changes.
input_dim = trf.fit_transform(x_train).shape[1]

# scikit-learn compatible wrapper so the Keras model can be the last pipeline step.
neural_network_model = KerasRegressor(
    build_fn=create_model,
    input_dim=input_dim,
    epochs=50,
    batch_size=32,
    verbose=0,
)

# Same preprocessing as the XGBoost pipeline, with the neural network as estimator.
pipe2 = Pipeline(steps=[
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', neural_network_model)
])

  neural_network_model = KerasRegressor(build_fn=create_model, input_dim=input_dim, epochs=50, batch_size=32, verbose=0)


In [117]:
# Check the shape of the encoded training matrix produced by the first pipeline step.
encoder_step = pipe2.named_steps['step1']
t = encoder_step.transform(x_train)
t.shape

(82548, 117)

In [119]:
# Train the neural-network pipeline on the training split.
# NOTE(review): 'step1' is the same `trf` instance that `pipe` holds, so this call
# fits that shared transformer again (on the same training data).
pipe2.fit(x_train,y_train)

In [120]:
# Score the neural-network pipeline on the held-out split: R^2, then mean absolute error.
y_pred = pipe2.predict(x_test)
for metric_fn in (r2_score, mean_absolute_error):
    print(metric_fn(y_test, y_pred))

0.8504197252083583
9.32971801911236


#### Even a very simple (small, shallow) NN model takes more than a minute to train and performs noticeably worse than XGBoost (R² ≈ 0.85 vs. ≈ 0.97, MAE ≈ 9.3 vs. ≈ 3.3), so the XGBRegressor pipeline is the one we keep.

In [122]:
# Persist the fitted XGBoost pipeline. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)