In [27]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error, root_mean_squared_error

In [13]:
# Load the pickled DataFrame from the processed-data directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files this project generated itself.
# The context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector gets to it.
with open('../data/processed-data/feature-extraction-2.pkl', 'rb') as feature_file:
    df: pd.DataFrame = pickle.load(feature_file)

In [14]:
# Separate the regression target (`final_score`) from the predictor columns.
features = df.drop(columns=['final_score'])
target = df['final_score']

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split — and therefore the metrics reported further down — repeatable
# across kernel restarts instead of changing on every run.
training_features, testing_features, training_target, testing_target = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [15]:
# One-hot encode the three categorical columns and pass every other column
# through untouched.
# handle_unknown='ignore' encodes a category that was not present at fit time
# as an all-zero row rather than raising, so a team/city missing from the
# random training split (or arriving at inference time) does not crash
# `predict`.
transformer = ColumnTransformer(
    [
        (
            'one_hot_encoding',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['batting_team', 'bowling_team', 'city'],
        )
    ],
    remainder='passthrough',
)

In [24]:
# Chain preprocessing and the regressor so the same column encoding is applied
# at fit time and at predict time.
# random_state pins the forest's bootstrap sampling and feature selection so
# refitting on the same data yields the same model.
# The step names are kept as-is because they are the keys exposed through
# `pipe.named_steps` on the saved object.
pipe = Pipeline(steps=[
    ('transform', transformer),
    ('model training', RandomForestRegressor(random_state=42)),
])

In [29]:
# Fit the pipeline on the training rows, then predict on the held-out rows.
# `fit` returns the pipeline itself, so the two calls can be chained.
prediction = pipe.fit(training_features, training_target).predict(testing_features)

In [28]:
# Report each regression metric on the held-out set, one line per metric,
# in the form "<label> -> <value>".
for metric_label, metric_fn in (
    ("r2 score", r2_score),
    ("mean absolute error", mean_absolute_error),
    ("median absolute error", median_absolute_error),
    ("root mean square error", root_mean_squared_error),
):
    print(f"{metric_label} -> {metric_fn(testing_target, prediction)}")

r2 score -> 0.953039143209624
mean absolute error -> 2.86556110295201
median absolute error -> 0.6899999999999977
root mean square error -> 7.137460134424673


In [23]:
# Persist the whole pipeline (encoder + regressor) so it can be reloaded for
# inference without repeating the preprocessing setup.
# The context manager guarantees the file is flushed and closed; the bare
# `open(...)` form left that to garbage collection.
# NOTE(review): the file name says "xgboost" but the pipeline in this notebook
# wraps a RandomForestRegressor. The path is left unchanged so existing
# consumers still find the file — consider renaming it together with them.
with open('../models/xgboost-model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)