In [10]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error
from xgboost import XGBRegressor

In [11]:
# Load the pickled DataFrame from the processed-data directory.
# NOTE(review): pickle.load can execute arbitrary code during deserialisation;
# only load files that were produced by a trusted step of this project.
# The context manager closes the file handle as soon as loading is done
# instead of leaving it open until the object happens to be garbage-collected.
with open('../data/processed-data/feature-extraction-2.pkl', 'rb') as feature_file:
    df: pd.DataFrame = pickle.load(feature_file)

In [18]:
# Separate the model inputs (every column except 'final_score') from the
# regression target ('final_score').
features = df.drop(columns=['final_score'])
target = df['final_score']

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the metrics reported further down, reproducible
# across kernel restarts; without it every run shuffles differently.
training_features, testing_features, training_target, testing_target = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [20]:
# One-hot encode the 'batting_team', 'bowling_team' and 'city' columns and pass
# every other column through unchanged (remainder='passthrough').
# drop='first' removes the first category of each encoded column.
# handle_unknown='ignore' stops transform() from raising when one of these
# columns contains a value that was absent from the rows the encoder was
# fitted on (possible after a random split, or when predicting on new data).
# Such a value is encoded as all zeros, which with drop='first' is the same
# encoding as the dropped first category.
transformer = ColumnTransformer(
    [(
        'one_hot_encoding',
        OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
        ['batting_team', 'bowling_team', 'city'],
    )],
    remainder='passthrough'
)

In [21]:
# Assemble the modelling pipeline as an ordered list of (name, estimator)
# stages: the column transformer, then standard scaling, then the XGBoost
# regressor. Step names are kept exactly as-is because they are the keys
# under which the stages are addressed on the pipeline object.
pipeline_steps = [
    ('step 1', transformer),
    ('step 2', StandardScaler()),
    (
        'step 3',
        XGBRegressor(
            n_estimators=1000,
            learning_rate=0.2,
            max_depth=12,
        ),
    ),
]
pipe = Pipeline(steps=pipeline_steps)

In [22]:
# Fit every pipeline stage on the training split, then predict the held-out
# split.
pipe.fit(training_features, training_target)
prediction = pipe.predict(testing_features)

# Print R^2 first and mean absolute error second, one value per line.
for metric in (r2_score, mean_absolute_error):
    print(metric(testing_target, prediction))

0.9615518183628492
2.4315957466229814


In [23]:
# Persist the fitted pipeline (all three stages) to the models directory.
# The context manager guarantees the file is flushed and closed once the dump
# completes, rather than relying on garbage collection of an anonymous handle,
# which could leave a truncated file behind.
with open('../models/xgboost-model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)