In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix, roc_auc_score,roc_curve, auc, precision_recall_curve
import pandas as pd

#Last three overs
# df = pd.read_pickle('dataset_lvl3_3.pkl')
    # Mean Absolute Error: 3.3719844032441086
    # MSE: 46.381124990387306
    # RMSE: 6.810368932032046
    # r squared: 0.9589562132275067

# Last four overs
# df = pd.read_pickle('dataset_lvl3_4.pkl')
    # Mean Absolute Error: 3.0909708904172466
    # MSE: 34.917820941126244
    # RMSE: 5.909130303278668
    # r squared: 0.9691003700844963

# Last five overs
df = pd.read_pickle('dataset_lvl3_5.pkl')
    # Mean Absolute Error: 3.018922742290367
    # MSE: 34.812845168106065
    # RMSE: 5.900241111014537
    # r squared: 0.9691932657019486

# Last 6 overs
# df = pd.read_pickle('dataset_lvl3_6.pkl')
    # Mean Absolute Error: 3.0687805082394335
    # MSE: 35.09432002639376
    # RMSE: 5.924045916972096
    # r squared: 0.9689441817466161

# Last 7 overs
# df = pd.read_pickle('dataset_lvl3_7.pkl')
    # Mean Absolute Error: 2.977061690177932
    # MSE: 36.384725174260666
    # RMSE: 6.031975229911066
    # r squared: 0.9678022708130165



In [13]:
data_columns = ['batting_team', 'bowling_team', 'current_score', 'wickets_remaining', 'city', 'balls_left', 'last_five']
target_columns = ['final_score']

X = df[data_columns]
y = df[target_columns]

X_encoded = pd.get_dummies(X, columns=['batting_team', 'bowling_team', 'city'])

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [14]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1200,subsample=1.0, learning_rate=0.46, max_depth=11,random_state=42)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"r squared: {r2}")


Mean Absolute Error: 3.018922742290367
MSE: 34.812845168106065
RMSE: 5.900241111014537
r squared: 0.9691932657019486


In [10]:
param_grid = {
    'max_depth': [11],
    'learning_rate': [0.46],
    'n_estimators': [1200, 1400, 1600],
    'subsample': [1.0]
}

xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_



Best parameters: {'learning_rate': 0.46, 'max_depth': 11, 'n_estimators': 1600, 'subsample': 1.0}


In [16]:
df[155:160]

Unnamed: 0,batting_team,bowling_team,current_score,wickets_remaining,city,balls_left,last_five,final_score
155,West Indies,India,35,5,Lauderhill,76,28.0,95
156,West Indies,India,36,5,Lauderhill,75,28.0,95
157,West Indies,India,37,5,Lauderhill,74,28.0,95
158,West Indies,India,37,5,Lauderhill,73,29.0,95
159,West Indies,India,37,5,Lauderhill,72,30.0,95


In [18]:
# test specific cases:
new_data = [
    {
        'batting_team': 'West Indies', 
        'bowling_team': 'India', 
        'current_score': 4, 
        'wickets_remaining': 5, 
        'city': 'Lauderhill', 
        'balls_left': 76, 
        'last_five': 28
    },
    # Add more dictionaries for additional test cases
]

df_new = pd.DataFrame(new_data)

df_new_encoded = pd.get_dummies(df_new, columns=['batting_team', 'bowling_team', 'city'])

# Make sure all columns in X_test are present in df_new_encoded, filling missing ones with 0
missing_cols = set(X_train.columns) - set(df_new_encoded.columns)
for c in missing_cols:
    df_new_encoded[c] = 0
df_new_encoded = df_new_encoded[X_train.columns]

# Load your trained model
# Assuming xgb_model is your trained XGBoost model

# Predict the final score
final_score_predictions = xgb_model.predict(df_new_encoded)

# Print the predictions
print("Predicted final scores:", final_score_predictions)




Predicted final scores: [104.65185]


  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0
  df_new_encoded[c] = 0


In [59]:
from joblib import dump

dump(xgb_model, 'xgb_model.joblib')

['xgb_model.joblib']

In [20]:

import pickle
import pandas as pd

data = pickle.load(open("dataset_lvl3_5.pkl.pkl", "rb"))

data.to_csv('data_set_lvl3_5.csv', index=False)


FileNotFoundError: [Errno 2] No such file or directory: 'data_set_lvl3_5.pkl'