In [12]:
import os
import pickle
import time
from os.path import join as pjoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
import csv
import nodegam
from feature_utils import process

In [13]:
parameters = {
    'format':'OD',
    'anneal_steps' : 5000,
    'quantile_noise':1e-4,
    'n_quantiles':3000,
    'min_temp':0.1,
    'num_trees':300,
    'num_layers':8,
    'depth':6,
    'lr':1e-4,
    'lr_warmup_steps':1000,
    'batch_size':128,
    'lr_decay_steps' : 5000,
    'early_stopping_rounds' : 1000,
    'output_dropout':0.2,
    'last_dropout':0.3,
    'colsample_bytree':0.5,
    'selectors_detach':0,
    'ga2m':1,
    'l2_lambda':0.3,
    'nus_min':0.7,
    'nus_max':1.0,
}

In [14]:
# Only use GPU 0
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
name = 'OD_ga2m_2024.12.04_07:45'
# Create directory
os.makedirs(pjoin('logs', name), exist_ok=True)

csv_path = "./logs/parameters.csv"
if os.path.exists(csv_path):
    parameters_df = pd.read_csv(csv_path)
    columns_to_check = [col for col in parameters_df.columns if col in parameters]
    matching_rows = parameters_df[columns_to_check].eq(pd.Series(parameters)).all(axis=1)
    if matching_rows.any():
        print(f"Already trained a model on this parameter with name : {parameters_df['name']} and test_error : {parameters_df['test_error']}")
        exit(0)

In [15]:
nodegam.utils.seed_everything(seed=83)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

format = "OD"
train_data_path = os.path.join("..", "data", "processed", "train", f"15_{format}.csv")
test_data_path = os.path.join("..", "data", "processed", "test", f"15_{format}.csv")

test_df = pd.read_csv(test_data_path)
train_df = pd.read_csv(train_data_path)

y_train = train_df["fantasy_points"].values.squeeze()
y_test_id = test_df["match_id"]
y_test = test_df["fantasy_points"].values.squeeze()


X_train = process(train_df, 15, False)
X_test = process(test_df, 15, False)

data = {
    'X_train' : X_train,
    'y_train' : y_train,
    'X_test' : X_test,
    'y_test' : y_test,
    'problem' : "regression"
}

preprocessor = nodegam.mypreprocessor.MyPreprocessor(
    cat_features=data.get('cat_features', None),
    y_normalize=(data['problem'] == 'regression'), # Normalize target y to mean 0 and 1 in regression
    random_state=1337, quantile_transform=True,
    quantile_noise=data.get('quantile_noise', parameters['quantile_noise']),
    n_quantiles=parameters['n_quantiles'],
)

X_train, y_train = data['X_train'], data['y_train']
X_test, y_test = data['X_test'], data['y_test']
preprocessor.fit(X_train, y_train)

X_train, y_train = preprocessor.transform(X_train, y_train)
X_test, y_test = preprocessor.transform(X_test, y_test)

anneal_steps = parameters['anneal_steps']

choice_fn = nodegam.nn_utils.EM15Temp(max_temp=1., min_temp=parameters['min_temp'], steps=anneal_steps)

# Temperature annealing for entmoid
model = nodegam.arch.GAMBlock(
    in_features=X_train.shape[1],
    num_trees=parameters['num_trees'],
    num_layers=parameters['num_layers'],
    num_classes=1,
    addi_tree_dim=1,
    depth=parameters['depth'],
    choice_function=choice_fn,
    bin_function=nodegam.nn_utils.entmoid15,
    output_dropout=parameters['output_dropout'],
    last_dropout=parameters['last_dropout'],
    colsample_bytree=parameters['colsample_bytree'],
    selectors_detach=parameters['selectors_detach'], # This is only used to save memory in large datasets like epsilon
    add_last_linear=True,
    ga2m=parameters['ga2m'],
    l2_lambda=parameters['l2_lambda'],
).to(device)

step_callbacks = [choice_fn.temp_step_callback]

from qhoptim.pyt import QHAdam
optimizer_params = {'nus': (parameters['nus_min'], parameters['nus_max']), 'betas': (0.95, 0.998)}

trainer = nodegam.trainer.Trainer(
    model=model,
    experiment_name=name,
    warm_start=True, # if True, will load latest checkpt in the saved dir logs/${name}
    Optimizer=QHAdam,
    optimizer_params=optimizer_params,
    lr=parameters['lr'],
    lr_warmup_steps=parameters['lr_warmup_steps'],
    verbose=False,
    n_last_checkpoints=5,
    step_callbacks=step_callbacks, # Temp annelaing
    fp16=1,
    problem=data['problem'],
)
batch_size = parameters['batch_size']

cuda
Normalize y. mean = 42.95601381439407, std = 40.35418349568578


In [16]:
trainer.load_checkpoint(tag='best')
test_err = trainer.evaluate_mse(X_test, y_test, device=device, batch_size=2 * batch_size)

print("Test Error rate: {}".format(test_err))

# Clean up
trainer.remove_old_temp_checkpoints(number_ckpts_to_keep=0)

  checkpoint = torch.load(path)


Test Error rate: 1.0612823963165283


In [17]:
from nodegam.utils import get_latest_file, check_numpy, process_in_chunks

X_test = torch.as_tensor(X_test, device=device)
y_test = check_numpy(y_test)
model.train(False)
with torch.no_grad():
    prediction = process_in_chunks(model, X_test, batch_size=batch_size)
    prediction = check_numpy(prediction)

mu = 42.95601381439407
sigma = 40.35418349568578

original_data_test = [mu + z * sigma for z in prediction]
original_data_true = [mu + z * sigma for z in y_test]

In [18]:
from feature_utils import compute_overlap_true_test

compute_overlap_true_test(original_data_true, original_data_test, y_test_id)

average_matching_indices :  6.350961538461538


6.350961538461538

In [19]:
import copy
gam_model = model

In [42]:

import torch
from datetime import datetime
import pandas as pd
from explainability import *
# from model_utils import MLPModel
from feature_utils import process
from nodegam.utils import process_in_chunks, check_numpy


test_df = {}
test_df_after_10nov = {}
model = {}
k = 15
for format in ["OD"]:#, "T20", "Test"]:
    test_df[format] = pd.read_csv(f"../data/processed/test/{k}_{format}.csv")
    test_df[format]['match_id'] = test_df[format]['match_id'].astype(str)
    test_df_after_10nov[format] = pd.read_csv(f"../data/processed/test_after10nov/{k}_{format}.csv")
    test_df_after_10nov[format]['match_id'] = test_df_after_10nov[format]['match_id'].astype(str)

    # model[format] = MLPModel(35, 64) ######################################################################33
    # state_dict = torch.load(f"../model_artifacts/{format}_model.pth", weights_only=True)
    # model[format].load_state_dict(state_dict)
    # model[format].eval()
    
    model[format] = gam_model.to(device)

    model[format].eval()

players_data = pd.read_csv("../data/raw/cricksheet/people.csv")

# {
#         "A Kumble": "0c2730df",
#         "Abdul Razzaq": "390ff45b", ###
#         "Arshad Khan": "0a4c6dfd", ###
#         "D Mongia": "99663fa5",
#         "Harbhajan Singh": "8b5b6769",
#         "Iftikhar Anjum": "6adf9347", ###
#         "Inzamam-ul-Haq": "b9a3d8c6", ###
#         "Kamran Akmal": "ff077124", ###
#         "L Balaji": "b2b50355",
#         "M Kaif": "d84378a4",
#         "MS Dhoni": "4a8a2e3b",
#         "Mohammad Hafeez": "9ab63e7b",
#         "Yousuf Youhana": "e237b28c", ###
#         "Naved-ul-Hasan": "33f28243", ###
#         "R Dravid": "0184dc35",
#         "SR Tendulkar": "d2c2b2d5",
#         "Salman Butt": "4d6d6280", ###
#         "Shahid Afridi": "0dc00542", ###
#         "Shoaib Malik": "64c34cd0", ###
#         "V Sehwag": "8ba8195d",
#         "Younis Khan": "33cb3411", ###
#         "Yuvraj Singh": "1c914163",
#         "Z Khan": "91a4a398"}

ids = [
    # Pakistan Players
    "390ff45b", "6adf9347", "b9a3d8c6", "ff077124", "64c34cd0",
    "e237b28c", "33f28243", "4d6d6280", "0dc00542", "0a4c6dfd",
    "33cb3411",

    # India Players
    "99663fa5", "8b5b6769", "b2b50355", "d84378a4", "4a8a2e3b", 
    "0184dc35", "0c2730df", "d2c2b2d5", "8ba8195d", "1c914163",
    "91a4a398",
]
date = "2005-04-15"
format = "OD"

# print(test_df[format])
def forward(date, format, players_id_list, match_id=None):
    date = datetime.strptime(date, "%Y-%m-%d")
    target_date = datetime.strptime("2005-04-15", "%Y-%m-%d")
    if date <= target_date:
        df = test_df[format]
        filtered_rows = df[df['match_id'] == match_id]
    
    else:
        df = test_df_after_10nov[format]
        filtered_rows = df[df['Player'].isin(players_id_list)]

    debut_player_points = 10
    debut_explaination = "This player is making his debut in this match, so he is given a default score of 10."

    non_debut_ids = list(filtered_rows['player_id'])
    debut_ids = list(set(players_id_list) - set(non_debut_ids))
    # print(filtered_rows.columns)
    test_data = process(filtered_rows, 15, False)
    columns = test_data.columns

    y_test = filtered_rows["fantasy_points"]
    X_test = test_data 
    X_test, y_test = preprocessor.transform(X_test, y_test)
    X_test = torch.tensor(X_test).to(device)
    # test_data = torch.tensor(test_data).float().to(device)

    # print(test_data.shape)
    # print(model[format])
    # Run the model with additive terms
    with torch.no_grad():  # No gradients needed during inference
        effects = model[format].run_with_additive_terms(X_test)
    # print(effects)
    additive_terms = model[format].get_additive_terms()  # List of 594 feature names (main features and tuples)

    players_data2 = pd.read_csv(f"../data/raw/cricksheet/people.csv")

    # Step 1: Identify main features and store their original indices
    main_feature_mapping = [
        (i, term) for i, term in enumerate(additive_terms) if not isinstance(term, tuple)
    ]
    original_main_feature_indices = [i for i, _ in main_feature_mapping]

    # Step 2: Filter tensor to keep only main features
    main_features_tensor = effects[:, original_main_feature_indices]  # Shape: (22, number_of_main_features)
    print(f"Shape of tensor with main features: {main_features_tensor.shape}")

    # Step 3: Extract top "k" main features for each row
    k = 5  # Number of top features to extract
    top_k_values, top_k_indices = torch.topk(main_features_tensor, k, dim=1)

    # Step 4: Map back to original feature indices and names
    players_data = []
    for i, row in enumerate(main_features_tensor):
        players_data.append({})
        for idx in top_k_indices[i]:
            _, feature = main_feature_mapping[idx]
            feature_name = columns[feature]
            players_data[i][feature_name] = main_features_tensor[i][idx].item()
    
    # model[format].eval()
    # with torch.no_grad():
    #     prediction = model[format](X_test)
    # output = prediction
    # print("Prediction = ", prediction)
    
    # output, non_debut_ids = zip(*sorted(zip(output, non_debut_ids), key=lambda x: x[0], reverse=True))
    # output = list(output)
    non_debut_ids = list(non_debut_ids)
    non_debut_names = players_data2.loc[players_data2['identifier'].isin(players_id_list), 'name'].values
    response = generate_explanations(players_data, non_debut_names)
    explainations = response.text
    separator = "\n---\n"
    explaination_list = explainations.split(separator)
    for text in explaination_list:
        print(text)
    # for explanation in explaination_list: print(explanation)
    # explaination_list = explain_outputs(model[format], test_data, columns, non_debut_names)
    # backup_explaination = backup_outputs(model[format], test_data, columns, non_debut_names)

    output = output + [debut_player_points] * len(debut_ids)
    explaination_list += [debut_explaination] * len(debut_ids)

    
        
    return output, explaination_list

forward(date, format, ids, match_id="1451893")

Shape of tensor with main features: torch.Size([22, 35, 1])
Abdul Razzaq's selection is due to his recent impressive run-outs and consistent bowling figures in the last 15 matches, coupled with a good venue average for wickets.  His overall bowling performance significantly contributed to his high predicted score.


A Kumble's selection is justified by his strong performance at the venue (high wicket average), combined with decent bowling stats in recent matches. His experience and ability to contain runs also played a role.


Arshad Khan made the team based on his strong bowling performance at the venue and his consistent dot ball bowling in recent matches. His experience also contributed to his selection.


D Mongia's high predicted score stems from his excellent dot ball bowling record in recent matches and his performance at the venue. His consistent batting in the last 15 games also added to his fantasy score.


Harbhajan Singh's selection is attributed to his bowling performance 

UnboundLocalError: local variable 'output' referenced before assignment

In [None]:
(gam_model.get_additive_terms())

[34,
 33,
 (33, 34),
 32,
 (32, 34),
 (32, 33),
 31,
 (31, 34),
 (31, 33),
 (31, 32),
 30,
 (30, 34),
 (30, 33),
 (30, 32),
 (30, 31),
 29,
 (29, 34),
 (29, 33),
 (29, 32),
 (29, 31),
 28,
 (28, 34),
 (28, 33),
 (28, 32),
 (28, 31),
 (28, 30),
 (28, 29),
 27,
 (27, 34),
 (27, 33),
 (27, 32),
 (27, 31),
 (27, 30),
 (27, 29),
 (27, 28),
 26,
 (26, 33),
 (26, 32),
 (26, 31),
 (26, 30),
 (26, 29),
 (26, 28),
 (26, 27),
 25,
 (25, 34),
 (25, 33),
 (25, 32),
 (25, 31),
 (25, 30),
 (25, 29),
 (25, 28),
 (25, 27),
 (25, 26),
 24,
 (24, 34),
 (24, 33),
 (24, 32),
 (24, 31),
 (24, 30),
 (24, 29),
 (24, 28),
 (24, 27),
 (24, 26),
 23,
 (23, 34),
 (23, 33),
 (23, 32),
 (23, 31),
 (23, 30),
 (23, 29),
 (23, 28),
 (23, 27),
 (23, 26),
 (23, 24),
 22,
 (22, 33),
 (22, 32),
 (22, 31),
 (22, 30),
 (22, 29),
 (22, 26),
 (22, 25),
 (22, 24),
 (22, 23),
 21,
 (21, 33),
 (21, 32),
 (21, 31),
 (21, 30),
 (21, 29),
 (21, 26),
 (21, 25),
 (21, 24),
 (21, 23),
 (21, 22),
 20,
 (20, 33),
 (20, 32),
 (20, 31),
 