# Optuna Trials For Baseline Model

In [1]:
# Standard library imports
import datetime
import os
from collections import deque
import time

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns


from sklearn.model_selection import train_test_split

from tqdm import tqdm

if os.path.exists('/workspace/data_2'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data_2/'
else:
    data_path = '../data/'
    
# if torch.cuda.is_available() == False:
#     RuntimeError("GPU detected: False")
#     print("GPU detected: False")
# else:
#     device = torch.device("cuda")
#     print("The GPU is detected.")



### Load Data

In [2]:
sets_df = pd.read_pickle(data_path + 'sets_with_results_df.pkl')
dataset_mini_df = pd.read_pickle(data_path + 'dataset_mini.pkl')

## Combine Data

In [3]:
sets_df.head()

Unnamed: 0,key_x,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,valid_score,...,result_1,result_2,result_3,result_4,result_5,result_6,result_7,result_8,result_9,result_10
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,Chillin,5620,1,3,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,15634,3,2,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,1097,6126,0,3,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,1069,Chu,3,0,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Jerry,Rishi,1,3,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [4]:
dataset_mini_df.head()

Unnamed: 0,p1_elo,p2_elo,p1_rd,p2_rd,p1_updates,p2_updates,p1_m1_usage,p1_m2_usage,p2_m1_usage,p2_m2_usage,...,p1/m2_elo,p1/m2_rd,p1/m2_updates,p2/m1_elo,p2/m1_rd,p2/m1_updates,p2/m2_elo,p2/m2_rd,p2/m2_updates,winner
681,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,0.0
682,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,0.0
683,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,1.0
684,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,1.0
685,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,1.0


Merge dataframes

In [5]:
dataset_df = pd.merge(sets_df, dataset_mini_df, how='left', left_index=True, right_index=True)
dataset_df.head()

Unnamed: 0,key_x,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,valid_score,...,p1/m2_elo,p1/m2_rd,p1/m2_updates,p2/m1_elo,p2/m1_rd,p2/m1_updates,p2/m2_elo,p2/m2_rd,p2/m2_updates,winner
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,Chillin,5620,1,3,True,...,1500.0,350.0,0.0,1613.001275,70.60079,5.0,1500.0,216.482281,0.0,0.0
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,15634,3,2,True,...,1500.0,350.0,0.0,1637.162591,77.931512,5.0,1408.326897,271.489866,1.0,1.0
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,1097,6126,0,3,True,...,1500.0,350.0,0.0,1618.921028,81.744391,4.0,1500.0,350.0,0.0,0.0
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,1069,Chu,3,0,True,...,1279.022079,284.579342,1.0,1500.0,350.0,0.0,1500.0,350.0,0.0,1.0
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Jerry,Rishi,1,3,True,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,0.0


Combine the score so that we can do a regression model.

In [6]:
dataset_df['regression_score'] = None
dataset_df.loc[(dataset_df['valid_score']==True),'regression_score'] = dataset_df['p1_score'] / (dataset_df['p1_score'] + dataset_df['p2_score'])



In [7]:
# Drop rows with missing values in specified columns
train_df = dataset_df.dropna(subset=dataset_df.columns[36:])

((train_df['regression_score'] > .5) == train_df['winner']).value_counts()


True    1193953
Name: count, dtype: int64

Identify columns for training.

In [8]:
# for i, col in enumerate(dataset_df.columns):
#     print(i, col)
    

In [9]:
results = dataset_df.columns[36:46]
general_elo = dataset_df.columns[46:50]
all_elo = dataset_df.columns[46:-2]
features = dataset_df.columns[36:-2]
targets=['winner','regression_score']



Train a model just on the general elo.

In [10]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Drop rows with missing values in specified columns
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Train-test splits
df_train, df_test = train_test_split(df, stratify=df['winner'], train_size=.8, random_state=42)

X_train_c = df_train[features]
X_test_c = df_test[features]
y_train_c = df_train['winner']
y_test_c = df_test['winner']

# Filter out the samples with valid scores
df_train_r = df_train.dropna(subset=['regression_score'])
df_test_r = df_test.dropna(subset=['regression_score'])

X_train_r = df_train_r[features]
X_test_r = df_test_r[features]
y_train_r = df_train_r['regression_score']
y_test_r = df_test_r['regression_score']

# Train the Classifier
model_classifier = XGBClassifier()
model_classifier.fit(X_train_c, y_train_c)

# Train the Regressor
model_regressor = XGBRegressor()
model_regressor.fit(X_train_r, y_train_r)



# Predictions for Classifier on classification test set
probs_c_test_c = model_classifier.predict_proba(X_test_c)[:, 1]  # Probability for class 1 (player 1 wins)
preds_c_test_c = model_classifier.predict(X_test_c)

# Metrics for Classifier on classification test set
accuracy_c_test_c = accuracy_score(y_test_c, preds_c_test_c)
log_loss_c_test_c = log_loss(y_test_c, probs_c_test_c)



# Predictions for Classifier on regression test set
probs_c_test_r = model_classifier.predict_proba(X_test_r)[:, 1]
preds_c_test_r = model_classifier.predict(X_test_r)

# Metrics for Classifier on regression test set
accuracy_c_test_r = accuracy_score(df_test_r['winner'], preds_c_test_r)
log_loss_c_test_r = log_loss(df_test_r['winner'], probs_c_test_r)



# Predictions for Regressor on classification test set
preds_r_test_c = model_regressor.predict(X_test_c)  # Regressor predictions on classification test set

# Metrics for Regressor on classification test set
accuracy_r_test_c = accuracy_score(y_test_c, preds_r_test_c > 0.5)



# Predictions for Regressor on regression test set
preds_r_test_r = model_regressor.predict(X_test_r)

# Metrics for Regressor on regression test set
accuracy_r_test_c = accuracy_score(df_test_r['winner'], preds_r_test_r > 0.5)
mse_r_test_r = mean_squared_error(y_test_r, preds_r_test_r)



# Display Results
print("Classifier Results on Classification Test Set:")
print(f" - Accuracy: {accuracy_c_test_c:.2%}")
print(f" - Log-Loss: {log_loss_c_test_c:.4f}")

print("\nRegressor Results on Classification Test Set:")
print(f" - Accuracy: {accuracy_r_test_c:.2%}")
# print(f" - Log-Loss (derived): {log_loss_r_c_test_c:.4f}")

print("\nClassifier Results on Regression Test Set:")
print(f" - Accuracy: {accuracy_c_test_r:.2%}")
print(f" - Log-Loss: {log_loss_c_test_r:.4f}")

print("\nRegressor Results on Regression Test Set:")
print(f" - Accuracy: {accuracy_c_test_r:.2%}")
print(f" - MSE: {mse_r_test_r:.4f}")


Classifier Results on Classification Test Set:
 - Accuracy: 76.73%
 - Log-Loss: 0.4833

Regressor Results on Classification Test Set:
 - Accuracy: 79.95%

Classifier Results on Regression Test Set:
 - Accuracy: 79.85%
 - Log-Loss: 0.4289

Regressor Results on Regression Test Set:
 - Accuracy: 79.85%
 - MSE: 0.0952


In [11]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Drop rows with missing values in specified columns
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Train-test splits
df_train, df_test = train_test_split(df, stratify=df['winner'], train_size=.8, random_state=42)

X_train_c = df_train[features]
X_test_c = df_test[features]
y_train_c = df_train['winner']
y_test_c = df_test['winner']

# Filter out the samples with valid scores
df_train_r = df_train.dropna(subset=['regression_score'])
df_test_r = df_test.dropna(subset=['regression_score'])

X_train_r = df_train_r[features]
X_test_r = df_test_r[features]
y_train_r = df_train_r['regression_score']
y_test_r = df_test_r['regression_score']

# Train the Classifier
model_classifier = XGBClassifier()
model_classifier.fit(X_train_r, df_train_r['winner'])

# Train the Regressor
model_regressor = XGBRegressor()
model_regressor.fit(X_train_r, y_train_r)



# Predictions for Classifier on classification test set
probs_c_test_c = model_classifier.predict_proba(X_test_c)[:, 1]  # Probability for class 1 (player 1 wins)
preds_c_test_c = model_classifier.predict(X_test_c)

# Metrics for Classifier on classification test set
accuracy_c_test_c = accuracy_score(y_test_c, preds_c_test_c)
log_loss_c_test_c = log_loss(y_test_c, probs_c_test_c)



# Predictions for Classifier on regression test set
probs_c_test_r = model_classifier.predict_proba(X_test_r)[:, 1]
preds_c_test_r = model_classifier.predict(X_test_r)

# Metrics for Classifier on regression test set
accuracy_c_test_r = accuracy_score(df_test_r['winner'], preds_c_test_r)
log_loss_c_test_r = log_loss(df_test_r['winner'], probs_c_test_r)



# Predictions for Regressor on classification test set
preds_r_test_c = model_regressor.predict(X_test_c)  # Regressor predictions on classification test set

# Metrics for Regressor on classification test set
accuracy_r_test_c = accuracy_score(y_test_c, preds_r_test_c > 0.5)



# Predictions for Regressor on regression test set
preds_r_test_r = model_regressor.predict(X_test_r)

# Metrics for Regressor on regression test set
accuracy_r_test_c = accuracy_score(df_test_r['winner'], preds_r_test_r > 0.5)
mse_r_test_r = mean_squared_error(y_test_r, preds_r_test_r)



# Display Results
print("Classifier Results on Classification Test Set:")
print(f" - Accuracy: {accuracy_c_test_c:.2%}")
print(f" - Log-Loss: {log_loss_c_test_c:.4f}")

print("\nRegressor Results on Classification Test Set:")
print(f" - Accuracy: {accuracy_r_test_c:.2%}")
# print(f" - Log-Loss (derived): {log_loss_r_c_test_c:.4f}")

print("\nClassifier Results on Regression Test Set:")
print(f" - Accuracy: {accuracy_c_test_r:.2%}")
print(f" - Log-Loss: {log_loss_c_test_r:.4f}")

print("\nRegressor Results on Regression Test Set:")
print(f" - Accuracy: {accuracy_c_test_r:.2%}")
print(f" - MSE: {mse_r_test_r:.4f}")


Classifier Results on Classification Test Set:
 - Accuracy: 76.70%
 - Log-Loss: 0.4999

Regressor Results on Classification Test Set:
 - Accuracy: 79.95%

Classifier Results on Regression Test Set:
 - Accuracy: 79.93%
 - Log-Loss: 0.4181

Regressor Results on Regression Test Set:
 - Accuracy: 79.93%
 - MSE: 0.0952
