# Optuna Trials For Baseline Model

In [1]:
# Standard library imports
import datetime
import os
from collections import deque
import time

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns


from sklearn.model_selection import train_test_split

from tqdm import tqdm

# Choose the data directory: prefer the mounted workspace volume when it
# exists, otherwise fall back to the repository-relative data folder.
data_path = '/workspace/data_2/' if os.path.exists('/workspace/data_2') else '../data/'
    
# if torch.cuda.is_available() == False:
#     RuntimeError("GPU detected: False")
#     print("GPU detected: False")
# else:
#     device = torch.device("cuda")
#     print("The GPU is detected.")



### Load Data

In [2]:
# Read the two pickled DataFrames stored under `data_path`.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
sets_df, dataset_mini_df = (
    pd.read_pickle(data_path + pickle_name)
    for pickle_name in ('sets_with_results_df.pkl', 'dataset_mini.pkl')
)

## Combine Data

In [3]:
# Preview the first five rows of the per-set table.
sets_df.head(n=5)

Unnamed: 0,key_x,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,valid_score,...,result_1,result_2,result_3,result_4,result_5,result_6,result_7,result_8,result_9,result_10
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,Chillin,5620,1,3,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,15634,3,2,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,1097,6126,0,3,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,1069,Chu,3,0,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Jerry,Rishi,1,3,True,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [4]:
# Preview the first five rows of the feature table.
dataset_mini_df.head(n=5)

Unnamed: 0,p1_elo,p2_elo,p1_rd,p2_rd,p1_updates,p2_updates,p1_m1_usage,p1_m2_usage,p2_m1_usage,p2_m2_usage,...,p1/m2_elo,p1/m2_rd,p1/m2_updates,p2/m1_elo,p2/m1_rd,p2/m1_updates,p2/m2_elo,p2/m2_rd,p2/m2_updates,winner
681,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,0.0
682,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,0.0
683,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,1.0
684,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,1.0
685,1500.0,1500.0,350.0,350.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,1.0


Merge dataframes

In [5]:
# Left-join the feature table onto the sets table by row index, so every row
# of `sets_df` is kept even when `dataset_mini_df` has no matching index.
dataset_df = sets_df.merge(
    dataset_mini_df,
    how='left',
    left_index=True,
    right_index=True,
)
dataset_df.head(n=5)

Unnamed: 0,key_x,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,valid_score,...,p1/m2_elo,p1/m2_rd,p1/m2_updates,p2/m1_elo,p2/m1_rd,p2/m1_updates,p2/m2_elo,p2/m2_rd,p2/m2_updates,winner
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,Chillin,5620,1,3,True,...,1500.0,350.0,0.0,1613.001275,70.60079,5.0,1500.0,216.482281,0.0,0.0
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,15634,3,2,True,...,1500.0,350.0,0.0,1637.162591,77.931512,5.0,1408.326897,271.489866,1.0,1.0
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,1097,6126,0,3,True,...,1500.0,350.0,0.0,1618.921028,81.744391,4.0,1500.0,350.0,0.0,0.0
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,1069,Chu,3,0,True,...,1279.022079,284.579342,1.0,1500.0,350.0,0.0,1500.0,350.0,0.0,1.0
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Jerry,Rishi,1,3,True,...,1500.0,350.0,0.0,1500.0,350.0,0.0,1500.0,350.0,0.0,0.0


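Combine the two players' scores into a single value, player 1's share of the total score (`p1_score / (p1_score + p2_score)`), so that we can also train a regression model.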

In [6]:
# Regression target: player 1's share of the total score in the set,
# p1_score / (p1_score + p2_score).
# Only rows whose `valid_score` flag is True receive a value; every other row
# is left missing (NaN), so `dropna(subset=['regression_score'])` still
# filters them out.
# The column is built in one step as float64. Seeding it with `None` first
# would make it `object` dtype, which is a poor fit for a numeric target.
has_valid_score = dataset_df['valid_score'] == True
p1_score_share = dataset_df['p1_score'] / (dataset_df['p1_score'] + dataset_df['p2_score'])
dataset_df['regression_score'] = p1_score_share.where(has_valid_score).astype('float64')



In [7]:
# Fraction of all rows whose score is flagged as valid.
dataset_df['valid_score'].eq(True).sum() / len(dataset_df)

0.6650847227319329

In [8]:
# Keep only rows that have no missing value in any column from position 36
# onward (this slice runs to the very last column).
columns_required = dataset_df.columns[36:]
train_df = dataset_df.dropna(subset=columns_required)

# Sanity check: a regression score above 0.5 should coincide with `winner`.
score_favours_p1 = train_df['regression_score'] > .5
(score_favours_p1 == train_df['winner']).value_counts()




True    1193953
Name: count, dtype: int64

Identify columns for training.

In [12]:
# Print each column next to its positional index; the positional slices used
# in later cells refer to these numbers.
for i, col in enumerate(dataset_df.columns):
    print(f"{i} {col}")
    

0 key_x
1 game
2 tournament_key
3 winner_id
4 loser_id
5 p1_id
6 p2_id
7 p1_score
8 p2_score
9 valid_score
10 best_of
11 location_names
12 bracket_name
13 bracket_order
14 set_order
15 game_data
16 top_8
17 top_8_location_names
18 valid_top_8_bracket
19 top_8_bracket_location_names
20 major
21 key_y
22 start
23 end
24 start_week
25 p1_characters
26 p2_characters
27 p1_consistent
28 p2_consistent
29 matchup_strings
30 end_week
31 players_have_history
32 (p1/p2)_sorted
33 (p1/p2)_was_sorted
34 results_sorted
35 results
36 result_1
37 result_2
38 result_3
39 result_4
40 result_5
41 result_6
42 result_7
43 result_8
44 result_9
45 result_10
46 p1_elo
47 p2_elo
48 p1_rd
49 p2_rd
50 p1_updates
51 p2_updates
52 p1_m1_usage
53 p1_m2_usage
54 p2_m1_usage
55 p2_m2_usage
56 p1/m1/m1_elo
57 p1/m1/m1_rd
58 p1/m1/m1_updates
59 p1/m1/m2_elo
60 p1/m1/m2_rd
61 p1/m1/m2_updates
62 p1/m2/m1_elo
63 p1/m2/m1_rd
64 p1/m2/m1_updates
65 p1/m2/m2_elo
66 p1/m2/m2_rd
67 p1/m2/m2_updates
68 p2/m1/m1_elo
69 p2/m1/m

In [None]:
# Column groups used by the models below.
#
# The groups are selected by column NAME rather than by hard-coded positions
# (36, 46, 50, -2). The target columns are removed explicitly, so a changed
# column order, or a missing `regression_score` column, raises a KeyError
# instead of silently shifting which columns are used as features.
# With the column layout printed above, these are the same columns the
# positional slices selected.
targets = ['winner', 'regression_score']

# The ten `result_1` .. `result_10` columns.
results = pd.Index([f'result_{k}' for k in range(1, 11)])

# The four player-level Elo / rating-deviation columns.
general_elo = pd.Index(['p1_elo', 'p2_elo', 'p1_rd', 'p2_rd'])

# Every column from `p1_elo` onward, minus the targets.
first_elo_position = dataset_df.columns.get_loc('p1_elo')
all_elo = dataset_df.columns[first_elo_position:].drop(targets)

# Every column from `result_1` onward, minus the targets.
first_feature_position = dataset_df.columns.get_loc('result_1')
features = dataset_df.columns[first_feature_position:].drop(targets)




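Train a baseline XGBoost classifier. The general-Elo-only and all-Elo variants are commented out in the cell below; the active code fits a model on all features.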

In [None]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Stratified 80/20 split on the `winner` label, seeded for repeatability.
df_train, df_test = train_test_split(
    df, train_size=.8, stratify=df['winner'], random_state=42
)

# Disabled variant: general ELO columns only.
# model = XGBClassifier()
# model.fit(df_train[general_elo], df_train['winner'])
# preds = model.predict(df_test[general_elo])
# print(f"Only general ELO accuracy: {accuracy_score(df_test['winner'], preds):.3%}")

# Disabled variant: all ELO columns.
# model = XGBClassifier()
# model.fit(df_train[all_elo], df_train['winner'])
# preds = model.predict(df_test[all_elo])
# print(f"All ELO accuracy: {accuracy_score(df_test['winner'], preds):.3%}")

# Active variant: every feature column, default hyper-parameters.
model = XGBClassifier()
model.fit(df_train[features], df_train['winner'])
preds = model.predict(df_test[features])
all_features_accuracy = accuracy_score(df_test['winner'], preds)
print(f"All features accuracy: {all_features_accuracy:.3%}")


Only general ELO accuracy: 76.172%


KeyboardInterrupt: 

In [18]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Stratified 80/20 split on the `winner` label.
df_train, df_test = train_test_split(
    df, train_size=.8, stratify=df['winner'], random_state=42
)

# Partition both splits on whether `result_1` differs from 0.5.
# NOTE(review): 0.5 appears to be the placeholder for "no prior results";
# confirm against the feature-building code.
df_train_results = df_train[df_train['result_1'] != .5]
df_test_results = df_test[df_test['result_1'] != .5]
df_train_no_results = df_train[df_train['result_1'] == .5]
df_test_no_results = df_test[df_test['result_1'] == .5]

# Fit one default classifier per training frame and score each on the full
# test set and on both test partitions.
training_runs = (
    ("Trained on all data:", df_train),
    ("Trained with results only:", df_train_results),
)
for run_position, (heading, train_frame) in enumerate(training_runs):
    model = XGBClassifier()
    model.fit(train_frame[features], train_frame['winner'])
    preds_all = model.predict(df_test[features])
    preds_results = model.predict(df_test_results[features])
    preds_no_results = model.predict(df_test_no_results[features])

    print(heading)
    print(f"All accuracy: {accuracy_score(df_test['winner'], preds_all):.3%}")
    print(f"With results accuracy: {accuracy_score(df_test_results['winner'], preds_results):.3%}")
    print(f"Without results accuracy: {accuracy_score(df_test_no_results['winner'], preds_no_results):.3%}")
    if run_position < len(training_runs) - 1:
        print()  # blank line between the two reports only

Trained on all data:
All accuracy: 76.730%
With results accuracy: 78.603%
Without results accuracy: 75.787%

Trained with results only:
All accuracy: 72.621%
With results accuracy: 78.388%
Without results accuracy: 69.720%


In [33]:
# Train on a reduced feature set: the general Elo columns plus the
# `result_*` columns (not the full `features` set).
# This cell reuses `df_train` / `df_test` from the preceding split cell.
elo_and_results = general_elo.append(results)

model = XGBClassifier()
model.fit(df_train[elo_and_results], df_train['winner'])
preds = model.predict(df_test[elo_and_results])
# The label names the feature set actually used; the previous
# "All features" label did not match it.
print(f"General ELO + results accuracy: {accuracy_score(df_test['winner'], preds):.3%}")

All features accuracy: 76.403%


In [11]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Stratified 80/20 split on the `winner` label.
df_train, df_test = train_test_split(
    df, train_size=.8, stratify=df['winner'], random_state=42
)

# Classification view: every row of the split, target is `winner`.
X_train_c, y_train_c = df_train[features], df_train['winner']
X_test_c, y_test_c = df_test[features], df_test['winner']

# Regression view: only the rows that have a `regression_score`.
df_train_r = df_train.dropna(subset=['regression_score'])
df_test_r = df_test.dropna(subset=['regression_score'])
X_train_r, y_train_r = df_train_r[features], df_train_r['regression_score']
X_test_r, y_test_r = df_test_r[features], df_test_r['regression_score']

# Fit both models with default hyper-parameters.
model_classifier = XGBClassifier()
model_classifier.fit(X_train_c, y_train_c)

model_regressor = XGBRegressor()
model_regressor.fit(X_train_r, y_train_r)

# Classifier scored on the classification test set.
# Column 1 of predict_proba is the probability of class 1 (player 1 wins).
probs_c_test_c = model_classifier.predict_proba(X_test_c)[:, 1]
preds_c_test_c = model_classifier.predict(X_test_c)
accuracy_c_test_c = accuracy_score(y_test_c, preds_c_test_c)
log_loss_c_test_c = log_loss(y_test_c, probs_c_test_c)

# Classifier scored on the regression test set.
probs_c_test_r = model_classifier.predict_proba(X_test_r)[:, 1]
preds_c_test_r = model_classifier.predict(X_test_r)
accuracy_c_test_r = accuracy_score(df_test_r['winner'], preds_c_test_r)
log_loss_c_test_r = log_loss(df_test_r['winner'], probs_c_test_r)

# Regressor scored on the classification test set: a predicted score above
# 0.5 is counted as a predicted win.
preds_r_test_c = model_regressor.predict(X_test_c)
accuracy_r_test_c = accuracy_score(y_test_c, preds_r_test_c > 0.5)

# Regressor scored on the regression test set.
preds_r_test_r = model_regressor.predict(X_test_r)
accuracy_r_test_r = accuracy_score(df_test_r['winner'], preds_r_test_r > 0.5)
mse_r_test_r = mean_squared_error(y_test_r, preds_r_test_r)

# Report all four model / test-set combinations.
report_lines = [
    "Classifier Results on Classification Test Set:",
    f" - Accuracy: {accuracy_c_test_c:.2%}",
    f" - Log-Loss: {log_loss_c_test_c:.4f}",
    "\nRegressor Results on Classification Test Set:",
    f" - Accuracy: {accuracy_r_test_c:.2%}",
    "\nClassifier Results on Regression Test Set:",
    f" - Accuracy: {accuracy_c_test_r:.2%}",
    f" - Log-Loss: {log_loss_c_test_r:.4f}",
    "\nRegressor Results on Regression Test Set:",
    f" - Accuracy: {accuracy_r_test_r:.2%}",
    f" - MSE: {mse_r_test_r:.4f}",
]
print("\n".join(report_lines))


Classifier Results on Classification Test Set:
 - Accuracy: 76.73%
 - Log-Loss: 0.4833

Regressor Results on Classification Test Set:
 - Accuracy: 76.72%

Classifier Results on Regression Test Set:
 - Accuracy: 79.85%
 - Log-Loss: 0.4289

Regressor Results on Regression Test Set:
 - Accuracy: 79.95%
 - MSE: 0.0952


In [12]:
# Restrict the held-out rows to those flagged with `valid_top_8_bracket`.
# This cell reuses `df_test`, `model_classifier` and `model_regressor` from
# the cell that trained them.
testing_df = df_test[df_test['valid_top_8_bracket'] == True]
top8_features = testing_df[features]
top8_winner = testing_df['winner']

# Classifier on the top-8 subset.
# Column 1 of predict_proba is the probability of class 1 (player 1 wins).
probs_c_test_c = model_classifier.predict_proba(top8_features)[:, 1]
preds_c_test_c = model_classifier.predict(top8_features)
accuracy_c_test_c = accuracy_score(top8_winner, preds_c_test_c)
log_loss_c_test_c = log_loss(top8_winner, probs_c_test_c)

# Regressor on the top-8 subset: a predicted score above 0.5 is counted as a
# predicted win. Regression-test-set metrics are not computed for this subset.
preds_r_test_c = model_regressor.predict(top8_features)
accuracy_r_test_c = accuracy_score(top8_winner, preds_r_test_c > 0.5)

# Report.
report_lines = [
    "Classifier Results on Top 8 Test Set:",
    f" - Accuracy: {accuracy_c_test_c:.2%}",
    f" - Log-Loss: {log_loss_c_test_c:.4f}",
    "\nRegressor Results on Top 8 Test Set:",
    f" - Accuracy: {accuracy_r_test_c:.2%}",
]
print("\n".join(report_lines))

Classifier Results on Top 8 Test Set:
 - Accuracy: 73.14%
 - Log-Loss: 0.5356

Regressor Results on Top 8 Test Set:
 - Accuracy: 73.03%


In [13]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Compare a classifier and a regressor that are BOTH trained on the rows that
# have a regression score, then score each on the full test set and on the
# regression-score test subset.

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column. `df` is derived from `dataset_df` (not from a
# previous `df`) so the cell does not depend on which cell ran before it.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Stratified 80/20 split on the `winner` label.
df_train, df_test = train_test_split(df, stratify=df['winner'], train_size=.8, random_state=42)

# Classification view: every row of the split, target is `winner`.
# (The training half of this view is not used for fitting in this cell.)
X_train_c = df_train[features]
X_test_c = df_test[features]
y_train_c = df_train['winner']
y_test_c = df_test['winner']

# Regression view: only the rows that have a `regression_score`.
df_train_r = df_train.dropna(subset=['regression_score'])
df_test_r = df_test.dropna(subset=['regression_score'])

X_train_r = df_train_r[features]
X_test_r = df_test_r[features]
y_train_r = df_train_r['regression_score']
y_test_r = df_test_r['regression_score']

# Train the classifier on the regression-view rows, with `winner` as target.
model_classifier = XGBClassifier()
model_classifier.fit(X_train_r, df_train_r['winner'])

# Train the regressor on the same rows, with `regression_score` as target.
model_regressor = XGBRegressor()
model_regressor.fit(X_train_r, y_train_r)

# Classifier scored on the classification test set.
# Column 1 of predict_proba is the probability of class 1 (player 1 wins).
probs_c_test_c = model_classifier.predict_proba(X_test_c)[:, 1]
preds_c_test_c = model_classifier.predict(X_test_c)
accuracy_c_test_c = accuracy_score(y_test_c, preds_c_test_c)
log_loss_c_test_c = log_loss(y_test_c, probs_c_test_c)

# Classifier scored on the regression test set.
probs_c_test_r = model_classifier.predict_proba(X_test_r)[:, 1]
preds_c_test_r = model_classifier.predict(X_test_r)
accuracy_c_test_r = accuracy_score(df_test_r['winner'], preds_c_test_r)
log_loss_c_test_r = log_loss(df_test_r['winner'], probs_c_test_r)

# Regressor scored on the classification test set: a predicted score above
# 0.5 is counted as a predicted win.
preds_r_test_c = model_regressor.predict(X_test_c)
accuracy_r_test_c = accuracy_score(y_test_c, preds_r_test_c > 0.5)

# Regressor scored on the regression test set.
preds_r_test_r = model_regressor.predict(X_test_r)
accuracy_r_test_r = accuracy_score(df_test_r['winner'], preds_r_test_r > 0.5)
mse_r_test_r = mean_squared_error(y_test_r, preds_r_test_r)

# Display results.
print("Classifier Results on Classification Test Set:")
print(f" - Accuracy: {accuracy_c_test_c:.2%}")
print(f" - Log-Loss: {log_loss_c_test_c:.4f}")

print("\nRegressor Results on Classification Test Set:")
print(f" - Accuracy: {accuracy_r_test_c:.2%}")

print("\nClassifier Results on Regression Test Set:")
print(f" - Accuracy: {accuracy_c_test_r:.2%}")
print(f" - Log-Loss: {log_loss_c_test_r:.4f}")

print("\nRegressor Results on Regression Test Set:")
print(f" - Accuracy: {accuracy_r_test_r:.2%}")
print(f" - MSE: {mse_r_test_r:.4f}")


Classifier Results on Classification Test Set:
 - Accuracy: 76.70%
 - Log-Loss: 0.4999

Regressor Results on Classification Test Set:
 - Accuracy: 76.72%

Classifier Results on Regression Test Set:
 - Accuracy: 79.93%
 - Log-Loss: 0.4181

Regressor Results on Regression Test Set:
 - Accuracy: 79.95%
 - MSE: 0.0952


In [14]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Work only with the rows whose `valid_score` flag is False.
valid_score_false_df = df[df['valid_score'] == False]

# Stratified 80/20 split on the `winner` label.
df_train, df_test = train_test_split(
    valid_score_false_df,
    train_size=.8,
    stratify=valid_score_false_df['winner'],
    random_state=42,
)

# Fit a default classifier on that subset.
model_classifier = XGBClassifier()
model_classifier.fit(df_train[features], df_train['winner'])

# Score it on the held-out part of the same subset.
# Column 1 of predict_proba is the probability of class 1 (player 1 wins).
held_out_features = df_test[features]
probs = model_classifier.predict_proba(held_out_features)[:, 1]
preds = model_classifier.predict(held_out_features)

accuracy = accuracy_score(df_test['winner'], preds)
log_loss_score = log_loss(df_test['winner'], probs)

print(f"Accuracy: {accuracy:.4%}\nLog-Loss: {log_loss_score:.4f}")




Accuracy: 70.9072%
Log-Loss: 0.5602


In [15]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Split the rows by their `valid_score` flag, then make a stratified 80/20
# split inside each group.
valid_score_false_df = df[df['valid_score'] == False]
valid_score_true_df = df[df['valid_score'] == True]
df_false_train, df_false_test = train_test_split(
    valid_score_false_df, train_size=.8, stratify=valid_score_false_df['winner'], random_state=42
)
df_true_train, df_true_test = train_test_split(
    valid_score_true_df, train_size=.8, stratify=valid_score_true_df['winner'], random_state=42
)

# Train on both groups together.
df_train = pd.concat([df_false_train, df_true_train])

model_classifier = XGBClassifier()
model_classifier.fit(df_train[features], df_train['winner'])

# Score the single model on each group's held-out rows in turn.
evaluations = (
    ("Accuracy when test set has no valid score", df_false_test),
    ("Accuracy when test set has a valid score", df_true_test),
)
for position, (caption, test_frame) in enumerate(evaluations):
    # Column 1 of predict_proba is the probability of class 1 (player 1 wins).
    probs = model_classifier.predict_proba(test_frame[features])[:, 1]
    preds = model_classifier.predict(test_frame[features])
    accuracy = accuracy_score(test_frame['winner'], preds)
    log_loss_score = log_loss(test_frame['winner'], probs)

    if position == 0:
        print("Classifier Trained on valid and not valid score")
    print(caption)
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Log-Loss: {log_loss_score:.4f}")
    if position == 0:
        print()  # blank line between the two reports only


Classifier Trained on valid and not valid score
Accuracy when test set has no valid score
Accuracy: 70.64%
Log-Loss: 0.5911

Accuracy when test set has a valid score
Accuracy: 79.96%
Log-Loss: 0.4269


In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
import numpy as np
import pandas as pd

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Split the rows by their `valid_score` flag.
valid_score_false_df = df[df['valid_score'] == False]
valid_score_true_df = df[df['valid_score'] == True]

# Stratified 80/20 split inside each group.
df_false_train, df_false_test = train_test_split(
    valid_score_false_df, train_size=0.8, stratify=valid_score_false_df['winner'], random_state=42
)
df_true_train, df_true_test = train_test_split(
    valid_score_true_df, train_size=0.8, stratify=valid_score_true_df['winner'], random_state=42
)

# One classifier trained on both groups together.
df_train = pd.concat([df_false_train, df_true_train])
model_classifier = XGBClassifier()
model_classifier.fit(df_train[features], df_train['winner'])

# Held-out rows without a valid score.
false_features, false_winner = df_false_test[features], df_false_test['winner']
probs_false = model_classifier.predict_proba(false_features)[:, 1]
preds_false = model_classifier.predict(false_features)
accuracy_false = accuracy_score(false_winner, preds_false)
log_loss_false = log_loss(false_winner, probs_false)

# Held-out rows with a valid score.
true_features, true_winner = df_true_test[features], df_true_test['winner']
probs_true = model_classifier.predict_proba(true_features)[:, 1]
preds_true = model_classifier.predict(true_features)
accuracy_true = accuracy_score(true_winner, preds_true)
log_loss_true = log_loss(true_winner, probs_true)

# Pool both held-out sets, keeping labels and predictions in the same order.
y_test_combined = pd.concat([false_winner, true_winner])
probs_combined = np.concatenate([probs_false, probs_true])
preds_combined = np.concatenate([preds_false, preds_true])

accuracy_combined = accuracy_score(y_test_combined, preds_combined)
log_loss_combined = log_loss(y_test_combined, probs_combined)

# Report.
report_lines = [
    "Classifier Trained on valid and not valid score",
    "Accuracy and Log-Loss for df_false_test:",
    f" - Accuracy: {accuracy_false:.4%}",
    f" - Log-Loss: {log_loss_false:.4f}",
    "",
    "Accuracy and Log-Loss for df_true_test:",
    f" - Accuracy: {accuracy_true:.4%}",
    f" - Log-Loss: {log_loss_true:.4f}",
    "",
    "Combined Accuracy and Log-Loss across both test sets:",
    f" - Combined Accuracy: {accuracy_combined:.4%}",
    f" - Combined Log-Loss: {log_loss_combined:.4f}",
]
print("\n".join(report_lines))


Classifier Trained on valid and not valid score
Accuracy and Log-Loss for df_false_test:
 - Accuracy: 70.6377%
 - Log-Loss: 0.5911

Accuracy and Log-Loss for df_true_test:
 - Accuracy: 79.9632%
 - Log-Loss: 0.4269

Combined Accuracy and Log-Loss across both test sets:
 - Combined Accuracy: 76.8399%
 - Combined Log-Loss: 0.4819


In [17]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Split the rows by their `valid_score` flag, then make a stratified 80/20
# split inside each group.
valid_score_false_df = df[df['valid_score'] == False]
valid_score_true_df = df[df['valid_score'] == True]
df_false_train, df_false_test = train_test_split(
    valid_score_false_df, train_size=.8, stratify=valid_score_false_df['winner'], random_state=42
)
df_true_train, df_true_test = train_test_split(
    valid_score_true_df, train_size=.8, stratify=valid_score_true_df['winner'], random_state=42
)

# Train only on the rows that have a valid score.
df_train = df_true_train

model_classifier = XGBClassifier()
model_classifier.fit(df_train[features], df_train['winner'])

# Score the model on each group's held-out rows in turn.
evaluations = (
    ("Accuracy when test set has no valid score", df_false_test),
    ("Accuracy when test set has a valid score", df_true_test),
)
for position, (caption, test_frame) in enumerate(evaluations):
    # Column 1 of predict_proba is the probability of class 1 (player 1 wins).
    probs = model_classifier.predict_proba(test_frame[features])[:, 1]
    preds = model_classifier.predict(test_frame[features])
    accuracy = accuracy_score(test_frame['winner'], preds)
    log_loss_score = log_loss(test_frame['winner'], probs)

    if position == 0:
        print("Classifier Trained on valid score")
    print(caption)
    print(f"Accuracy: {accuracy:.4%}")
    print(f"Log-Loss: {log_loss_score:.4f}")
    if position == 0:
        print()  # blank line between the two reports only


Classifier Trained on valid score
Accuracy when test set has no valid score
Accuracy: 70.2494%
Log-Loss: 0.6606

Accuracy when test set has a valid score
Accuracy: 80.0931%
Log-Loss: 0.4161


In [18]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Split the rows by their `valid_score` flag, then make a stratified 80/20
# split inside each group.
valid_score_false_df = df[df['valid_score'] == False]
valid_score_true_df = df[df['valid_score'] == True]
df_false_train, df_false_test = train_test_split(
    valid_score_false_df, train_size=.8, stratify=valid_score_false_df['winner'], random_state=42
)
df_true_train, df_true_test = train_test_split(
    valid_score_true_df, train_size=.8, stratify=valid_score_true_df['winner'], random_state=42
)

# Train only on the rows that have no valid score.
df_train = df_false_train

model_classifier = XGBClassifier()
model_classifier.fit(df_train[features], df_train['winner'])

# Score the model on each group's held-out rows in turn.
evaluations = (
    ("Accuracy when test set has no valid score", df_false_test),
    ("Accuracy when test set has a valid score", df_true_test),
)
for position, (caption, test_frame) in enumerate(evaluations):
    # Column 1 of predict_proba is the probability of class 1 (player 1 wins).
    probs = model_classifier.predict_proba(test_frame[features])[:, 1]
    preds = model_classifier.predict(test_frame[features])
    accuracy = accuracy_score(test_frame['winner'], preds)
    log_loss_score = log_loss(test_frame['winner'], probs)

    if position == 0:
        print("Classifier Trained on no valid score")
    print(caption)
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Log-Loss: {log_loss_score:.4f}")
    if position == 0:
        print()  # blank line between the two reports only


Classifier Trained on no valid score
Accuracy when test set has no valid score
Accuracy: 70.91%
Log-Loss: 0.5602

Accuracy when test set has a valid score
Accuracy: 77.18%
Log-Loss: 0.5027


In [19]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
import numpy as np

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Split the rows by their `valid_score` flag.
valid_score_false_df = df[df['valid_score'] == False]
valid_score_true_df = df[df['valid_score'] == True]

# Stratified 80/20 split inside each group.
df_false_train, df_false_test = train_test_split(
    valid_score_false_df, train_size=0.8, stratify=valid_score_false_df['winner'], random_state=42
)
df_true_train, df_true_test = train_test_split(
    valid_score_true_df, train_size=0.8, stratify=valid_score_true_df['winner'], random_state=42
)

# Fit a separate default classifier per group and score it on that group's
# own held-out rows. `model_classifier` ends up holding the second model.
per_group_outputs = {}
for group_key, train_frame, test_frame in (
    ("false", df_false_train, df_false_test),
    ("true", df_true_train, df_true_test),
):
    model_classifier = XGBClassifier()
    model_classifier.fit(train_frame[features], train_frame['winner'])
    per_group_outputs[group_key] = (
        model_classifier.predict_proba(test_frame[features])[:, 1],  # probability of class 1
        model_classifier.predict(test_frame[features]),
    )

probs_false, preds_false = per_group_outputs["false"]
probs_true, preds_true = per_group_outputs["true"]

# Pool both held-out sets, keeping labels and predictions in the same order.
y_test = pd.concat([df_false_test['winner'], df_true_test['winner']])
probs = np.concatenate([probs_false, probs_true])  # used for log-loss
preds = np.concatenate([preds_false, preds_true])  # used for accuracy

accuracy = accuracy_score(y_test, preds)
log_loss_score = log_loss(y_test, probs)

print("Accuracy a classifier predicts the corresponding valid_score that it was trained on")
print(f"Accuracy: {accuracy:.4%}")
print(f"Log-Loss: {log_loss_score:.4f}")


Accuracy a classifier predicts the corresponding valid_score that it was trained on
Accuracy: 77.0165%
Log-Loss: 0.4644


In [20]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Keep only rows with no missing value from column 36 up to, but not
# including, the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Split the rows by their `valid_score` flag.
valid_score_false_df = df[df['valid_score'] == False]
valid_score_true_df = df[df['valid_score'] == True]

# Stratified 80/20 split inside each group.
# NOTE(review): this cell seeds the splits with 44 while the other cells in
# view use 42, so its held-out rows differ from theirs. Confirm this is
# intended before comparing numbers across cells.
df_false_train, df_false_test = train_test_split(
    valid_score_false_df, train_size=0.8, stratify=valid_score_false_df['winner'], random_state=44
)
df_true_train, df_true_test = train_test_split(
    valid_score_true_df, train_size=0.8, stratify=valid_score_true_df['winner'], random_state=44
)

# The regressor is trained only on the rows that have a valid score.
df_train = df_true_train
model_regressor = XGBRegressor()
model_regressor.fit(df_train[features], df_train['regression_score'])

# Held-out rows without a valid score: a predicted score above 0.5 is counted
# as a predicted win.
preds_false = model_regressor.predict(df_false_test[features])
accuracy_false = accuracy_score(df_false_test['winner'], preds_false > 0.5)

print("Regressor Trained on some valid score")
print("Accuracy when test set has no valid score:")
print(f"Accuracy: {accuracy_false:.4%}")
print()

# Held-out rows with a valid score.
preds_true = model_regressor.predict(df_true_test[features])
accuracy_true = accuracy_score(df_true_test['winner'], preds_true > 0.5)

print("Accuracy when test set has a valid score:")
print(f"Accuracy: {accuracy_true:.4%}")
print()

# Pool both held-out sets, keeping labels and predictions in the same order.
y_test_combined = pd.concat([df_false_test['winner'], df_true_test['winner']])
preds_combined = np.concatenate([preds_false, preds_true])
accuracy_combined = accuracy_score(y_test_combined, preds_combined > 0.5)

print("Combined Accuracy across both test sets:")
print(f"Accuracy: {accuracy_combined:.4%}")


Regressor Trained on some valid score
Accuracy when test set has no valid score:
Accuracy: 70.2053%

Accuracy when test set has a valid score:
Accuracy: 79.9992%

Combined Accuracy across both test sets:
Accuracy: 76.7190%


In [21]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Keep only rows that are complete in every column from position 36 up to
# (but not including) the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Partition the cleaned rows by the `valid_score` flag.
valid_score_false_df = df.loc[df['valid_score'] == False]
valid_score_true_df = df.loc[df['valid_score'] == True]

# Stratified 80/20 split of each partition on `winner`, with a fixed seed.
df_false_train, df_false_test = train_test_split(
    valid_score_false_df,
    train_size=.8,
    random_state=42,
    stratify=valid_score_false_df['winner'],
)
df_true_train, df_true_test = train_test_split(
    valid_score_true_df,
    train_size=.8,
    random_state=42,
    stratify=valid_score_true_df['winner'],
)

# --- Rows without a valid score: classifier trained directly on `winner` ---
df_train = df_false_train
model_classifier = XGBClassifier()
model_classifier.fit(df_train[features], df_train['winner'])

# `predict` returns class labels here, so no thresholding is applied.
preds_false = model_classifier.predict(df_false_test[features])
accuracy_false = accuracy_score(df_false_test['winner'], preds_false)

# --- Rows with a valid score: regressor trained on `regression_score` ---
# `df_train` is rebound; from here on it refers to the valid-score training rows.
df_train = df_true_train
model_regressor = XGBRegressor()
model_regressor.fit(df_train[features], df_train['regression_score'])

# The regressor output is thresholded at 0.5, so `preds_true` is boolean.
preds_true = model_regressor.predict(df_true_test[features]) > .5
accuracy_true = accuracy_score(df_true_test['winner'], preds_true)

# Overall accuracy: each partition is scored by its own model, then the hard
# predictions (not probabilities) are stacked in the same order as the labels.
y_test = pd.concat([df_false_test['winner'], df_true_test['winner']])
preds = np.concatenate((preds_false, preds_true))
accuracy = accuracy_score(y_test, preds)

print(
    f"accuracy_false: {accuracy_false:.4%}",
    f"accuracy_true: {accuracy_true:.4%}",
    f"Accuracy: {accuracy:.4%}",
    sep="\n",
)

accuracy_false: 70.9072%
accuracy_true: 80.0981%
Accuracy: 77.0198%


In [22]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from scipy.special import expit  # For sigmoid function

# Keep only rows that are complete in every column from position 36 up to
# (but not including) the final column.
df = dataset_df.dropna(subset=dataset_df.columns[36:-1])

# Partition the cleaned rows by the `valid_score` flag.
valid_score_false_df = df.loc[df['valid_score'] == False]
valid_score_true_df = df.loc[df['valid_score'] == True]

# Stratified 80/20 split of each partition on `winner`, with a fixed seed.
df_false_train, df_false_test = train_test_split(
    valid_score_false_df,
    train_size=.8,
    random_state=42,
    stratify=valid_score_false_df['winner'],
)
df_true_train, df_true_test = train_test_split(
    valid_score_true_df,
    train_size=.8,
    random_state=42,
    stratify=valid_score_true_df['winner'],
)

# Train on the valid-score rows only, with `regression_score` as the target.
df_train = df_true_train

# NOTE: despite its name, `model_classifier` holds an XGBRegressor in this cell.
model_classifier = XGBRegressor()
model_classifier.fit(df_train[features], df_train['regression_score'])

# Raw regressor outputs for both held-out partitions; thresholded at 0.5 below.
# Only accuracy is reported here; no log-loss is computed.
preds_false = model_classifier.predict(df_false_test[features])
preds_true = model_classifier.predict(df_true_test[features])

# Held-out rows without a valid score.
accuracy = accuracy_score(df_false_test['winner'], preds_false > .5)
print(
    "Regressor Trained on some valid score",
    "Accuracy when test set has no valid score",
    f"Accuracy: {accuracy:.4%}",
    "",
    sep="\n",
)

# Held-out rows with a valid score.
accuracy = accuracy_score(df_true_test['winner'], preds_true > .5)
print(
    "Accuracy when test set has a valid score",
    f"Accuracy: {accuracy:.4%}",
    sep="\n",
)

# Both held-out partitions stacked together (false rows first). `preds` keeps
# the raw regressor outputs; the 0.5 threshold is applied only for scoring.
y_test = pd.concat([df_false_test['winner'], df_true_test['winner']])
preds = np.concatenate((preds_false, preds_true))
accuracy = accuracy_score(y_test, preds > .5)
print(
    "",
    "Accuracy across both:",
    f"Accuracy: {accuracy:.4%}",
    sep="\n",
)

Regressor Trained on some valid score
Accuracy when test set has no valid score
Accuracy: 70.4132%

Accuracy when test set has a valid score
Accuracy: 80.0981%

Accuracy across both:
Accuracy: 76.8544%


In [29]:
# Scale the gap between two hard-coded accuracy figures (0.770198 and 0.768399)
# by the number of entries in `preds`, expressing the gap as a row count.
# NOTE(review): `preds` is whatever the most recently executed cell left in the
# kernel, and the literals appear to be hand-copied from earlier runs - confirm.
(.770198 - .768399) * preds.shape[0]

645.9201559999982

In [None]:
import optuna
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: fit an XGBoost regressor and return its thresholded accuracy.

    Reads the notebook-level globals `X_train_r` / `y_train_r` (training data)
    and `X_test_c` / `y_test_c` (evaluation data); none of them are defined in
    this cell.

    Args:
        trial: The `optuna` trial used to sample one hyperparameter configuration.

    Returns:
        The accuracy obtained by thresholding the model's predictions on
        `X_test_c` at 0.5 and comparing them with `y_test_c`. The study is
        expected to maximize this value.
    """
    # Hyperparameter search space
    params = {
        "max_depth": trial.suggest_int("max_depth", 1, 13, step=2),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
    }

    # Train a regressor (not a classifier) with the sampled hyperparameters:
    # the `_r` training data is the regression set, and the continuous output
    # is what gets thresholded at 0.5 below.
    model = XGBRegressor(**params)
    model.fit(X_train_r, y_train_r)

    # Regressor predictions on the classification evaluation set.
    # NOTE(review): if `X_test_c` / `y_test_c` are the final held-out test split,
    # selecting hyperparameters on them leaks test information - consider a
    # separate validation split. Confirm against the cell that defines them.
    preds_r_test_c = model.predict(X_test_c)

    # Accuracy of the thresholded regressor output.
    accuracy_r_test_c = accuracy_score(y_test_c, preds_r_test_c > 0.5)

    return accuracy_r_test_c

# Create a study that maximizes the objective's return value (accuracy).
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable across kernel restarts.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run the optimization: stop after 50 trials or 600 seconds (10 minutes),
# whichever comes first.
study.optimize(objective, n_trials=50, timeout=600, show_progress_bar=True)

# Display the best hyperparameters and the corresponding accuracy
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", study.best_value)


[I 2024-11-23 19:23:27,024] A new study created in memory with name: no-name-4b161953-a507-4409-8f56-9fcbf0c2c5c9


  0%|          | 0/50 [00:00<?, ?it/s]

[W 2024-11-23 19:23:34,354] Trial 0 failed with parameters: {'max_depth': 13, 'learning_rate': 0.24981048744100315, 'n_estimators': 700, 'subsample': 0.7901309435654792, 'colsample_bytree': 0.6560449755412733, 'gamma': 0.3102013628173872, 'min_child_weight': 6, 'lambda': 0.6632431546429975, 'alpha': 2.8932031398538576} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_18446/4098588530.py", line 22, in objective
    model.fit(X_train_r, y_train_r)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1108, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/l

KeyboardInterrupt: 

In [None]:
# One horizontal-bar panel per importance metric, laid out side by side
# (1 row, 3 columns); each panel keeps its own y-axis.
fig, axes = plt.subplots(1, 3, figsize=(18, 12), sharey=False)

for ax, metric in zip(axes, ("Weight", "Gain", "Cover")):
    # Rank the features by this metric, largest first, and index by feature name
    # so the bars are labelled with the `Feature` values.
    ranked = (
        importance_df
        .sort_values(by=metric, ascending=False)
        .set_index('Feature')[metric]
    )
    ranked.plot(
        kind='barh',
        ax=ax,
        title=f"Feature Importance - {metric} (Descending)",
    )
    ax.set_xlabel(f"{metric} Score")
    ax.set_ylabel("Features")
    # Invert the y-axis so the descending sort reads top-to-bottom.
    ax.invert_yaxis()

# Adjust layout and spacing, then render the figure.
plt.tight_layout()
plt.show()
