## Data Processing

### Reading in Engineered Data

In [1]:
# imports for processing

import pandas as pd
import numpy as np
from collections import Counter

In [2]:
scores = pd.read_csv('data/hearts_anon_processed.csv').drop(columns = "Unnamed: 0") # removing automatically generated index
print(scores.shape)
scores.head()

(504, 17)


Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,game_winner,hand_position,swing_down,swing_up,no_swing,swings
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1,player_2,3.0,0,0,0,start
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1,player_1,player_2,2.0,0,0,1,same
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1,player_2,2.0,0,0,1,same
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1,player_2,2.0,0,0,1,same
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1,player_1,player_2,2.0,0,0,1,same


## Machine Learning

In [3]:
# imports for ML
 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PolynomialFeatures
from sklearn.metrics import recall_score, precision_score, classification_report, f1_score
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import (RandomizedSearchCV, cross_validate, train_test_split)
from sklearn.preprocessing import LabelEncoder
from scipy.stats import loguniform
from numpy.linalg import norm
import altair as alt
import eli5

### Info about processing/encoding
- For full descriptions of each column, see repo `README`


In [4]:
# Some initial processing
scores_processed = scores
scores_processed = scores_processed.rename(columns = {"game_winner": "target"})

# encoding categorical cols as ints instead of strings
scores_processed = scores_processed.replace(
    to_replace = {
        "None": 0, "none": 0,
        "player_1": 1, "player_2": 2, "player_3": 3, "player_4": 4,
        "start": 0, "down": 1, "up": 2, "same": 3})

scores_processed

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,target,hand_position,swing_down,swing_up,no_swing,swings
0,1,1,1,4,6,6.0,6.12,0,0,2,1,2,3.0,0,0,0,0
1,1,2,1,2,6,0.0,0.00,0,0,1,1,2,2.0,0,0,1,3
2,1,3,1,3,10,4.0,4.08,0,0,2,1,2,2.0,0,0,1,3
3,1,4,1,0,26,16.0,16.33,1,0,4,1,2,2.0,0,0,1,3
4,1,5,1,4,26,0.0,0.00,0,0,1,1,2,2.0,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,14,3,4,2,55,25.0,25.00,1,0,1,1,1,4.0,0,0,1,3
500,14,4,4,0,55,0.0,0.00,0,0,2,1,1,4.0,0,0,1,3
501,14,5,4,3,69,14.0,14.00,1,0,1,1,1,4.0,0,0,1,3
502,14,6,4,1,94,25.0,25.00,1,0,1,1,1,4.0,0,0,1,3


In [5]:
raw_col_vals = []
for col in scores.columns.tolist():
    raw_col_vals.append(len(scores[col].unique().tolist()))

proc_col_vals = []
for col in scores_processed.columns.tolist():
    proc_col_vals.append(len(scores_processed[col].unique().tolist()))

print(raw_col_vals)
print(proc_col_vals)

assert raw_col_vals == proc_col_vals
print("Encoding successfully did not practically change the values of the df")

[14, 12, 4, 5, 105, 29, 237, 2, 5, 4, 4, 4, 4, 2, 2, 2, 4]
[14, 12, 4, 5, 105, 29, 237, 2, 5, 4, 4, 4, 4, 2, 2, 2, 4]
Encoding successfully did not practically change the values of the df


In [6]:
### Splitting data into X/y train/test splits

train_df, test_df = train_test_split(scores_processed, test_size = 0.3, random_state = 123)
X_train, y_train = train_df.drop("target", axis = 1), train_df["target"]
X_test, y_test = test_df.drop("target", axis = 1), test_df["target"]

X_train

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,hand_position,swing_down,swing_up,no_swing,swings
52,2,2,3,4,25,24.0,36.36,1,0,1,1,4.0,0,1,0,2
229,6,10,3,4,90,5.0,5.56,0,0,1,1,3.0,0,0,1,3
246,7,3,1,3,69,26.0,24.07,0,4,4,2,4.0,0,0,1,3
164,5,1,1,4,0,0.0,0.00,0,0,1,1,1.0,0,0,0,0
327,9,5,3,2,58,0.0,0.00,0,0,3,1,3.0,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,3,7,3,2,94,19.0,18.63,1,0,2,4,4.0,0,0,1,3
476,14,1,1,4,0,0.0,0.00,0,1,1,1,1.0,0,0,0,0
322,9,5,4,3,50,5.0,10.00,0,0,3,1,2.0,0,0,1,3
382,11,5,2,1,27,0.0,0.00,0,0,4,4,2.0,0,0,1,3


In [7]:
train_df.describe()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,target,hand_position,swing_down,swing_up,no_swing,swings
count,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0
mean,7.232955,5.235795,2.5,2.002841,41.764205,8.434659,11.135455,0.235795,0.278409,2.380682,1.693182,2.383523,2.545455,0.139205,0.15625,0.585227,2.207386
std,4.080004,2.950411,1.117078,1.451001,28.484385,9.545288,13.18734,0.425099,0.871708,1.138536,1.087174,0.822103,1.090326,0.346653,0.363609,0.493384,1.077959
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,4.0,3.0,1.0,1.0,20.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0
50%,7.0,5.0,3.0,2.0,38.0,4.0,5.785,0.0,0.0,2.0,1.0,2.0,3.0,0.0,0.0,1.0,3.0
75%,11.0,7.0,3.0,3.0,61.25,16.0,19.7825,0.0,0.0,3.0,2.0,3.0,3.0,0.0,0.0,1.0,3.0
max,14.0,12.0,4.0,4.0,121.0,39.0,81.82,1.0,4.0,4.0,4.0,4.0,4.0,1.0,1.0,1.0,3.0


In [8]:
train_df["target"].value_counts(normalize = True) # True return proportions, False returns counts

2    0.553977
3    0.227273
4    0.125000
1    0.093750
Name: target, dtype: float64

In [9]:
corr_matrx = train_df.corr('spearman').style.background_gradient()
corr_matrx

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,target,hand_position,swing_down,swing_up,no_swing,swings
game_id,1.0,-0.078969,0.001918,0.058141,0.064713,0.120118,0.092304,-0.003402,0.185376,-0.03379,0.174464,-0.04457,0.059386,-0.031711,7.7e-05,0.017272,0.015969
hand_id,-0.078969,1.0,0.007108,-0.136378,0.767906,-0.013021,-0.02195,0.053131,-0.068456,0.086759,-0.100076,0.011191,0.025717,0.077251,0.007353,0.311657,0.403635
player,0.001918,0.007108,1.0,-0.197574,0.012069,-0.026327,-0.010276,-0.056534,-0.003004,-0.025456,-0.032918,-0.040444,0.051842,0.012097,0.052136,-0.046834,-0.035674
received_cards_from,0.058141,-0.136378,-0.197574,1.0,-0.118998,0.041286,0.022067,-0.020911,0.077059,-0.049616,0.030092,0.014522,-0.052204,-0.006926,-0.075976,-0.010804,-0.044389
total_score,0.064713,0.767906,0.012069,-0.118998,1.0,0.302338,0.23165,0.229174,0.098586,0.048678,-0.052111,-0.032487,0.508361,0.088091,0.093648,0.178585,0.273952
points_per_hand,0.120118,-0.013021,-0.026327,0.041286,0.302338,1.0,0.970774,0.574372,0.300515,0.020135,0.010715,-0.050053,0.418333,0.059034,0.040921,-0.067315,-0.057339
percent_points_per_hand,0.092304,-0.02195,-0.010276,0.022067,0.23165,0.970774,1.0,0.592515,0.242499,0.01329,0.039933,-0.033364,0.332376,0.036416,0.030227,-0.039688,-0.031138
queen_spades,-0.003402,0.053131,-0.056534,-0.020911,0.229174,0.574372,0.592515,1.0,-0.187175,-0.016182,-0.010511,-0.042593,0.327196,0.027957,0.000576,0.019372,0.02917
moon_shooter,0.185376,-0.068456,-0.003004,0.077059,0.098586,0.300515,0.242499,-0.187175,1.0,0.119297,0.088097,0.062992,0.039223,0.025504,0.059028,-0.131369,-0.13569
best_player_of_hand,-0.03379,0.086759,-0.025456,-0.049616,0.048678,0.020135,0.01329,-0.016182,0.119297,1.0,0.243566,0.200813,0.042836,-0.046581,0.079299,0.099387,0.146448


### Preprocessing

In [10]:
X_train.columns.tolist()

['game_id',
 'hand_id',
 'player',
 'received_cards_from',
 'total_score',
 'points_per_hand',
 'percent_points_per_hand',
 'queen_spades',
 'moon_shooter',
 'best_player_of_hand',
 'best_player_of_game',
 'hand_position',
 'swing_down',
 'swing_up',
 'no_swing',
 'swings']

In [11]:
categorical_features = ['player', 'received_cards_from', 'moon_shooter', 'best_player_of_hand', 'best_player_of_game']
binary_features = ['queen_spades', 'no_swing']
ordinal_features = ['hand_position', 'swing_down', 'swing_up', 'swings']

# list of lists -- one ordering (list) for each col
ordinal_feats_order = [
    [1, 2, 3, 4],
    [0, 1],
    [1, 0],
    [2, 3, 0, 1]
    ]

numeric_features = ['total_score', 'points_per_hand', 'percent_points_per_hand']
passthrough_features = ['game_id', 'hand_id'] # not sure if I want to drop these two, or leave them unaffected by preprocessing
drop_features = ['game_id', 'hand_id']

feat_set = set(categorical_features + binary_features + ordinal_features + numeric_features + drop_features + passthrough_features) # set so there are no duplicates

# lazy way of checking to make sure I put each feature into a preprocessor list. Dimensionality is low, so not concerned about computational cost
for col in X_train.columns.tolist():
    if col not in feat_set:
        print (col)

preprocessor = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    (OneHotEncoder(drop = 'if_binary'), binary_features),
    (OrdinalEncoder(categories = ordinal_feats_order), ordinal_features),
    (StandardScaler(), numeric_features),
    # ("passthrough", passthrough_features)#,
    ("drop", drop_features)
)

# Encoding y_train to be 0:x instead of string values
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_train

array([1, 1, 2, 3, 0, 1, 3, 3, 1, 1, 2, 1, 1, 0, 2, 1, 1, 1, 1, 2, 1, 2,
       3, 0, 3, 1, 1, 1, 1, 2, 3, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2,
       2, 1, 1, 1, 0, 0, 2, 2, 2, 1, 3, 1, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1,
       3, 0, 0, 1, 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 1, 1, 1, 3, 2, 1, 2, 0,
       2, 1, 0, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 3, 1, 0, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 3, 2, 1, 3, 1, 1, 2, 1, 1, 3, 0, 1, 3, 2, 2, 1, 1,
       1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 3, 0,
       2, 2, 3, 1, 1, 3, 1, 1, 3, 2, 3, 1, 0, 3, 2, 1, 1, 2, 1, 1, 3, 1,
       1, 1, 2, 0, 2, 1, 2, 3, 2, 2, 2, 2, 2, 0, 1, 2, 1, 1, 2, 1, 1, 1,
       1, 1, 2, 1, 2, 0, 1, 3, 2, 1, 1, 0, 2, 0, 1, 1, 0, 2, 1, 1, 0, 2,
       1, 3, 3, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2,
       1, 1, 2, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 0,
       1, 3, 1, 3, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 0, 1, 2, 1, 0, 2, 2, 1,
       1, 1, 1, 0, 2, 1, 1, 0, 0, 1, 3, 1, 1, 2, 1,

### Training Baseline Model

In [12]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data = out_col, index = mean_scores.index)

In [26]:
# Define dictionary to store results
cross_val_results = {}

# Establish baseline by scoring training set on dummy classifier
dc = DummyClassifier(random_state = 123)
cross_val_results["Dummy"] = mean_std_cross_val_scores(
    dc, X_train, y_train, return_train_score = True, scoring = "accuracy", n_jobs = -1)

# Display results
pd.DataFrame(cross_val_results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.000 (+/- 0.000),0.000 (+/- 0.000),0.554 (+/- 0.004),0.554 (+/- 0.001)


In [27]:
# Establish parameter grid for optimising hyperparameters

lr_params = {
    'logisticregression__C': loguniform(1e-3, 1e3),
    'logisticregression__class_weight': [None, "balanced"]
}

# Perform cross-validation on logistic regression model with default hyperparameters
pipe_lr = make_pipeline(preprocessor, LogisticRegression(random_state = 123, max_iter = 10000))
cross_val_results["Logistic Regression"] = mean_std_cross_val_scores(pipe_lr, X_train, y_train, return_train_score = True,
    scoring = "accuracy", n_jobs = -1)#, error_score = "raise")

# Perform hyperparameter tuning to optimise recall
random_search_lr = RandomizedSearchCV(pipe_lr, lr_params, n_iter = 20, n_jobs = -1, scoring = 'accuracy', random_state = 123)

# Perform cross-validation on optimised logistic regression model
cross_val_results["Tuned Logistic Regression"] = mean_std_cross_val_scores(pipe_lr, X_train, y_train, return_train_score = True,
    scoring = "accuracy", n_jobs = -1)

# Display results
pd.DataFrame(cross_val_results).T

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 690, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 621, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/joblib/parallel.py", line 1046, in __call__
    while self.dispatch_one_batch(iterator):
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/joblib/_parallel_backends.py", line 572, in __init__
    self.results = batch()
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/joblib/parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/joblib/parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/utils/fixes.py", line 117, in __call__
    return self.function(*args, **kwargs)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/base.py", line 870, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py", line 1294, in fit
    self._fit(X, handle_unknown=self.handle_unknown, force_all_finite="allow-nan")
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py", line 117, in _fit
    raise ValueError(error_msg)
ValueError: Unsorted categories are not supported for numerical categories


In [None]:
random_search_lr.fit(X_train, y_train)

In [None]:
# Optimised hyperparameter values
lg_C = random_search_lr.best_params_["logisticregression__C"]
print("Logistic Regression C:", lg_C)
print("Logistic Regression Alpha:", 1/lg_C)
print("Class Weight:", random_search_lr.best_params_["logisticregression__class_weight"])

Logistic Regression C: 0.22527090779355338
Logistic Regression Alpha: 4.439099614746691
Class Weight: None


### Pipelines for Other Models

In [None]:
pipe_svc = make_pipeline(preprocessor, SVC(random_state = 123))
pipe_rf = make_pipeline(preprocessor, RandomForestClassifier(random_state = 123))
pipe_lgbm = make_pipeline(preprocessor, LGBMClassifier(random_state = 123))
pipe_xgb = make_pipeline(preprocessor, XGBClassifier(random_state = 123))

classifiers = {
    "Logistic Regression": pipe_lr,
    "LightGBM": pipe_lgbm,
    "XGBoost": pipe_xgb
}

models = {
    "SVC": pipe_svc,
    "Random Forest": pipe_rf,
    "Stacking Model": StackingClassifier(list(classifiers.items()))
}

# Perform crossvalidation on each model
for model_name, model in models.items():
    cross_val_results[model_name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score = True,
        scoring = "accuracy", n_jobs = -1
        )

In [None]:
pd.DataFrame(cross_val_results).T.sort_values(by = "test_score", ascending = False)

Unnamed: 0,fit_time,score_time,test_score,train_score
Stacking Model,2.062 (+/- 0.014),0.026 (+/- 0.002),0.656 (+/- 0.051),0.884 (+/- 0.053)
Logistic Regression,0.034 (+/- 0.005),0.007 (+/- 0.001),0.628 (+/- 0.040),0.713 (+/- 0.012)
Tuned Logistic Regression,0.031 (+/- 0.003),0.006 (+/- 0.001),0.628 (+/- 0.040),0.713 (+/- 0.012)
SVC,0.047 (+/- 0.008),0.023 (+/- 0.005),0.622 (+/- 0.042),0.687 (+/- 0.011)
Random Forest,0.231 (+/- 0.013),0.032 (+/- 0.003),0.617 (+/- 0.061),0.995 (+/- 0.002)
Dummy,0.001 (+/- 0.000),0.001 (+/- 0.000),0.554 (+/- 0.004),0.554 (+/- 0.001)


### Hyperparameter Optimization

In [None]:
# Param grids
params = [
    {
        "svc__class_weight": [None, "balanced"],
        "svc__gamma": loguniform(1e-3, 1e3),
        "svc__C": loguniform(1e-3, 1e3)
    },
    {
        "logisticregression__class_weight": [None, "balanced"],
        "logisticregression__C": loguniform(1e-3, 1e3),
    },
    {
        "xgbclassifier__gamma": loguniform(1e-3, 1e3)
    },
    {
        "lgbmclassifier__class_weight": [None, "balanced"],
        "lgbmclassifier__max_depth": np.arange(10, 100, 1)
    },
    {
        "randomforestclassifier__max_features": ["sqrt", "log2", None],
        "randomforestclassifier__max_depth": np.arange(10, 50),
        "randomforestclassifier__class_weight": ["Balanced", None]
    }
]

classifiers_tuning = {
    "SVC": pipe_svc,
    "Logistic Regression": pipe_lr,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "Random Forest": pipe_rf
}

optim_models = {}

# Perform hyperparameter tuning on each model and display optimal hyperparameter values
for i, model_name in enumerate(classifiers_tuning):
    print(model_name)
    param_grid = params[i]
    model = classifiers_tuning[model_name]
    random_search = RandomizedSearchCV(
        model, param_grid, n_iter = 10, n_jobs = -1, random_state = 123,
        scoring = "accuracy", return_train_score = True
    )
    random_search.fit(X_train, y_train)
    optim_models[model_name] = random_search.best_estimator_
    print(random_search.best_params_)

SVC
{'svc__C': 3.0072240235870313, 'svc__class_weight': 'balanced', 'svc__gamma': 0.22527090779355338}
Logistic Regression
{'logisticregression__C': 0.22527090779355338, 'logisticregression__class_weight': None}
XGBoost
{'xgbclassifier__gamma': 0.7684071705306554}
LightGBM
{'lgbmclassifier__max_depth': 72, 'lgbmclassifier__class_weight': None}
Random Forest
{'randomforestclassifier__max_features': None, 'randomforestclassifier__max_depth': 38, 'randomforestclassifier__class_weight': None}


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 371, in fit
    y, expanded_class_weight = self._validate_y_class_weight(y)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packa

In [None]:
### Cross-validation on each tuned model
tuned_classifiers = {
    "Logistic Regression": optim_models["Logistic Regression"],
    "LightGBM": optim_models["LightGBM"],
    "XGBoost": optim_models["XGBoost"]
}

tuned_models = {
    "Tuned SVC": optim_models["SVC"],
    "Tuned Random Forest": optim_models["Random Forest"],
    "Tuned Stacking Model": StackingClassifier(list(tuned_classifiers.items()))
}

for model_name, model in tuned_models.items():
    cross_val_results[model_name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score = True,
        scoring = "accuracy", n_jobs = -1
        )

In [None]:
pd.DataFrame(cross_val_results).T.sort_values(by = "test_score", ascending = False)

Unnamed: 0,fit_time,score_time,test_score,train_score
Tuned Stacking Model,2.226 (+/- 0.016),0.026 (+/- 0.002),0.659 (+/- 0.047),0.854 (+/- 0.042)
Stacking Model,2.062 (+/- 0.014),0.026 (+/- 0.002),0.656 (+/- 0.051),0.884 (+/- 0.053)
Tuned Random Forest,0.214 (+/- 0.009),0.020 (+/- 0.002),0.631 (+/- 0.067),0.995 (+/- 0.002)
Logistic Regression,0.034 (+/- 0.005),0.007 (+/- 0.001),0.628 (+/- 0.040),0.713 (+/- 0.012)
Tuned Logistic Regression,0.031 (+/- 0.003),0.006 (+/- 0.001),0.628 (+/- 0.040),0.713 (+/- 0.012)
SVC,0.047 (+/- 0.008),0.023 (+/- 0.005),0.622 (+/- 0.042),0.687 (+/- 0.011)
Random Forest,0.231 (+/- 0.013),0.032 (+/- 0.003),0.617 (+/- 0.061),0.995 (+/- 0.002)
Tuned SVC,0.019 (+/- 0.001),0.010 (+/- 0.001),0.568 (+/- 0.048),0.950 (+/- 0.012)
Dummy,0.001 (+/- 0.000),0.001 (+/- 0.000),0.554 (+/- 0.004),0.554 (+/- 0.001)


In [None]:
# # import lazypredict

# from lazypredict.Supervised import LazyClassifier
# # from sklearn.datasets import load_breast_cancer
# # from sklearn.model_selection import train_test_split

# # data = load_breast_cancer()
# # X = data.data
# # y= data.target

# # X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)

# clf = LazyClassifier(verbose = 0, ignore_warnings = True, custom_metric = None)
# models,predictions = clf.fit(X_train, X_test, y_train, y_test)

# # print(models)
# models