## Data Processing

### Reading in Data

In [108]:
# imports for processing

import pandas as pd
import numpy as np
from collections import Counter

In [109]:
scores = pd.read_csv('hearts_scores_anon.csv')
scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score
0,1,1,player_1,player_4,6
1,1,2,player_1,player_2,6
2,1,3,player_1,player_3,10
3,1,4,player_1,none,26
4,1,5,player_1,player_4,26


In [110]:
### Dictionary of the number of hands in each game

num_games = scores['game_id'].unique() # returns list of unique values of game_id column, which is the number of games played
player_list = scores.sort_values(by = "player", ascending = True)['player'].unique() # ordered list of player names
games_and_hands_dict = {} # dict with keys as game_id and values as total number of hands in each game

for game in num_games:

    hands_per_game = scores.query(f'game_id == {game}')['hand_id'].unique() # returns an array of ints from 1:max number of hands
    games_and_hands_dict[f'{game}'] = len(hands_per_game)

print (scores['received_cards_from'].unique()) # making sure there are only 4 players + "none" (and no spelling errors)
print (player_list) # making sure there are only 4 players (and no spelling errors)

games_and_hands_dict

['player_4' 'player_2' 'player_3' 'none' 'player_1']
['player_1' 'player_2' 'player_3' 'player_4']


{'1': 10,
 '2': 11,
 '3': 8,
 '4': 12,
 '5': 8,
 '6': 12,
 '7': 7,
 '8': 9,
 '9': 5,
 '10': 8,
 '11': 9,
 '12': 8,
 '13': 12,
 '14': 7}

### Computing 'points_per_hand' column
- Done

In [111]:
### points_per_hand -- DONE

for game in num_games:
    game_df = scores.query(f'game_id == {game}') # subsets original df per game_id, into game_df

    for player_name in player_list:
        player_df = game_df.query(f'player == "{player_name}"').sort_values(by = "hand_id", ascending = True) # subsets game_df per player, ascending by hand
        player_indices = player_df.index.tolist() # indices of player df

        for index in player_indices:
            if index == player_indices[0]:
                scores.loc[index, 'points_per_hand'] = scores.loc[index, 'total_score']
            else:
                scores.loc[index, 'points_per_hand'] = scores.loc[index, 'total_score'] - scores.loc[(index - 1), 'total_score']

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand
0,1,1,player_1,player_4,6,6.0
1,1,2,player_1,player_2,6,0.0
2,1,3,player_1,player_3,10,4.0
3,1,4,player_1,none,26,16.0
4,1,5,player_1,player_4,26,0.0


### Computing 'percent_points_per_hand' column
- Done

In [112]:
### percent_points_per_hand -- DONE

for game in num_games:

    game_df = scores.query(f'game_id == {game}') # subsets original df per game_id, into game_df

    max_score_dict = {} # dictionary where keys = player names, values = each player's final score
    for player_name in player_list:
        game_per_player = game_df.query(f'player == "{player_name}"')
        game_per_player_indices = game_per_player.index.tolist()
        max_score_dict[f'{player_name}'] = game_per_player['total_score'].max()

        # print (game_per_player_indices)

        for index in game_per_player_indices:
            scores.loc[index, 'percent_points_per_hand'] = round((scores.loc[index, 'points_per_hand'] / max_score_dict[f'{player_name}'] * 100), 2)

    # print (max_score_dict)

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand
0,1,1,player_1,player_4,6,6.0,6.12
1,1,2,player_1,player_2,6,0.0,0.0
2,1,3,player_1,player_3,10,4.0,4.08
3,1,4,player_1,none,26,16.0,16.33
4,1,5,player_1,player_4,26,0.0,0.0


### Computing 'queen_spades' column
- Done

In [113]:
### queen_spades -- DONE

queen_spades_list = []
for value in scores['points_per_hand']:
    if 13 <= value <= 25: # not a perfect solution but should get most of the cases accurate
        queen_spades_list.append(1)
    else:
        queen_spades_list.append(0)

queen_spades_list  
scores['queen_spades'] = queen_spades_list

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades
0,1,1,player_1,player_4,6,6.0,6.12,0
1,1,2,player_1,player_2,6,0.0,0.0,0
2,1,3,player_1,player_3,10,4.0,4.08,0
3,1,4,player_1,none,26,16.0,16.33,1
4,1,5,player_1,player_4,26,0.0,0.0,0


### Computing 'moon_shooter' column
- Done

In [114]:
### moon_shooter -- DONE

scores['moon_shooter'] = "none" # sets all 'moon_shooter' values to "none" by default, since this is quicker than setting an if condition

for game in num_games:

    game_df = scores.query(f'game_id == {game}') # subsets original df per game_id, into game_df
    hands_per_game = game_df.query(f'game_id == {game}')['hand_id'].unique() # gets list of number of hands per game, 1 : x
    
    for hand in hands_per_game:

        hand_df = game_df.query(f'hand_id == {hand}') # subsets game_df per hand of game, into hand_df. Should be one row per player

        if hand_df['points_per_hand'].sum() == 78: # 78 = 26*3, which only happens when one player shoots the moon

            shooter_index = hand_df.query('points_per_hand == 0').index # returns index of player who shot the moon in that particular hand (1 int value)
            shooter_name = scores.loc[shooter_index]['player'].tolist()[0] # returns string of name player who shot the moon in that particular hand

            hand_index_list = hand_df.index.tolist() # gets list of indices of that hand
            
            scores.loc[hand_index_list, 'moon_shooter'] = shooter_name # changes 'moon_shooter' value to name of player who shot the moon at respective indices from index_list

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter
0,1,1,player_1,player_4,6,6.0,6.12,0,none
1,1,2,player_1,player_2,6,0.0,0.0,0,none
2,1,3,player_1,player_3,10,4.0,4.08,0,none
3,1,4,player_1,none,26,16.0,16.33,1,none
4,1,5,player_1,player_4,26,0.0,0.0,0,none


### Computing 'best_player_of_hand' column
- Done

In [115]:
### best_player_of_hand -- DONE

for game in num_games:

    game_df = scores.query(f'game_id == {game}') # subsets original df per game_id, into game_df
    hands_per_game = game_df.query(f'game_id == {game}')['hand_id'].unique() # gets list of number of hands per game, 1 : x
    
    for hand in hands_per_game:

        hand_df = game_df.query(f'hand_id == {hand}') # subsets game_df per hand of game, into hand_df. Should be one row per player
        
        best_score_of_hand = hand_df['points_per_hand'].min() # best score of that hand
        best_player_of_hand = hand_df.query(f'points_per_hand == {best_score_of_hand}').iloc[0]['player'] # name of best player of that hand
        
        hand_index_list = hand_df.index.tolist() # list of indices of that hand
        scores.loc[hand_index_list, 'best_player_of_hand'] = best_player_of_hand # assigning name of best player to all col vals for that hand

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1


### Computing 'best_player_of_game' column
- To be done

In [116]:
### best_player_of_game -- DONE

for game in num_games:
    game_df = scores.query(f'game_id == {game}') # subsets original df per game_id, into game_df
    best_player_of_game = game_df['best_player_of_hand'].mode().tolist()[0] # most common val (player name) in this col
    game_indices = game_df.index.tolist()
    
    scores.loc[game_indices, 'best_player_of_game'] = best_player_of_game # change vals of scores df based on this

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1,player_1
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1,player_1


### Computing 'game_winner' column
- Done

In [117]:
### game_winner -- DONE

for game in num_games:

    game_df = scores.query(f'game_id == {game}') # subsets original df per game_id, into game_df
    game_indices = game_df.index.tolist()
    
    last_hand_value = len(game_df.query(f'game_id == {game}')['hand_id'].unique()) # gets last hand of each game
    last_hand_df = game_df.query(f'hand_id == {last_hand_value}') # df of just the last hand
    game_winner_name = last_hand_df.sort_values(by = "total_score", ascending = True)["player"].tolist()[0]

    scores.loc[game_indices, 'game_winner'] = game_winner_name # assigning name of game winner to game_winner col vals for that game

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,game_winner
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1,player_2
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1,player_1,player_2
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1,player_2
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1,player_2
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1,player_1,player_2


### Final scores df

In [118]:
# scores.head(60) # can only show first 60 rows in output
# scores.head()
scores

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,game_winner
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1,player_2
1,1,2,player_1,player_2,6,0.0,0.00,0,none,player_1,player_1,player_2
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1,player_2
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1,player_2
4,1,5,player_1,player_4,26,0.0,0.00,0,none,player_1,player_1,player_2
...,...,...,...,...,...,...,...,...,...,...,...,...
499,14,3,player_4,player_2,55,25.0,25.00,1,none,player_1,player_1,player_1
500,14,4,player_4,none,55,0.0,0.00,0,none,player_2,player_1,player_1
501,14,5,player_4,player_3,69,14.0,14.00,1,none,player_1,player_1,player_1
502,14,6,player_4,player_1,94,25.0,25.00,1,none,player_1,player_1,player_1


### Summary Stats About scores df

In [119]:
# initialize blank df with same columns as "scores"
# this df consists of only the final hands of each game, so that only final scores are considered in these statistics
all_last_hands = pd.DataFrame(columns = scores.columns.tolist())

last_hands_list = []
for game, last_hand in games_and_hands_dict.items():
    last_hand_df = scores.query(f'game_id == {game} & hand_id == {last_hand}')
    last_hands_list.append(last_hand_df)
    all_last_hands = pd.concat(last_hands_list)

stats_dict = {} # dict of summary stats of each player in last hand 
player_score_mean = [] # dict of player: avg total score at end of game
player_score_std = [] # dict of player: std total score at end of game
player_score_max = [] # dict of player: max total score at end of game
player_score_min = [] # dict of player: min total score at end of game
player_mean_ppg = [] # dict of player: avg percentage of points per hand

for player_name in player_list:
    player_mean_ppg.append(round(all_last_hands.query(f'player == "{player_name}"')['percent_points_per_hand'].mean(), 2))
    player_score_std.append(round(all_last_hands.query(f'player == "{player_name}"')['total_score'].std(), 2))
    player_score_max.append(round(all_last_hands.query(f'player == "{player_name}"')['total_score'].max(), 2))
    player_score_min.append(round(all_last_hands.query(f'player == "{player_name}"')['total_score'].min(), 2))
    player_score_mean.append(round(all_last_hands.query(f'player == "{player_name}"')['total_score'].mean(), 2))

# stats_dict['Player'] = player_list
stats_dict['Player'] = ['Kai', 'Dad', 'Aidan', 'Diane']
stats_dict['Highest Final Score'] = player_score_max
stats_dict['Lowest Final Score'] = player_score_min
stats_dict['Average Final Score'] = player_score_mean
stats_dict['Final Score Standard Deviation'] = player_score_std
stats_dict['Average Percentage of Points Per Hand'] = player_mean_ppg

stats_df = pd.DataFrame(stats_dict)
stats_df
# stats_df.sort_values(by = 'Highest Final Score', ascending = False)

Unnamed: 0,Player,Highest Final Score,Lowest Final Score,Average Final Score,Final Score Standard Deviation,Average Percentage of Points Per Hand
0,Kai,121,22,79.71,31.08,18.86
1,Dad,117,31,67.57,26.77,12.05
2,Aidan,109,36,68.64,23.74,9.51
3,Diane,121,15,76.71,28.45,10.36


### Average Points When Passed From x

In [120]:
passed_points_dict = {}
points_from_kai = []
points_from_dad = []
points_from_aidan = []
points_from_diane = []

for player in player_list:
    points_from_kai.append(scores.query(f'player == "player_1" & received_cards_from == "{player}"')['points_per_hand'].mean())
    points_from_dad.append(scores.query(f'player == "player_2" & received_cards_from == "{player}"')['points_per_hand'].mean())
    points_from_aidan.append(scores.query(f'player == "player_3" & received_cards_from == "{player}"')['points_per_hand'].mean())
    points_from_diane.append(scores.query(f'player == "player_4" & received_cards_from == "{player}"')['points_per_hand'].mean())

passed_points_dict['Player'] = ['Kai', 'Dad', 'Aidan', 'Diane']
passed_points_dict['Avg Points from Kai'] = points_from_kai
passed_points_dict['Avg Points from Dad'] = points_from_dad
passed_points_dict['Avg Points from Aidan'] = points_from_aidan
passed_points_dict['Avg Points from Diane'] = points_from_diane

passed_points_df = pd.DataFrame(passed_points_dict).fillna(0)
passed_points_df.round(2)

Unnamed: 0,Player,Avg Points from Kai,Avg Points from Dad,Avg Points from Aidan,Avg Points from Diane
0,Kai,0.0,8.77,6.81,9.12
1,Dad,6.75,0.0,7.74,10.23
2,Aidan,10.81,7.22,0.0,8.69
3,Diane,6.29,8.03,8.19,0.0


In [121]:
# player_1_stats = all_last_hands.query('player == "player_1"').describe().reset_index().rename(columns = {"index": "player_1"})
# player_1_stats.drop(columns = ['game_id', 'hand_id'])

In [122]:
# player_2_stats = all_last_hands.query('player == "player_2"').describe().reset_index().rename(columns = {"index": "player_2"})
# player_2_stats.drop(columns = ['game_id', 'hand_id'])

In [123]:
# player_3_stats = all_last_hands.query('player == "player_3"').describe().reset_index().rename(columns = {"index": "player_3"})
# player_3_stats.drop(columns = ['game_id', 'hand_id'])

In [124]:
# player_4_stats = all_last_hands.query('player == "player_1"').describe().reset_index().rename(columns = {"index": "player_4"})
# player_4_stats.drop(columns = ['game_id', 'hand_id'])

## Machine Learning

In [125]:
# imports for ML
 
import sklearn # for tests
from sklearn.preprocessing import (StandardScaler, OneHotEncoder, OrdinalEncoder, PolynomialFeatures)
from sklearn.metrics import recall_score, precision_score, classification_report
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import (RandomizedSearchCV, cross_validate, train_test_split)
from sklearn.metrics import f1_score
from scipy.stats import loguniform
from numpy.linalg import norm
import altair as alt
import eli5

### Info about processing/encoding
Columns:
- `game_id`: unique identifier of game number. Not related to time/a specific order
- `hand_id`: unique identifier of each hand within each game. Order DOES matter
- `player`: indicates who that row data pertains to
- `received_cards_from`: who the "player" received 3 cards from in that hand. For "no passing" hand, "none"
- `total_score`: player's total score throughout the game
- `points_per_hand`: how many points player got that hand
- `percent_points_per_hand`: percentage of points gotten that hand (compared to player's total points of that game)
- `queen_spades`: boolean (0 = no, 1 = yes) of whether player got the queen of spades that hand or not
- `moon_shooter`: name of player who shot the moon that hand, if applicable, otherwise "none"
- `best_player_of_hand`: player who got the least points in that hand/did the "best" that hand
- `best_player_of_game`: which player was `best_player_of_hand` the most frequently in each game (this is different than `game_winner` player)
- `game_winner`: person who won the game -- target


In [126]:
# renaming 'game_winner' column to 'target'

# Some initial processing
scores_processed = scores
# scores_processed = scores_processed.drop("game_id", axis = 1)
scores_processed = scores_processed.rename(columns = {"game_winner": "target"})

# encoding categorical cols -- not sure if this is necessary?
scores_processed = scores_processed.replace(to_replace = {"None": 0, "none": 0, "player_1": 1, "player_2": 2, "player_3": 3, "player_4": 4})
scores_processed.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,target
0,1,1,1,4,6,6.0,6.12,0,0,2,1,2
1,1,2,1,2,6,0.0,0.0,0,0,1,1,2
2,1,3,1,3,10,4.0,4.08,0,0,2,1,2
3,1,4,1,0,26,16.0,16.33,1,0,4,1,2
4,1,5,1,4,26,0.0,0.0,0,0,1,1,2


In [127]:
### Splitting data into X/y train/test splits

train_df, test_df = train_test_split(scores_processed, test_size = 0.3, random_state = 123)
X_train, y_train = train_df.drop("target", axis = 1), train_df["target"]
X_test, y_test = test_df.drop("target", axis = 1), test_df["target"]

X_train

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game
52,2,2,3,4,25,24.0,36.36,1,0,1,1
229,6,10,3,4,90,5.0,5.56,0,0,1,1
246,7,3,1,3,69,26.0,24.07,0,4,4,2
164,5,1,1,4,0,0.0,0.00,0,0,1,1
327,9,5,3,2,58,0.0,0.00,0,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...
98,3,7,3,2,94,19.0,18.63,1,0,2,4
476,14,1,1,4,0,0.0,0.00,0,1,1,1
322,9,5,4,3,50,5.0,10.00,0,0,3,1
382,11,5,2,1,27,0.0,0.00,0,0,4,4


In [128]:
train_df.describe()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,target
count,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0,352.0
mean,7.232955,5.235795,2.5,2.002841,41.764205,8.434659,11.135455,0.235795,0.278409,2.380682,1.693182,2.383523
std,4.080004,2.950411,1.117078,1.451001,28.484385,9.545288,13.18734,0.425099,0.871708,1.138536,1.087174,0.822103
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,4.0,3.0,1.0,1.0,20.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0
50%,7.0,5.0,3.0,2.0,38.0,4.0,5.785,0.0,0.0,2.0,1.0,2.0
75%,11.0,7.0,3.0,3.0,61.25,16.0,19.7825,0.0,0.0,3.0,2.0,3.0
max,14.0,12.0,4.0,4.0,121.0,39.0,81.82,1.0,4.0,4.0,4.0,4.0


In [138]:
train_df["target"].value_counts(normalize = True) # True return proportions, False returns counts

2    0.553977
3    0.227273
4    0.125000
1    0.093750
Name: target, dtype: float64

In [139]:
corr_matrx = train_df.corr('spearman').style.background_gradient()
corr_matrx

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,target
game_id,1.0,-0.078969,0.001918,0.058141,0.064713,0.120118,0.092304,-0.003402,0.185376,-0.03379,0.174464,-0.04457
hand_id,-0.078969,1.0,0.007108,-0.136378,0.767906,-0.013021,-0.02195,0.053131,-0.068456,0.086759,-0.100076,0.011191
player,0.001918,0.007108,1.0,-0.197574,0.012069,-0.026327,-0.010276,-0.056534,-0.003004,-0.025456,-0.032918,-0.040444
received_cards_from,0.058141,-0.136378,-0.197574,1.0,-0.118998,0.041286,0.022067,-0.020911,0.077059,-0.049616,0.030092,0.014522
total_score,0.064713,0.767906,0.012069,-0.118998,1.0,0.302338,0.23165,0.229174,0.098586,0.048678,-0.052111,-0.032487
points_per_hand,0.120118,-0.013021,-0.026327,0.041286,0.302338,1.0,0.970774,0.574372,0.300515,0.020135,0.010715,-0.050053
percent_points_per_hand,0.092304,-0.02195,-0.010276,0.022067,0.23165,0.970774,1.0,0.592515,0.242499,0.01329,0.039933,-0.033364
queen_spades,-0.003402,0.053131,-0.056534,-0.020911,0.229174,0.574372,0.592515,1.0,-0.187175,-0.016182,-0.010511,-0.042593
moon_shooter,0.185376,-0.068456,-0.003004,0.077059,0.098586,0.300515,0.242499,-0.187175,1.0,0.119297,0.088097,0.062992
best_player_of_hand,-0.03379,0.086759,-0.025456,-0.049616,0.048678,0.020135,0.01329,-0.016182,0.119297,1.0,0.243566,0.200813


### Preprocessing

In [147]:
scores_processed.columns.tolist()

['game_id',
 'hand_id',
 'player',
 'received_cards_from',
 'total_score',
 'points_per_hand',
 'percent_points_per_hand',
 'queen_spades',
 'moon_shooter',
 'best_player_of_hand',
 'best_player_of_game',
 'target']

In [148]:
categorical_features = ['player', 'received_cards_from', 'moon_shooter', 'best_player_of_hand', 'best_player_of_game']
binary_features = ['queen_spades']
numeric_features = ['total_score', 'points_per_hand', 'percent_points_per_hand', ]
passthrough_features = ['game_id', 'hand_id']

preprocessor = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    (OneHotEncoder(drop = 'if_binary'), binary_features),
    (StandardScaler(), numeric_features),
    ("passthrough", passthrough_features)
)

### Training Baseline Model

In [149]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data = out_col, index = mean_scores.index)

In [150]:
# Define dictionary to store results
cross_val_results = {}

# Establish scoring metrics
# classification_metrics = ["accuracy", "precision", "recall", "f1"] # only able to use accuracy because target is multi-class
# should be using f1 be can't evaluate this for f1 yet because predictions haven't been made

# Establish baseline by scoring training set on dummy classifier
dc = DummyClassifier(random_state = 123)
cross_val_results["Dummy"] = mean_std_cross_val_scores(
    dc, X_train, y_train, return_train_score = True, scoring = "accuracy", n_jobs = -1)

# Display results
pd.DataFrame(cross_val_results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.001 (+/- 0.000),0.001 (+/- 0.000),0.554 (+/- 0.004),0.554 (+/- 0.001)


In [151]:
# Establish parameter grid for optimising of hyperparameters
lr_params = {
    'logisticregression__C': loguniform(1e-3, 1e3),
    'logisticregression__class_weight': [None, "balanced"]
}

# Perform cross-validation on logistic regression model with default hyperparameters
pipe_lr = make_pipeline(preprocessor, LogisticRegression(random_state = 123, max_iter = 1000))
cross_val_results["Logistic Regression"] = mean_std_cross_val_scores(pipe_lr, X_train, y_train, return_train_score = True,
    scoring = "accuracy", n_jobs = -1)

# Perform hyperparameter tuning to optimise recall
random_search_lr = RandomizedSearchCV(pipe_lr, lr_params, n_iter = 20, n_jobs = -1, scoring = 'accuracy', random_state = 123)

# Perform cross-validation on optimised logistic regression model
cross_val_results["Tuned Logistic Regression"] = mean_std_cross_val_scores(pipe_lr, X_train, y_train, return_train_score = True,
    scoring = "accuracy", n_jobs = -1)

# Display results
pd.DataFrame(cross_val_results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.001 (+/- 0.000),0.001 (+/- 0.000),0.554 (+/- 0.004),0.554 (+/- 0.001)
Logistic Regression,0.069 (+/- 0.006),0.006 (+/- 0.001),0.733 (+/- 0.031),0.829 (+/- 0.013)
Tuned Logistic Regression,0.095 (+/- 0.005),0.007 (+/- 0.001),0.733 (+/- 0.031),0.829 (+/- 0.013)


In [152]:
random_search_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [153]:
# Print optimised hyperparameter values
lg_C = random_search_lr.best_params_["logisticregression__C"]
print("Logistic Regression C:", lg_C)
print("Logistic Regression Alpha:", 1/lg_C)
print("Class Weight:", random_search_lr.best_params_["logisticregression__class_weight"])

Logistic Regression C: 766.6289057556017
Logistic Regression Alpha: 0.001304412072767311
Class Weight: None


### Pipelines for Other Models

In [154]:
pipe_svc = make_pipeline(preprocessor, SVC(random_state = 123))
pipe_rf = make_pipeline(preprocessor, RandomForestClassifier(random_state = 123))
pipe_lgbm = make_pipeline(preprocessor, LGBMClassifier(random_state = 123))
pipe_xgb = make_pipeline(preprocessor, XGBClassifier(random_state = 123))

classifiers = {
    "Logistic Regression": pipe_lr,
    "LightGBM": pipe_lgbm,
    "XGBoost": pipe_xgb
}

models = {
    "SVC": pipe_svc,
    "Random Forest": pipe_rf,
    "Stacking Model": StackingClassifier(list(classifiers.items()))
}

# Perform crossvalidation on each model
for model_name, model in models.items():
    cross_val_results[model_name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score = True,
        scoring = "accuracy", n_jobs = -1
        )

In [155]:
pd.DataFrame(cross_val_results).T.sort_values(by = "test_score", ascending = False)

Unnamed: 0,fit_time,score_time,test_score,train_score
Stacking Model,1.635 (+/- 0.014),0.029 (+/- 0.005),1.000 (+/- 0.000),1.000 (+/- 0.000)
Random Forest,0.169 (+/- 0.006),0.018 (+/- 0.002),0.886 (+/- 0.039),1.000 (+/- 0.000)
Logistic Regression,0.069 (+/- 0.006),0.006 (+/- 0.001),0.733 (+/- 0.031),0.829 (+/- 0.013)
Tuned Logistic Regression,0.095 (+/- 0.005),0.007 (+/- 0.001),0.733 (+/- 0.031),0.829 (+/- 0.013)
SVC,0.015 (+/- 0.002),0.008 (+/- 0.000),0.719 (+/- 0.030),0.724 (+/- 0.014)
Dummy,0.001 (+/- 0.000),0.001 (+/- 0.000),0.554 (+/- 0.004),0.554 (+/- 0.001)


### Hyperparameter Optimization

In [156]:
# Param grids
params = [
    {
        "svc__class_weight": [None, "balanced"],
        "svc__gamma": loguniform(1e-3, 1e3),
        "svc__C": loguniform(1e-3, 1e3)
    },
    {
        "logisticregression__class_weight": [None, "balanced"],
        "logisticregression__C": loguniform(1e-3, 1e3),
    },
    {
        "xgbclassifier__gamma": loguniform(1e-3, 1e3)
    },
    {
        "lgbmclassifier__class_weight": [None, "balanced"],
        "lgbmclassifier__max_depth": np.arange(10, 100, 1)
    },
    {
        "randomforestclassifier__max_features": ["sqrt", "log2", None],
        "randomforestclassifier__max_depth": np.arange(10, 100)
    }
]

classifiers_tuning = {
    "SVC": pipe_svc,
    "Logistic Regression": pipe_lr,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "Random Forest": pipe_rf
}

optim_models = {}

# Perform hyperparameter tuning on each model and display optimal hyperparameter values
for i, model_name in enumerate(classifiers_tuning):
    print(model_name)
    param_grid = params[i]
    model = classifiers_tuning[model_name]
    random_search = RandomizedSearchCV(
        model, param_grid, n_iter = 10, n_jobs = -1, random_state = 123,
        scoring = "accuracy", return_train_score = True
    )
    random_search.fit(X_train, y_train)
    optim_models[model_name] = random_search.best_estimator_
    print(random_search.best_params_)

SVC
{'svc__C': 3.0072240235870313, 'svc__class_weight': 'balanced', 'svc__gamma': 0.22527090779355338}
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'logisticregression__C': 766.6289057556017, 'logisticregression__class_weight': None}
XGBoost


ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/opt/miniconda3/envs/hearts/lib/python3.10/site-packages/xgboost/sklearn.py", line 1466, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3], got [1 2 3 4]


In [None]:
### Cross-validation on each tuned model
tuned_classifiers = {
    "Logistic Regression": optim_models["Logistic Regression"],
    "LightGBM": optim_models["LightGBM"],
    "XGBoost": optim_models["XGBoost"]
}

tuned_models = {
    "Tuned SVC": optim_models["SVC"],
    "Tuned Random Forest": optim_models["Random Forest"],
    "Tuned Stacking Model": StackingClassifier(list(tuned_classifiers.items()))
}

for model_name, model in tuned_models.items():
    cross_val_results[model_name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score = True,
        scoring = "accuracy", n_jobs = -1
        )

In [None]:
pd.DataFrame(cross_val_results).T.sort_values(by = "test_score", ascending = False)

In [None]:
### Classification report at the end

# print(classification_report(y_test, y_pred, target_names=labels))