## Data Processing

### Reading in Data

In [1]:
# imports for processing

import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load the score sheet from the CSV file (path is relative to the notebook's working directory).
scores = pd.read_csv('hearts_scores_anon.csv')
scores.head()  # preview the first five rows

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score
0,1,1,player_1,player_4,6
1,1,2,player_1,player_2,6
2,1,3,player_1,player_3,10
3,1,4,player_1,none,26
4,1,5,player_1,player_4,26


In [3]:
### Dictionary of the number of hands in each game

# Unique game identifiers, in order of first appearance; later cells iterate over this array.
num_games = scores['game_id'].unique()

# Maps each game_id (as a string key) to the number of distinct hand_id values recorded for that game.
games_and_hands_dict = {
    f'{game}': len(hand_ids.unique())
    for game, hand_ids in scores.groupby('game_id', sort=False)['hand_id']
}

# Sanity checks: the distinct names should be the four players
# (plus "none" for the card source), with no spelling variants.
print(scores['received_cards_from'].unique())
print(scores['player'].unique())

games_and_hands_dict

['player_4' 'player_2' 'player_3' 'none' 'player_1']
['player_1' 'player_3' 'player_4' 'player_2']


{'1': 10,
 '2': 11,
 '3': 8,
 '4': 12,
 '5': 8,
 '6': 12,
 '7': 7,
 '8': 9,
 '9': 5,
 '10': 8,
 '11': 9,
 '12': 8,
 '13': 12,
 '14': 7}

### Computing 'points_per_hand' column
- Done

In [4]:
### points_per_hand -- DONE

# total_score is a running total, so the points taken in a hand are the change in
# total_score since the same player's previous hand in the same game.
# Rows are ordered by hand_id (stable sort) and differenced within each (game, player)
# group, so the result does not depend on a player's hands being adjacent rows in the file.
running_totals = (
    scores
    .sort_values('hand_id', kind='stable')
    .groupby(['game_id', 'player'])['total_score']
)

# diff() leaves NaN on each player's first hand of a game; there the running total
# itself is the hand's points. Both fillna() and the column assignment align on the row index.
scores['points_per_hand'] = (
    running_totals
    .diff()
    .fillna(scores['total_score'])
    .astype(float)
)

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand
0,1,1,player_1,player_4,6,6.0
1,1,2,player_1,player_2,6,0.0
2,1,3,player_1,player_3,10,4.0
3,1,4,player_1,none,26,16.0
4,1,5,player_1,player_4,26,0.0


### Computing 'percent_points_per_hand' column
- Done

In [5]:
### percent_points_per_hand -- DONE

for game in num_games:

    # rows of this game only
    game_df = scores[scores['game_id'] == game]
    # players seen in this game; kept at cell scope because later cells may read it
    player_list = game_df['player'].unique()

    # player name -> highest total_score that player reached in this game
    max_score_dict = {}
    for player_name in player_list:
        player_rows = game_df[game_df['player'] == player_name]
        highest_total = player_rows['total_score'].max()
        max_score_dict[f'{player_name}'] = highest_total

        # each hand's points as a percentage of that highest total, rounded to 2 decimals
        hand_points = scores.loc[player_rows.index, 'points_per_hand']
        scores.loc[player_rows.index, 'percent_points_per_hand'] = (hand_points / highest_total * 100).round(2)

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand
0,1,1,player_1,player_4,6,6.0,6.12
1,1,2,player_1,player_2,6,0.0,0.0
2,1,3,player_1,player_3,10,4.0,4.08
3,1,4,player_1,none,26,16.0,16.33
4,1,5,player_1,player_4,26,0.0,0.0


### Computing 'queen_spades' column
- Done

In [6]:
### queen_spades -- DONE

# Heuristic flag: 1 when the hand's points fall in the inclusive range 13..25, else 0.
# Not a perfect solution, but it should classify most hands correctly.
queen_spades_list = [
    1 if 13 <= hand_points <= 25 else 0
    for hand_points in scores['points_per_hand']
]
scores['queen_spades'] = queen_spades_list

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades
0,1,1,player_1,player_4,6,6.0,6.12,0
1,1,2,player_1,player_2,6,0.0,0.0,0
2,1,3,player_1,player_3,10,4.0,4.08,0
3,1,4,player_1,none,26,16.0,16.33,1
4,1,5,player_1,player_4,26,0.0,0.0,0


### Computing 'moon_shooter' column
- Done

In [7]:
### moon_shooter -- DONE

# Default for every row; only hands in which somebody shot the moon are overwritten below.
scores['moon_shooter'] = "none"

# Each group holds the rows of one hand of one game (expected: one row per player).
for _, hand_df in scores.groupby(['game_id', 'hand_id'], sort=False):

    # 78 = 26 * 3: three players charged 26 points each, which only happens when one player shoots the moon
    if hand_df['points_per_hand'].sum() != 78:
        continue

    # the shooter is the player who took no points in that hand; the first such row is used
    took_no_points = hand_df['points_per_hand'] == 0
    shooter_name = hand_df.loc[took_no_points, 'player'].tolist()[0]

    # record the shooter's name on every row of that hand
    scores.loc[hand_df.index, 'moon_shooter'] = shooter_name

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter
0,1,1,player_1,player_4,6,6.0,6.12,0,none
1,1,2,player_1,player_2,6,0.0,0.0,0,none
2,1,3,player_1,player_3,10,4.0,4.08,0,none
3,1,4,player_1,none,26,16.0,16.33,1,none
4,1,5,player_1,player_4,26,0.0,0.0,0,none


### Computing 'best_player_of_hand' column
- Done

In [8]:
### best_player_of_hand -- DONE

# Each group holds the rows of one hand of one game (expected: one row per player).
for _, hand_df in scores.groupby(['game_id', 'hand_id'], sort=False):

    hand_points = hand_df['points_per_hand']

    # fewest points wins the hand; when several players tie, the first listed one is kept
    lowest_points = hand_points.min()
    best_player_of_hand = hand_df.loc[hand_points == lowest_points, 'player'].iloc[0]

    # every row of the hand carries the same best player
    scores.loc[hand_df.index, 'best_player_of_hand'] = best_player_of_hand

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1


### Computing 'best_player_of_game' column
- Done

In [9]:
### best_player_of_game -- DONE

for _, game_df in scores.groupby('game_id', sort=False):
    # most frequent best_player_of_hand value within the game; if several names tie,
    # the first entry returned by mode() is used
    most_frequent_names = game_df['best_player_of_hand'].mode()
    best_player_of_game = most_frequent_names.iloc[0]

    # write that name onto every row of the game
    scores.loc[game_df.index, 'best_player_of_game'] = best_player_of_game

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1,player_1
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1,player_1


### Computing 'game_winner' column
- Done

In [10]:
### game_winner -- DONE

for game in num_games:

    # rows of this game only
    game_df = scores[scores['game_id'] == game]

    # The final hand is the one with the highest hand_id. Using the maximum (rather than
    # the count of distinct hands) stays correct even if hand ids are not exactly 1..n.
    last_hand_value = game_df['hand_id'].max()
    last_hand_df = game_df[game_df['hand_id'] == last_hand_value]

    # Lowest running total after the final hand wins; on a tie the first listed player is kept.
    winner_row = last_hand_df['total_score'].idxmin()
    game_winner_name = last_hand_df.loc[winner_row, 'player']

    # every row of the game carries the winner's name
    scores.loc[game_df.index, 'game_winner'] = game_winner_name

scores.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,game_winner
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1,player_2
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1,player_1,player_2
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1,player_2
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1,player_2
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1,player_1,player_2


### Final scores df

In [11]:
# Display the finished table with all derived columns.
# Long frames are truncated in the rendered output, so only the first and last rows are shown.
# scores.head(60) # can only show first 60 rows in output
# scores.head()
scores

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,game_winner
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1,player_2
1,1,2,player_1,player_2,6,0.0,0.00,0,none,player_1,player_1,player_2
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1,player_2
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1,player_2
4,1,5,player_1,player_4,26,0.0,0.00,0,none,player_1,player_1,player_2
...,...,...,...,...,...,...,...,...,...,...,...,...
499,14,3,player_4,player_2,55,25.0,25.00,1,none,player_1,player_1,player_1
500,14,4,player_4,none,55,0.0,0.00,0,none,player_2,player_1,player_1
501,14,5,player_4,player_3,69,14.0,14.00,1,none,player_1,player_1,player_1
502,14,6,player_4,player_1,94,25.0,25.00,1,none,player_1,player_1,player_1


### Summary Stats About scores df

In [12]:
# Final standings: for every game keep only the rows of its last hand (highest hand_id),
# so that only end-of-game scores enter the statistics below.
last_hand_of_game = scores.groupby('game_id')['hand_id'].transform('max')
all_last_hands = scores[scores['hand_id'] == last_hand_of_game]

mean_ppg = {} # dict of player: avg total score at end of game
std_ppg = {} # dict of player: std total score at end of game
max_ppg = {} # dict of player: max total score at end of game
min_ppg = {} # dict of player: min total score at end of game
mean_pppg = {} # dict of player: avg percent_points_per_hand over the final hands

# Players are taken from the final-hand rows themselves, so this cell does not rely on
# loop variables left behind by earlier cells.
for player_name, player_last_hands in all_last_hands.groupby('player', sort=False):
    final_scores = player_last_hands['total_score']

    mean_ppg[player_name] = round(final_scores.mean(), 2)
    std_ppg[player_name] = round(final_scores.std(), 2)
    max_ppg[player_name] = round(final_scores.max(), 2)
    min_ppg[player_name] = round(final_scores.min(), 2)
    # NOTE(review): this averages percent_points_per_hand over each game's FINAL hand only,
    # not over all hands -- confirm that this is the intended statistic.
    mean_pppg[player_name] = round(player_last_hands['percent_points_per_hand'].mean(), 2)

print(f"Each player's average final score is:\n{mean_ppg}\n")
print(f"The Standard Deviation of each player's average final score is:\n{std_ppg}\n")
print(f"Each player's highest final score is:\n{max_ppg}\n")
print(f"Each player's lowest final score is:\n{min_ppg}\n")
print(f"Each player's average percentage of points per hand is:\n{mean_pppg}")

all_last_hands.describe()

Each player's average final score is:
{'player_1': 79.71, 'player_3': 68.64, 'player_2': 67.57, 'player_4': 76.71}

The Standard Deviation of each player's average final score is:
{'player_1': 31.08, 'player_3': 23.74, 'player_2': 26.77, 'player_4': 28.45}

Each player's highest final score is:
{'player_1': 121, 'player_3': 109, 'player_2': 117, 'player_4': 121}

Each player's lowest final score is:
{'player_1': 22, 'player_3': 36, 'player_2': 31, 'player_4': 15}

Each player's average percentage of points per hand is:
{'player_1': 18.86, 'player_3': 9.51, 'player_2': 12.05, 'player_4': 10.36}


Unnamed: 0,game_id,hand_id,total_score,points_per_hand,percent_points_per_hand,queen_spades
count,56.0,56.0,56.0,56.0,56.0,56.0
mean,7.5,9.0,73.160714,9.392857,12.695893,0.25
std,4.06761,2.088932,27.378676,10.312708,14.150245,0.436931
min,1.0,5.0,15.0,0.0,0.0,0.0
25%,4.0,8.0,52.75,0.0,0.0,0.0
50%,7.5,8.5,71.0,4.0,7.695,0.0
75%,11.0,11.0,99.0,18.5,20.7925,0.25
max,14.0,12.0,121.0,36.0,63.89,1.0


## Machine Learning

In [13]:
# imports for ML
 
import sklearn # for tests
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures
)
from sklearn.metrics import recall_score, precision_score
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import (
    RandomizedSearchCV, cross_validate, train_test_split
)
from scipy.stats import loguniform
import pandas as pd
import numpy as np
from numpy.linalg import norm
import altair as alt
from pandas_profiling import ProfileReport
import eli5
# import shap
# import matplotlib
# %matplotlib inline

  from pandas import MultiIndex, Int64Index


### Info about processing/encoding
Columns:
- `game_id`: unique identifier of game number. Not related to time/a specific order
- `hand_id`: unique identifier of each hand within each game. Order DOES matter
- `player`: indicates who that row data pertains to
- `received_cards_from`: who the "player" received 3 cards from in that hand. For "no passing" hand, "none"
- `total_score`: player's cumulative (running) score in the game after that hand
- `points_per_hand`: how many points player got that hand
- `percent_points_per_hand`: percentage of points gotten that hand (compared to player's total points of that game)
- `queen_spades`: flag (0 = no, 1 = yes) for whether the player took the queen of spades that hand; inferred from `points_per_hand` being between 13 and 25, so it is an approximation
- `moon_shooter`: name of player who shot the moon that hand, if applicable, otherwise "none"
- `best_player_of_hand`: player who got the least points in that hand/did the "best" that hand
- `best_player_of_game`: which player was `best_player_of_hand` the most frequently in each game (this is different than `game_winner` player)
- `game_winner`: person who won the game -- target


In [14]:
# Modelling copy of the table: same data as `scores`, with the 'game_winner' column exposed as 'target'.
# rename() returns a new DataFrame, so `scores` itself keeps its original column names.
scores_processed = scores.rename(columns={"game_winner": "target"})

scores_processed.head()

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,target
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1,player_2
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1,player_1,player_2
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1,player_2
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1,player_2
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1,player_1,player_2


In [15]:
### Splitting data into X/y train/test splits

# 70/30 row-level split with a fixed seed so the partition is reproducible.
# NOTE(review): rows of the same game can land in both train and test, and 'target' is
# constant within a game -- consider splitting by game_id if that leakage is not intended.
train_df, test_df = train_test_split(scores_processed, test_size=0.3, random_state=123)

# features = every column except the label
X_train = train_df.drop(columns="target")
y_train = train_df["target"]
X_test = test_df.drop(columns="target")
y_test = test_df["target"]

X_train

Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game
52,2,2,player_3,player_4,25,24.0,36.36,1,none,player_1,player_1
229,6,10,player_3,player_4,90,5.0,5.56,0,none,player_1,player_1
246,7,3,player_1,player_3,69,26.0,24.07,0,player_4,player_4,player_2
164,5,1,player_1,player_4,0,0.0,0.00,0,none,player_1,player_1
327,9,5,player_3,player_2,58,0.0,0.00,0,none,player_3,player_1
...,...,...,...,...,...,...,...,...,...,...,...
98,3,7,player_3,player_2,94,19.0,18.63,1,none,player_2,player_4
476,14,1,player_1,player_4,0,0.0,0.00,0,player_1,player_1,player_1
322,9,5,player_4,player_3,50,5.0,10.00,0,none,player_3,player_1
382,11,5,player_2,player_1,27,0.0,0.00,0,none,player_4,player_4


In [16]:
# Summary statistics of the training rows; reset_index() turns the statistic names into a regular column.
train_df.describe().reset_index()

Unnamed: 0,index,game_id,hand_id,total_score,points_per_hand,percent_points_per_hand,queen_spades
0,count,352.0,352.0,352.0,352.0,352.0,352.0
1,mean,7.232955,5.235795,41.764205,8.434659,11.135455,0.235795
2,std,4.080004,2.950411,28.484385,9.545288,13.18734,0.425099
3,min,1.0,1.0,0.0,0.0,0.0,0.0
4,25%,4.0,3.0,20.0,0.0,0.0,0.0
5,50%,7.0,5.0,38.0,4.0,5.785,0.0
6,75%,11.0,7.0,61.25,16.0,19.7825,0.0
7,max,14.0,12.0,121.0,39.0,81.82,1.0


In [17]:
# Class balance of the target within the training rows.
train_df["target"].value_counts(normalize = True) # normalize=True returns proportions; False would return raw counts

player_2    0.553977
player_3    0.227273
player_4    0.125000
player_1    0.093750
Name: target, dtype: float64

In [18]:
# Spearman rank correlation between the numeric training columns, shaded by magnitude.
# Non-numeric columns (player names etc.) are excluded explicitly: older pandas dropped them
# silently inside corr(), whereas pandas >= 2.0 raises on them instead.
numeric_train_df = train_df.select_dtypes(include='number')
corr_matrx = numeric_train_df.corr(method='spearman').style.background_gradient()
corr_matrx

Unnamed: 0,game_id,hand_id,total_score,points_per_hand,percent_points_per_hand,queen_spades
game_id,1.0,-0.078969,0.064713,0.120118,0.092304,-0.003402
hand_id,-0.078969,1.0,0.767906,-0.013021,-0.02195,0.053131
total_score,0.064713,0.767906,1.0,0.302338,0.23165,0.229174
points_per_hand,0.120118,-0.013021,0.302338,1.0,0.970774,0.574372
percent_points_per_hand,0.092304,-0.02195,0.23165,0.970774,1.0,0.592515
queen_spades,-0.003402,0.053131,0.229174,0.574372,0.592515,1.0
