## Data Processing & Engineering

### Reading in Data

In [3]:
# imports for processing

import pandas as pd
import numpy as np

In [6]:
# Load the processed hearts scores. The CSV carries a leading "Unnamed: 0"
# column (visible in the drop below), which is removed right after reading.
scores = (
    pd.read_csv('data/hearts_anon_processed.csv')
    .drop(columns=["Unnamed: 0"])
)

# Quick sanity check: print (rows, columns), then preview the first rows.
print(scores.shape)
scores.head()

(504, 12)


Unnamed: 0,game_id,hand_id,player,received_cards_from,total_score,points_per_hand,percent_points_per_hand,queen_spades,moon_shooter,best_player_of_hand,best_player_of_game,game_winner
0,1,1,player_1,player_4,6,6.0,6.12,0,none,player_2,player_1,player_2
1,1,2,player_1,player_2,6,0.0,0.0,0,none,player_1,player_1,player_2
2,1,3,player_1,player_3,10,4.0,4.08,0,none,player_2,player_1,player_2
3,1,4,player_1,none,26,16.0,16.33,1,none,player_4,player_1,player_2
4,1,5,player_1,player_4,26,0.0,0.0,0,none,player_1,player_1,player_2


In [7]:
### Dictionary of the number of hands in each game

# NOTE: despite its name, `num_games` holds the distinct game_id values (in order
# of first appearance), not a count. The name is kept so later cells keep working.
num_games = scores['game_id'].unique()

# Distinct player names in ascending order. Only the 'player' column is sorted,
# rather than sorting the whole frame just to read one column back.
player_list = scores['player'].sort_values().unique()

# Number of distinct hand_id values per game, computed in a single groupby pass
# instead of one string-built `query` per game.
#   sort=False   -> keep games in order of first appearance
#   dropna=False -> count a missing hand_id as a value, matching len(Series.unique())
hands_by_game = scores.groupby('game_id', sort=False)['hand_id'].nunique(dropna=False)

# dict with keys as game_id (formatted as a string) and values as the number of hands in that game
games_and_hands_dict = {f'{game}': int(n_hands) for game, n_hands in hands_by_game.items()}

# Visual spelling checks on the label columns (expected labels are assumed from
# the data set: four players, plus "none" for received_cards_from).
print(scores['received_cards_from'].unique())
print(player_list)

games_and_hands_dict

['player_4' 'player_2' 'player_3' 'none' 'player_1']
['player_1' 'player_2' 'player_3' 'player_4']


{'1': 10,
 '2': 11,
 '3': 8,
 '4': 12,
 '5': 8,
 '6': 12,
 '7': 7,
 '8': 9,
 '9': 5,
 '10': 8,
 '11': 9,
 '12': 8,
 '13': 12,
 '14': 7}

### Dealing with Class Imbalance
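- `player_2` is the `game_winner` in about 56% of rows (the share below is computed over rows of `scores`, so games with more hands weigh more; it is not the share of distinct games won)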

In [13]:
# Relative frequency of each game_winner label across the rows of `scores`.
# This is a row-level share: every row of a game carries that game's winner,
# so the result is not the fraction of distinct games each player won.
winner_share = scores['game_winner'].value_counts(normalize=True)
winner_share

player_2    0.563492
player_3    0.214286
player_4    0.126984
player_1    0.095238
Name: game_winner, dtype: float64