# 📖 Import Library 

In [2]:
import pandas as pd
import numpy as np

# 💻 Load in data

In [5]:
df=pd.read_csv('../datasets/poker_states.csv')

---

# 📊 Objective for Data Preparation

**My aims of this project are to:** </br>
- **Recommend Strategy:** Suggest optimal actions like Call, Fold, Raise based on in-game observations.

---

# 📌 Check the data 
- There are originally 10 columns with 83,630 rows
- There are 2 features that having NaN
    - `community_cards`: Missing values might occur before any community cards are revealed (pre-flop).
    - `actions_this_street`: Missing values might occur when no actions have been taken yet. For instance, at the start of a betting round.
      I will focus on the complete game therefor, I will drop row where action_this_street has missing value.

In [10]:
df.shape

(83630, 10)

In [11]:
df.isnull().sum()

config_id                  0
round                      0
acting_player              0
action                     0
action_amount              0
hole_cards                 0
community_cards        64994
pot                        0
stacks                     0
actions_this_street     9082
dtype: int64

In [12]:
# See what columns do my dataset has...
df.head()

Unnamed: 0,config_id,round,acting_player,action,action_amount,hole_cards,community_cards,pot,stacks,actions_this_street
0,0,preflop,geqao,call,200,"ST,S4",,300,5000;4900;4800;5000,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...
1,0,preflop,ufrlx,fold,0,"C2,D5",,500,5000;4900;4800;4800,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...
2,0,preflop,smmpi,call,200,"HA,D7",,500,5000;4900;4800;4800,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...
3,0,preflop,eavjf,call,200,"C8,D8",,600,5000;4800;4800;4800,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...
4,0,flop,smmpi,raise,2128,"HA,D7","H4,H3,HQ",600,5000;4800;4800;4800,


### 🔎 Investigate missing values 

### `community_cards`

In [15]:
# Count rows where round is 'preflop' and community_cards is NaN
df[(df['round'] == 'preflop') & (df['community_cards'].isna())].shape[0]

# This can confirm that missing values occur before any community cards are revealed (pre-flop).

64994

In [16]:
# Check  the percentage of each action each happen during preflop round 
# I want to make sure there are mixed of actions happen during preflop round
print(df[df['round'] == 'preflop'].groupby('config_id')['action'].value_counts(normalize=True).unstack(fill_value=0))

action         call      fold     raise
config_id                              
0          0.335260  0.335472  0.329268
1          0.329375  0.335852  0.334773
2          0.336726  0.331228  0.332047
3          0.333964  0.333853  0.332183


In [17]:
# The percentage of each action is balance. 
# I will replace missing value of `community_cards` to 0 as there will be 0 community cards during preflop round
df['community_cards'] = df['community_cards'].fillna(0)

### `actions_this_street`

In [19]:
#Check if missing values correlate with specific rounds (round column) or game states
df[df['actions_this_street'].isna()]['round'].value_counts()

round
flop     6938
turn     1722
river     422
Name: count, dtype: int64

In [20]:
# Understand the missingness of `action_this_street`
# round
# flop     6938 (Most common)
# turn     1722
# river     422 (Least common)

# Possible causes:
# Early termination of hands during preflop round
# Hands end due to flop action
# Hand beaten or beats by river hand
# Situations where multiple players only performed checks without significant bets or raises.

In [21]:
# I want to focus on complete round therefore, I decide to drop the `actions_this_street` rows where contain NaN 

# Reasons:
# It minimize the missing values
# Allow the better detection of player strategies, behavioral patterns, and game dynamics.
# Enhances ability to train models that effectively capture patterns in poker strategies.

In [22]:
# Create new DataFrame to store data and filter out the rows that have NaN in `actions_this_street`
df = df.dropna(subset=['actions_this_street'])

In [23]:
# Check data again
df.isnull().sum() 

config_id              0
round                  0
acting_player          0
action                 0
action_amount          0
hole_cards             0
community_cards        0
pot                    0
stacks                 0
actions_this_street    0
dtype: int64

---

### 👩🏼‍💻 Prepare data for futher analysis

In [26]:
# Split the hole_cards into individual cards
df[['card1', 'card2']] = df['hole_cards'].str.split(',', expand=True)

# Extract suit and rank for card1
df['card1_suit'] = df['card1'].str[0]
df['card1_rank'] = df['card1'].str[1:]

# Extract suit and rank for card2
df['card2_suit'] = df['card2'].str[0]
df['card2_rank'] = df['card2'].str[1:]

In [27]:
# Convert all entries to strings to handle non-string inputs
df["community_cards"] = df["community_cards"].astype(str)

max_cards = 5  # Number of columns to create (5 is the max number of community_cards)

# Extract ranks
ranks = df["community_cards"].apply(
    lambda x: [card[1:] for card in x.split(",") if card != "0" and card != ""] + [0] * (max_cards - len(x.split(",")))
              if x not in ["", "0"] else [0] * max_cards)

# Extract suits
suits = df["community_cards"].apply(
    lambda x: [card[0] for card in x.split(",") if card != "0" and card != ""] + [0] * (max_cards - len(x.split(",")))
              if x not in ["", "0"] else [0] * max_cards)

# Create new columns for each rank of `community_cards`
for i in range(max_cards):
    df[f"community{i+1}_rank"] = ranks.apply(lambda x: x[i] if i < len(x) else 0)
    df[f"community{i+1}_suit"] = suits.apply(lambda x: x[i] if i < len(x) else 0)

In [28]:
# Check work
df[['hole_cards','community_cards','card1_suit', 'card1_rank', 'card2_suit', 'card2_rank',
          'community1_rank', 'community1_suit', 'community2_rank', 'community2_suit',
          'community3_rank', 'community3_suit', 'community4_rank', 'community4_suit',
          'community5_rank', 'community5_suit']].head(10)

Unnamed: 0,hole_cards,community_cards,card1_suit,card1_rank,card2_suit,card2_rank,community1_rank,community1_suit,community2_rank,community2_suit,community3_rank,community3_suit,community4_rank,community4_suit,community5_rank,community5_suit
0,"ST,S4",0,S,T,S,4,0,0,0,0,0,0,0,0,0,0
1,"C2,D5",0,C,2,D,5,0,0,0,0,0,0,0,0,0,0
2,"HA,D7",0,H,A,D,7,0,0,0,0,0,0,0,0,0,0
3,"C8,D8",0,C,8,D,8,0,0,0,0,0,0,0,0,0,0
5,"C8,D8","H4,H3,HQ",C,8,D,8,4,H,3,H,Q,H,0,0,0,0
6,"ST,S4","H4,H3,HQ",S,T,S,4,4,H,3,H,Q,H,0,0,0,0
7,"HA,D7","H4,H3,HQ",H,A,D,7,4,H,3,H,Q,H,0,0,0,0
9,"C8,D8","H4,H3,HQ,SA",C,8,D,8,4,H,3,H,Q,H,A,S,0,0
10,"ST,S4","H4,H3,HQ,SA",S,T,S,4,4,H,3,H,Q,H,A,S,0,0
12,"ST,S4","H4,H3,HQ,SA,DJ",S,T,S,4,4,H,3,H,Q,H,A,S,J,D


In [29]:
# I see in card suit store in str and some card rank is represent in alphabet 
# (e.g. T in this case is represent 10, K = King)

# I want to change to numerical for further analysis and train in model

In [30]:
# Define mapping dictionary for suits
suit_map = {'S': 4, 'H': 3, 'D': 2, 'C': 1}
# (1=Clubs, 2=Diamonds, 3=Hearts, 4=Spades)

# Fill missing values with 0 and map suits
df['card1_suit'] = df['card1_suit'].map(suit_map).fillna(0).astype(int)
df['card2_suit'] = df['card2_suit'].map(suit_map).fillna(0).astype(int)
df['community1_suit'] = df['community1_suit'].map(suit_map).fillna(0).astype(int)
df['community2_suit'] = df['community2_suit'].map(suit_map).fillna(0).astype(int)
df['community3_suit'] = df['community3_suit'].map(suit_map).fillna(0).astype(int)
df['community4_suit'] = df['community4_suit'].map(suit_map).fillna(0).astype(int)
df['community5_suit'] = df['community5_suit'].map(suit_map).fillna(0).astype(int)

In [31]:
# Define mapping dictionaries for ranks
rank_map = {'T': 10, 'J': 11, 'Q': 12, 'K': 13, 'A': 14}

# Map ranks to numbers, filling NaN with 0 where necessary and converting the result to int
df['card1_rank'] = df['card1_rank'].apply(lambda x: rank_map.get(x, x) if pd.notnull(x) else 0).astype(int)
df['card2_rank'] = df['card2_rank'].apply(lambda x: rank_map.get(x, x) if pd.notnull(x) else 0).astype(int)
df['community1_rank'] = df['community1_rank'].apply(lambda x: rank_map.get(x, x) if pd.notnull(x) else 0).astype(int)
df['community2_rank'] = df['community2_rank'].apply(lambda x: rank_map.get(x, x) if pd.notnull(x) else 0).astype(int)
df['community3_rank'] = df['community3_rank'].apply(lambda x: rank_map.get(x, x) if pd.notnull(x) else 0).astype(int)
df['community4_rank'] = df['community4_rank'].apply(lambda x: rank_map.get(x, x) if pd.notnull(x) else 0).astype(int)
df['community5_rank'] = df['community5_rank'].apply(lambda x: rank_map.get(x, x) if pd.notnull(x) else 0).astype(int)

In [32]:
# Check work
df[['hole_cards','community_cards','card1_suit', 'card1_rank', 'card2_suit', 'card2_rank',
          'community1_rank', 'community1_suit', 'community2_rank', 'community2_suit',
          'community3_rank', 'community3_suit', 'community4_rank', 'community4_suit',
          'community5_rank', 'community5_suit']].head(5)

Unnamed: 0,hole_cards,community_cards,card1_suit,card1_rank,card2_suit,card2_rank,community1_rank,community1_suit,community2_rank,community2_suit,community3_rank,community3_suit,community4_rank,community4_suit,community5_rank,community5_suit
0,"ST,S4",0,4,10,4,4,0,0,0,0,0,0,0,0,0,0
1,"C2,D5",0,1,2,2,5,0,0,0,0,0,0,0,0,0,0
2,"HA,D7",0,3,14,2,7,0,0,0,0,0,0,0,0,0,0
3,"C8,D8",0,1,8,2,8,0,0,0,0,0,0,0,0,0,0
5,"C8,D8","H4,H3,HQ",1,8,2,8,4,3,3,3,12,3,0,0,0,0


### Now, Let's catagorize the poker hand rank

**<div align="center">
  Poker hand rank descriptionm ♤ ♡ ♧ ♢**
</div>

| Rank | Poker Hand       | Description                                                       |
|------|------------------|-------------------------------------------------------------------|
| 1    | Nothing in hand   | Not a recognized poker hand                                      |
| 2    | One pair          | One pair of equal ranks within five cards                        |
| 3    | Two pairs         | Two pairs of equal ranks within five cards                       |
| 4    | Three of a kind   | Three equal ranks within five cards                              |
| 5    | Straight          | Five cards, sequentially ranked with no gaps                     |
| 6    | Flush             | Five cards with the same suit                                    |
| 7    | Full house        | Pair + different rank three of a kind                            |
| 8    | Four of a kind    | Four equal ranks within five cards                               |
| 9    | Straight flush    | Straight + flush                                                 |
| 10   | Royal flush       | {Ace, King, Queen, Jack, Ten} + flush                            |} + flush} + flush

In [35]:
# Step 1: Combine all rank and suits cards
df['all_suits'] = df[['community1_suit', 'community2_suit', 'community3_suit', 
                      'community4_suit', 'community5_suit', 'card1_suit', 'card2_suit']].values.tolist()

df['all_ranks'] = df[['community1_rank', 'community2_rank', 'community3_rank', 
                      'community4_rank', 'community5_rank', 'card1_rank', 'card2_rank']].values.tolist()

In [36]:
# Step 2: Clean up ranks and suits by removing any zero values (invalid cards)
df['all_suits'] = df['all_suits'].apply(lambda suits: [suit for suit in suits if suit != ''])
df['all_ranks'] = df['all_ranks'].apply(lambda ranks: [rank for rank in ranks if rank != 0])

In [37]:
# Step 3: Count occurrences of suits and ranks, only considering non-zero values
df['suit_counts'] = df['all_suits'].apply(lambda suits: pd.Series(suits).value_counts().to_dict())
df['rank_counts'] = df['all_ranks'].apply(lambda ranks: pd.Series(ranks).value_counts().to_dict())

In [38]:
# Step 4: Initialize poker_hand column
df['poker_hand'] = 1 # No pair start with 1

# Step 5: Assign poker hand values
# Royal Flush: All cards same suit and ranks 14, 13, 12, 11, 10
royal_set = {14, 13, 12, 11, 10}
df.loc[
    (df['suit_counts'].apply(lambda x: len(x) == 1)) & 
    (df['all_ranks'].apply(lambda x: set(x) == royal_set)), 
    'poker_hand'] = 10

# Straight Flush: All cards same suit, and ranks form a consecutive sequence
df.loc[
    (df['suit_counts'].apply(lambda x: len(x) == 1)) & 
    (df['rank_counts'].apply(lambda x: len(x) == 5 and max(x.keys()) - min(x.keys()) == 4)), 
    'poker_hand'] = 9

# Four of a Kind: Four cards of the same rank
df.loc[
    df['rank_counts'].apply(lambda x: 4 in x.values()), 
    'poker_hand'] = 8

# Full House: Three of one rank and two of another
df.loc[
    df['rank_counts'].apply(lambda x: sorted(x.values()) == [2, 3]), 
    'poker_hand'] = 7

# Flush: All cards of the same suit
df.loc[
    df['suit_counts'].apply(lambda x: len(x) == 1), 
    'poker_hand'] = 6

# Straight: Cards form a consecutive sequence
df.loc[
    (df['rank_counts'].apply(lambda x: len(x) == 5)) & 
    (df['rank_counts'].apply(lambda x: max(x.keys()) - min(x.keys()) == 4)), 
    'poker_hand'] = 5

# Three of a Kind: Three cards of the same rank
df.loc[
    df['rank_counts'].apply(lambda x: 3 in x.values()), 
    'poker_hand'] = 4

# Two Pairs: Two pairs of the same rank
df.loc[
    df['rank_counts'].apply(lambda x: list(x.values()).count(2) == 2), 
    'poker_hand'] = 3

# One Pair: One pair of the same rank
df.loc[
    df['rank_counts'].apply(lambda x: 2 in x.values()), 
    'poker_hand'] = 2

In [39]:
# Step 6: Clean up intermediate columns (I don't need anymore)
df.drop(['all_suits', 'all_ranks', 'suit_counts', 'rank_counts'], axis=1, inplace=True)

In [40]:
# Check work
df[['hole_cards','community_cards','poker_hand']].head(30)

Unnamed: 0,hole_cards,community_cards,poker_hand
0,"ST,S4",0,1
1,"C2,D5",0,1
2,"HA,D7",0,1
3,"C8,D8",0,2
5,"C8,D8","H4,H3,HQ",2
6,"ST,S4","H4,H3,HQ",2
7,"HA,D7","H4,H3,HQ",1
9,"C8,D8","H4,H3,HQ,SA",2
10,"ST,S4","H4,H3,HQ,SA",2
12,"ST,S4","H4,H3,HQ,SA,DJ",2


In [41]:
df['poker_hand'].unique()

array([1, 2, 4, 5, 8])

In [42]:
# Split the 'stacks' column into individual player stack amounts
# Maximum of 4 players
stack_columns = ['stack_player1', 'stack_player2', 'stack_player3', 'stack_player4']
df[stack_columns] = df['stacks'].str.split(';', expand=True).astype(float)

# Display the updated dataset with parsed stacks
df[stack_columns].head()

Unnamed: 0,stack_player1,stack_player2,stack_player3,stack_player4
0,5000.0,4900.0,4800.0,5000.0
1,5000.0,4900.0,4800.0,4800.0
2,5000.0,4900.0,4800.0,4800.0
3,5000.0,4800.0,4800.0,4800.0
5,5000.0,2672.0,4800.0,4800.0


In [43]:
# Count occurrences of 'RAISE', 'CALL', and 'FOLD' in the 'actions_this_street' column
# WHY:
# 1. To understand the player behavior by analyzing how many times a player raises or calls.
# 2. Prepare for feature engineering for predicition models.

df['num_raises'] = df['actions_this_street'].str.count('RAISE')
df['num_calls'] = df['actions_this_street'].str.count('CALL')
df['num_folds'] = df['actions_this_street'].str.count('FOLD')

# Display the updated dataset with new features
df[['actions_this_street', 'num_raises', 'num_calls', 'num_folds']].head()

Unnamed: 0,actions_this_street,num_raises,num_calls,num_folds
0,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...,0,0,0
1,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...,0,1,0
2,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...,0,1,1
3,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...,0,2,1
5,smmpihzszsesmpxjefwcwu:RAISE:2128,1,0,0


In [44]:
# Calculate action-to-pot ratio 
# WHY: action_to_pot_ratio
# 1.The ratio measures the relationship between the player's action amount (the bet or raise they make) and the total pot size.
# 2.Help for aggression indicator. A higher ratio suggests that the player is making a larger bet relative to the pot, 
# which may indicate a strong hand or an attempt to bluff.
# 3.Bluff Detection. If a player frequently makes large bets compared to the pot size, it could be an indicator of bluffing, 
# as they may be trying to push other players out with a large bet.
# 4.Strategic Insight: The ratio helps in understanding the aggression level of a player, which is essential when recommending actions 
# (e.g., whether to call, raise, or fold).

df['action_to_pot_ratio'] = df['action_amount'] / df['pot']

In [45]:
# Calculate stack-to-pot ratio
# WHY: 
# 1. To measures the relationship between a player's current stack size and the pot size.
# 2. A lower stack-to-pot ratio indicates that a player is more "pot-committed," 
#    meaning they have fewer chips relative to the pot, making it harder to fold without risking a significant portion of their stack.
# 3. Strategic Recommendations: In a low stack-to-pot scenario, a player is more likely to push with any strong hand. 
#    A higher ratio suggests they may have more room to maneuver and can afford to fold or play conservatively.
df['stack_to_pot_ratio'] = df['stack_player1'] / df['pot']  


# Display the with the new 2 ratios
df[['action_amount', 'pot', 'action_to_pot_ratio', 'stack_to_pot_ratio']].head()

Unnamed: 0,action_amount,pot,action_to_pot_ratio,stack_to_pot_ratio
0,200,300,0.666667,16.666667
1,0,500,0.0,10.0
2,200,500,0.4,10.0
3,200,600,0.333333,8.333333
5,4760,2728,1.744868,1.832845


In [80]:
df.to_csv('../datasets/poker_completed.csv', index=False)