# 📖 Import Libraries

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 💻 Load the data that I simulated earlier

In [15]:
# Read the poker-state records into a DataFrame; the path is relative to this notebook.
POKER_STATES_CSV = '../datasets/poker_states.csv'
df = pd.read_csv(POKER_STATES_CSV)

# 📌 Check the data 
- The dataset originally has 10 columns and 83,630 rows.
- Two features contain NaN values:
    - `community_cards`: Missing values might occur before any community cards are revealed (pre-flop).
    - `actions_this_street`: Missing values might occur when no actions have been taken yet, for instance at the start of a betting round.

If the missing values have a specific context (e.g., logical absence), treat them differently from random missingness.

In [17]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,config_id,round,acting_player,action,action_amount,hole_cards,community_cards,pot,stacks,actions_this_street
0,0,preflop,geqao,call,200,"ST,S4",,300,5000;4900;4800;5000,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...
1,0,preflop,ufrlx,fold,0,"C2,D5",,500,5000;4900;4800;4800,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...
2,0,preflop,smmpi,call,200,"HA,D7",,500,5000;4900;4800;4800,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...
3,0,preflop,eavjf,call,200,"C8,D8",,600,5000;4800;4800;4800,smmpihzszsesmpxjefwcwu:SMALLBLIND:100;eavjfjqe...
4,0,flop,smmpi,raise,2128,"HA,D7","H4,H3,HQ",600,5000;4800;4800;4800,


In [19]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(83630, 10)

In [21]:
# Column labels of the loaded frame.
df.columns

Index(['config_id', 'round', 'acting_player', 'action', 'action_amount',
       'hole_cards', 'community_cards', 'pot', 'stacks',
       'actions_this_street'],
      dtype='object')

In [36]:
# Number of missing (NaN) values in each column.
df.isna().sum()

config_id                  0
round                      0
acting_player              0
action                     0
action_amount              0
hole_cards                 0
community_cards        64994
pot                        0
stacks                     0
actions_this_street     9082
dtype: int64

### 🔎 Investigate missing values 

### `community_cards`

In [45]:
# How many rows are both in the 'preflop' round and missing community_cards?
is_preflop = df['round'] == 'preflop'
cards_missing = df['community_cards'].isna()
int((is_preflop & cards_missing).sum())

# If this count equals the total number of missing community_cards values,
# every missing value belongs to a pre-flop row (before any community cards are revealed).

64994

In [59]:
# Replace NaN in community_cards with the string 'None'.
# Reason: 'None' gives a clear representation of the pre-flop state.
# NOTE(review): pandas' read_csv treats the literal string 'None' as a missing
# value by default in recent versions — confirm this sentinel survives if the
# cleaned frame is ever written to CSV and read back.
community_cards_filled = df['community_cards'].fillna('None')
df['community_cards'] = community_cards_filled

### `actions_this_street`

In [67]:
# Break down the rows whose actions_this_street is missing by betting round.
rows_without_actions = df[df['actions_this_street'].isna()]
rows_without_actions.groupby('round').size()

round
flop     6938
river     422
turn     1722
dtype: int64

In [None]:
# Missing-value counts per column (from the null check earlier in this notebook):
# community_cards        64994
# actions_this_street     9082