In [1]:
import json
import numpy as np
import pandas as pd
import re
from copy import copy
from tqdm import tqdm
import df_utils

### Data Extraction

#### Reading Json File

In [2]:
with open('hands.json', 'r') as file:
    data = json.load(file)

In [3]:
len(data)

439682

#### Shape of a Json object

In [4]:
first_obj = data[0]

In [5]:
first_obj

{'_id': 'holdem3_199505_800160769',
 'board': [],
 'dealer': 1,
 'game': 'holdem3',
 'hand_num': 1,
 'num_players': 2,
 'players': {'A8': {'total_bet': 25,
   'bankroll': 8371,
   'bets': [{'actions': 'Bf', 'stage': 'p'},
    {'actions': '-', 'stage': 'f'},
    {'actions': '-', 'stage': 't'},
    {'actions': '-', 'stage': 'r'}],
   'pocket_cards': [],
   'position': 1,
   'total_win': 0},
  'Schween': {'total_bet': 50,
   'bankroll': 8035,
   'bets': [{'actions': 'B', 'stage': 'p'},
    {'actions': '-', 'stage': 'f'},
    {'actions': '-', 'stage': 't'},
    {'actions': '-', 'stage': 'r'}],
   'pocket_cards': [],
   'position': 2,
   'total_win': 75}},
 'pots': [{'num_players': 0, 'stage': 'f', 'size': 0},
  {'num_players': 0, 'stage': 't', 'size': 0},
  {'num_players': 0, 'stage': 'r', 'size': 0},
  {'num_players': 1, 'stage': 's', 'size': 75}]}

In [6]:
first_obj['_id']

'holdem3_199505_800160769'

#### Extractors functions

In [7]:
def extract_datetime(obj: dict):
    _, year_month, timestamp = obj["_id"].split('_')
    return year_month[:4], year_month[4:], timestamp

In [8]:
def normalize_cards_list(board: list, cards_num=5, padding_element='-'):
    normalize_board = copy(board)
    cards_number = len(board)
    if cards_number == cards_num:
        return normalize_board
    else:
        appendex_number = cards_num - cards_number
        appendex = [padding_element] * appendex_number
        normalize_board.extend(appendex)
    return normalize_board

In [9]:
stages = {
    "p": "preflop",
    "f": "flop",
    "t": "turn",
    "r": "river",
    "s": "showdown"
}

In [10]:
def process_bets(player_label:str, bets_list:list[dict], stages:dict):
    bets_ = {}
    for bet in bets_list:
        stage_name = stages[bet['stage']]
        bets_[f"{player_label}_bet_{stage_name}"] = bet['actions']
    return bets_

In [11]:
def process_player(player_label: str, player_object: dict, stages:dict):
    bets = process_bets(player_label, player_object['bets'], stages)
    result = {
        f"{player_label}_total_bet": player_object['total_bet'],
        f"{player_label}_bankroll": player_object['bankroll'],
        f"{player_label}_total_win": player_object['total_win']
    }
    
    result.update(bets)
    
    return result

In [12]:
def process_pots(pots_list:list[dict], sstages:dict):
    result = {}
    for pot in pots_list:
        stage_verbose_name = stages[pot['stage']]
        result[f"pot_players_num_{stage_verbose_name}"] = pot['num_players'] 
        result[f"pot_size_{stage_verbose_name}"] = pot['size']
    return result

In [13]:
def process_pocket_cards(player_label: str, cards: list):
    return {f"{player_label}_pocket_card{i+1}": card for i, card in enumerate(cards)}

In [14]:
def holdem_version_extractor(game_name:str):
    match = re.search(r"holdem(\d+)", game_name)
    game_version = int(match.group(1)) if match else 0
    return game_version

#### Hands objects construction

In [15]:
def process_row(dict_row, stages:dict):
    result = {
        "game_id": holdem_version_extractor(dict_row["game"]),
        "players_num": dict_row["num_players"]
    }
    year, month, timestamp = extract_datetime(dict_row)
    result["year"], result["month"], result["timestamp"] = year, month, timestamp
    board = normalize_cards_list(dict_row['board'], cards_num=5)
    for i, card in enumerate(board):
        result[f"board_card_{i+1}"] = card
    for player_name, player_dict in dict_row["players"].items():
        pocket_cards = player_dict["pocket_cards"]
        normalized_pocket_cards = normalize_cards_list(pocket_cards, cards_num=2)
        player_label = f"player{player_dict['position']}"
        processed_player = process_player(player_label, player_dict, stages)
        processed_player.update(process_pocket_cards(player_label, normalized_pocket_cards))
        result.update(processed_player)
        result[f"player{player_dict['position']}_name"] = player_name
    pots = process_pots(dict_row['pots'], stages)
    result.update(pots)
    return result

In [16]:
hands_processed_data = [process_row(row, stages) for row in tqdm(data, 
                                                           desc="Processing Poker hands data",
                                                           total=len(data),
                                                           unit="hand")]


rocessing Poker hands data: 100%|█████████████████████████████████████████| 439682/439682 [00:16<00:00, 26128.71hand/s]

In [23]:
hands_processed_data[1:3]

[{'game_id': 3,
  'players_num': 2,
  'year': '1995',
  'month': '05',
  'timestamp': '800160788',
  'board_card_1': 'Jc',
  'board_card_2': 'Qs',
  'board_card_3': '6s',
  'board_card_4': '6d',
  'board_card_5': '-',
  'player2_total_bet': 250,
  'player2_bankroll': 8346,
  'player2_total_win': 0,
  'player2_bet_preflop': 'Br',
  'player2_bet_flop': 'b',
  'player2_bet_turn': 'bf',
  'player2_bet_river': '-',
  'player2_pocket_card1': '-',
  'player2_pocket_card2': '-',
  'player2_name': 'A8',
  'player1_total_bet': 350,
  'player1_bankroll': 8060,
  'player1_total_win': 600,
  'player1_bet_preflop': 'Bcc',
  'player1_bet_flop': 'kc',
  'player1_bet_turn': 'kr',
  'player1_bet_river': '-',
  'player1_pocket_card1': '-',
  'player1_pocket_card2': '-',
  'player1_name': 'Schween',
  'pot_players_num_flop': 2,
  'pot_size_flop': 200,
  'pot_players_num_turn': 2,
  'pot_size_turn': 300,
  'pot_players_num_river': 0,
  'pot_size_river': 0,
  'pot_players_num_showdown': 1,
  'pot_size_showd

### Dataset building

In [None]:
df = pd.DataFrame(hands_processed_data)

In [40]:
df.shape

(439682, 139)

In [41]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439682 entries, 0 to 439681
Data columns (total 139 columns):
 #    Column                    Dtype  
---   ------                    -----  
 0    game_id                   int64  
 1    dealer_id                 int64  
 2    players_num               int64  
 3    year                      object 
 4    month                     object 
 5    timestamp                 object 
 6    board_card_1              object 
 7    board_card_2              object 
 8    board_card_3              object 
 9    board_card_4              object 
 10   board_card_5              object 
 11   player1_total_bet         int64  
 12   player1_bankroll          int64  
 13   player1_total_win         int64  
 14   player1_bet_preflop       object 
 15   player1_bet_flop          object 
 16   player1_bet_turn          object 
 17   player1_bet_river         object 
 18   player1_pocket_card1      object 
 19   player1_pocket_card2      object 
 20   pl

In [42]:
df.columns

Index(['game_id', 'dealer_id', 'players_num', 'year', 'month', 'timestamp',
       'board_card_1', 'board_card_2', 'board_card_3', 'board_card_4',
       ...
       'player12_total_bet', 'player12_bankroll', 'player12_total_win',
       'player12_bet_preflop', 'player12_bet_flop', 'player12_bet_turn',
       'player12_bet_river', 'player12_pocket_card1', 'player12_pocket_card2',
       'player12_name'],
      dtype='object', length=139)

In [None]:
df.to_csv("data/holdem3.csv", index=False)