# Cleaning the data
This document demonstrates how I used my functions from data_processing to compile the textual data into usable csv files.

In [43]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
from datetime import datetime
import regex

from data_processing import find_hands, table_session_data, hand_data, player_data_update, player_data, end_stack, play_data

#### First, I compiled the seperate files downloaded from PokerStars account into one string. Then I use the find_hands function to divide this string into a list of hands.

In [44]:
path = 'PokerHistoryFiles'
files = os.listdir(path)

table_hands=[]

for file in files:
    hands=[]
    if os.path.isfile(os.path.join(path, file)):
        f = open(os.path.join(path, file),'r')
        with f as file:
            for line in file:
                hands.append(line) 
            table_hands.append(hands)

In [45]:
table_hands_lists=[]

for j in range(len(table_hands)):
    hands=table_hands[j]
    start=[]
    hands_lists=[]
    for i in range(len(hands)):
        if find_hands(hands[i])==True:
            start.append(i)
    for i in range(len(start)-1):
        hands_lists.append(hands[start[i]:start[i+1]-3])
    hands_lists.append(hands[start[-1]:-3])
    table_hands_lists.append(hands_lists)

#### This is an example of a hand.

In [46]:
table_hands_lists[0][0]

["ï»¿PokerStars Hand #240332455400:  Hold'em No Limit (50/100) - 2022/12/08 12:19:13 CUST [2022/12/08 6:19:13 ET]\n",
 "Table 'Crescentia' 6-max (Play Money) Seat #2 is the button\n",
 'Seat 1: SickHooksJ88 (10000 in chips) \n',
 'Seat 2: 44dÃ©dÃ© (21894 in chips) \n',
 'Seat 3: mwm 013 (4553 in chips) \n',
 'Seat 4: 333albi (4000 in chips) \n',
 'Seat 5: serioushippo677 (10000 in chips) \n',
 'mwm 013: posts small blind 50\n',
 '333albi: posts big blind 100\n',
 'serioushippo677: posts big blind 100\n',
 'SickHooksJ88: posts big blind 100\n',
 '*** HOLE CARDS ***\n',
 'Dealt to serioushippo677 [Ad Td]\n',
 'serioushippo677: checks \n',
 'SickHooksJ88: raises 400 to 500\n',
 '44dÃ©dÃ©: folds \n',
 'mwm 013: calls 450\n',
 '333albi: folds \n',
 'HighFoxBM598 leaves the table\n',
 'serioushippo677: calls 400\n',
 '*** FLOP *** [6d 6h As]\n',
 '333albi leaves the table\n',
 'mwm 013: checks \n',
 'serioushippo677: checks \n',
 'SickHooksJ88: bets 2800\n',
 'mwm 013: folds \n',
 'serioushi

#### This is the information I wanted to extract: the table information, hand information, player attributes and statistics and information about how a player plays a hand.

In [47]:
table_session_information = ["table_name", "small_blind", "big_blind", "table_datetime_start", "table_duration_estimate"]

hand_information = ["hand_table_name", "hand_datetime_start", "hand_number", "number_players", "positions", "winner_main_pot", "winner_side_pot1", "winner_side_pot2", "winner_side_pot3", "money_main_pot", "money_side_pot1", "money_side_pot2", "money_side_pot3"]

player_information = ["type", "hands_recorded", "vpip_count", "pfr_count","username"]

play_information = ["username_play", "hand_number_play","position_play", "starting_stack", "finishing_stack", "cards"]

#### Here I extract the table information.

In [48]:
table_session_info=[]
for i in range(len(table_hands_lists)):
    table_dictionary=dict(zip(table_session_information, table_session_data(table_hands_lists[i])))
    table_session_info.append(table_dictionary)

# Compiling table_session_info into a list of dictionaries about each table

table_info=[table_session_info[0]]
for i in range(1,len(table_session_info)):
    if table_session_info[i]["table_name"] in [dictionary["table_name"] for dictionary in table_info]:
        table_info[j]["table_datetime_start"].append(table_session_info[i]["table_datetime_start"])
        table_info[j]["table_duration_estimate"].append(table_session_info[i]["table_duration_estimate"])
    else:
        table_info.append(table_session_info[i])

#### Here I extract the hand information.

In [49]:
# Compiling data about hands into a list of dictionaries

hand_info=[]
for i in range(len(table_hands_lists)):
    for j in range(len(table_hands_lists[i])):
        hand_dictionary=dict(zip(hand_information, hand_data(table_hands_lists[i][j])))
        hand_info.append(hand_dictionary)

#### Here I extract the player information.

In [50]:
# Compiles into list of player data

player_info=[]

dict_of_dicts=player_data(table_hands_lists)
for player in dict_of_dicts:
    player_info.append(dict_of_dicts[player])
    player_info[-1]['username']=player

#### Here I extract information about how a player plays a hand.

In [51]:
# play_info is a list of dictionaries containing information about a player's play in each hand, with usernames as keys.
# e.g. username:[hand_number, hand_datetime, position, starting_stack, finishing_stack, cards]

play_info_1=[]
flat_list_hands = [item for sublist in table_hands_lists for item in sublist]
for i in range(len(flat_list_hands)):
    play_info_1.append(play_data(flat_list_hands[i],hand_info[i], player_info))

play_info=[]
for i in range(len(play_info_1)):
    hand_play=play_info_1[i]
    for key in hand_play:
        values=hand_play[key]
        values.insert(0,key)
        dictionary=dict(zip(play_information,values))
        play_info.append(dictionary)

In [31]:
table_keys=table_info[0].keys()
hand_keys=hand_info[0].keys()
player_keys=player_info[0].keys()
play_keys=play_info[0].keys()

#### Examples of the output.

The information about the first table is as follows.

In [63]:
table_info[0]

{'table_name': 'Crescentia',
 'small_blind': 50,
 'big_blind': 100,
 'table_datetime_start': ['2022-12-08 12:19:13'],
 'table_duration_estimate': ['0:11:39']}

The first hand played at the first table is as follows.

In [61]:
hand_info[0]

{'hand_table_name': 'Crescentia',
 'hand_datetime_start': '2022-12-08 12:19:13',
 'hand_number': '240332455400',
 'number_players': 5,
 'positions': {'btn': '44dÃ©dÃ©',
  'sb': 'mwm 013',
  'bb': '333albi',
  'utg': 'serioushippo677',
  'mp': 'SickHooksJ88'},
 'winner_main_pot': 'SickHooksJ88',
 'winner_side_pot1': None,
 'winner_side_pot2': None,
 'winner_side_pot3': None,
 'money_main_pot': 1512,
 'money_side_pot1': None,
 'money_side_pot2': None,
 'money_side_pot3': None}

The first player, 'SickHooksJ88', having played 11 hands with them, has voluntarily put money into the pot during 10 of those hands. They have pre-flop-raised all 10 as well.

In [64]:
player_info[1]

{'type': 'villain',
 'hands_recorded': 11,
 'vpip_count': 8,
 'pfr_count': 2,
 'username': '44dÃ©dÃ©'}

We can see that during one of these hands, they won in the big blind with the 6 and queen of spades.

In [67]:
play_info[7]

{'username_play': 'SickHooksJ88',
 'hand_number_play': '240332464851',
 'position_play': 'bb',
 'starting_stack': 11012,
 'finishing_stack': 18457,
 'cards': ['6s', 'Qs']}

#### I then export the data as seperate csv files.

In [33]:
import csv

with open('poker_table.csv', 'w', newline='') as output_file:
    dictionary_writer = csv.DictWriter(output_file, table_keys)
    dictionary_writer.writeheader()
    dictionary_writer.writerows(table_info)

In [34]:
with open('hand.csv', 'w', newline='') as output_file:
    dictionary_writer = csv.DictWriter(output_file, hand_keys)
    dictionary_writer.writeheader()
    dictionary_writer.writerows(hand_info)

In [35]:
with open('player.csv', 'w', newline='') as output_file:
    dictionary_writer = csv.DictWriter(output_file, player_keys)
    dictionary_writer.writeheader()
    dictionary_writer.writerows(player_info)

In [36]:
with open('play.csv', 'w', newline='') as output_file:
    dictionary_writer = csv.DictWriter(output_file, play_keys)
    dictionary_writer.writeheader()
    dictionary_writer.writerows(play_info)