# Create a dataset of players, where each player has a set of plays they made

## ----------------------------------------------------------------------------------------------

### Load the set of 18832 valid hands extracted from the IRCdatabase. We have only extracted valid hands from the "holdem" tag (there are around 6 more)

In [15]:
# CONFIGURATION
DATAPATH = "/Users/ketanjog/Documents/Columbia_Classes/Year_5/Semester_9/pgm/project/bayespoker/data/PokerHandsDataset/hands_valid.json"

STRENGTH_GRANULARITY = 5
AGGRESSIVENESS_GRANULARITY = 10

In [18]:
# !pip install treys
import json
import pandas as pd
import time 
import pprint
printDict = pprint.PrettyPrinter(depth=6)
from treys import Card, Evaluator, Deck
evaluator = Evaluator()


In [8]:
# Demonstrate a single entry in the dataset

with open(DATAPATH, 'r') as f:
        # print('#' * 60)
        line = f.readline()
        hand = json.loads(line)
        print("An example of a poker hand record\n")
        printDict.pprint(hand)
            

An example of a poker hand record

{'board': ['6c', '3d', '7s', 'Ts', '2h'],
 'dealer': 1,
 'game': 'holdem',
 'id': 1,
 'num_players': 2,
 'players': [{'action': 50,
              'bankroll': 845,
              'bets': [{'actions': ['B', 'r'], 'stage': 'p'},
                       {'actions': ['b'], 'stage': 'f'},
                       {'actions': ['k', 'c'], 'stage': 't'},
                       {'actions': ['k'], 'stage': 'r'}],
              'pocket_cards': ['9s', '9h'],
              'pos': 1,
              'user': 'GregR',
              'winnings': 0},
             {'action': 50,
              'bankroll': 860,
              'bets': [{'actions': ['B', 'c'], 'stage': 'p'},
                       {'actions': ['c'], 'stage': 'f'},
                       {'actions': ['b'], 'stage': 't'},
                       {'actions': ['k'], 'stage': 'r'}],
              'pocket_cards': ['Kc', 'Tc'],
              'pos': 2,
              'user': 'kwAAkbot',
              'winnings': 100}],
 'pots

In [9]:
# Count the lines in the dataset
count = 0
st = time.time()
with open(DATAPATH, 'r') as f:
        # print('#' * 60)
        line = f.readline()
        while line:
            hand = json.loads(line)
            count = count+1
            line = f.readline()
            
end = time.time()

print(f'We have {count} hands')
print(f"Execution time: {(end-st):.2f} seconds")

We have 18832 hands
Execution time: 0.40 seconds


### Now extract structured dataset for mixed membership models

In [46]:
class Round:
    def __init__(self):
        self.strength = 0
        self.aggression = 0

class Player:
    def __init__(self, name):
        self.name = name
        self.rounds = []

class Dataset:
    def __init__(self):
        self.data =dict()

    def addPlayer(self, player: Player):
        self.data[player.name] = player.rounds

    def updatePlayer(self, player: Player):
        self.data[player.name] = player.rounds


In [63]:
DF = Dataset()

def getHandStrength(board, hand, round):
    evaluator = Evaluator()
    myHand = []
    for card in hand:
        myHand.append(Card.new(card))

    myBoard = []
    for card in board:
        myBoard.append(Card.new(card))
    strength = evaluator.evaluate(myBoard[:round], myHand)
    # We know that trey has 7642 distinct hands ranked in a lookup table, 1 being the strongest
    gradation = 7642/STRENGTH_GRANULARITY
    strength = int(strength/gradation)
    return strength

def getHandAggression(actions):
    aggression = 0
    for action in actions:
        # Add 0 for check
        if action == 'k':
            continue
        elif action == 'c':
            aggression += 1
        elif action == 'r':
            aggression += 2
        elif action == 'f':
            continue
    # Cap aggression at 10
    if aggression > 10:
        aggression = 10
    return aggression

def getRounds(numCards: int):
    if numCards == 0:
        return 1
    elif numCards == 3:
        return 2
    elif numCards == 4:
        return 3
    elif numCards == 5:
        return 4
    else:
        return 0
    

def addToDataset(hand, DF):
    boardCards = hand['board']
    numberOfRounds = getRounds(len(boardCards))
    if numberOfRounds == 0:
        return None
    numberOfPlayers = len(hand['players'])

    for playerIndex in range(numberOfPlayers):
        name = hand['players'][playerIndex]['user']
        if hand['players'][playerIndex]['user'] not in DF.data:
        
            player = Player(name)
            DF.addPlayer(player)
        else:
            player = DF.data[hand['players'][playerIndex]['user']]
       
        for roundIndex in range(1, numberOfRounds):
            if roundIndex == 1:
                assert(hand['players'][playerIndex]['bets'][roundIndex]['stage'] == 'f')

                # Create a new round
                round = Round()

                # Calculate strength
                round.strength = getHandStrength(boardCards, hand['players'][playerIndex]['pocket_cards'], 3)
                
                # Calculate aggression
                round.aggression = getHandAggression(hand['players'][playerIndex]['bets'][roundIndex]['actions'])
            elif roundIndex == 2:
                assert(hand['players'][playerIndex]['bets'][roundIndex]['stage'] == 't')

                # Create a new round
                round = Round()

                # Calculate strength
                round.strength = getHandStrength(boardCards, hand['players'][playerIndex]['pocket_cards'], 4)
                
                # Calculate aggression
                round.aggression = getHandAggression(hand['players'][playerIndex]['bets'][roundIndex]['actions'])
            elif roundIndex == 3:
                assert(hand['players'][playerIndex]['bets'][roundIndex]['stage'] == 'r')

                # Create a new round
                round = Round()

                # Calculate strength
                round.strength = getHandStrength(boardCards, hand['players'][playerIndex]['pocket_cards'], 5)
                
                # Calculate aggression
                round.aggression = getHandAggression(hand['players'][playerIndex]['bets'][roundIndex]['actions'])
            
            # Add the round to the player
            DF.data[name].append(round)


            





In [64]:
def extractDataset(DATAPATH, DF):
    linecount = 0
    with open(DATAPATH, 'r') as f:
        # print('#' * 60)
        line = f.readline()

        while line:
            linecount += 1
            hand = json.loads(line)
            addToDataset(hand, DF)
            line = f.readline()
            if linecount % 1000 == 0:
                print(f"Processed {linecount} hands")
    return DF

In [65]:
DF = Dataset()
extractDataset(DATAPATH, DF)

Processed 1000 hands
Processed 2000 hands
Processed 3000 hands
Processed 4000 hands
Processed 5000 hands
Processed 6000 hands
Processed 7000 hands
Processed 8000 hands
Processed 9000 hands
Processed 10000 hands
Processed 11000 hands
Processed 12000 hands
Processed 13000 hands
Processed 14000 hands
Processed 15000 hands
Processed 16000 hands
Processed 17000 hands
Processed 18000 hands


<__main__.Dataset at 0x7fa2360e15e0>

In [20]:
def getHandStrength(board, hand, round):
    evaluator = Evaluator()
    strength = evaluator.evaluate(board[:round], hand)
    # We know that trey has 7642 distinct hands ranked in a lookup table, 1 being the strongest
    gradation = 7642/STRENGTH_GRANULARITY
    strength = int(strength/gradation)
    return strength

cards = hand['players'][0]['pocket_cards']
myHand = []
for card in cards:
    myHand.append(Card.new(card))

board = []
for card in hand['board']:
    board.append(Card.new(card))
evaluator = Evaluator()
strength = evaluator.evaluate(board[:3], myHand)

# We know that trey has 7642 distinct hands ranked in a lookup table, 1 being the strongest
gradation = 7642/STRENGTH_GRANULARITY
strength = int(strength/gradation)
print(f"Strength: {strength}")




Strength: 4


In [66]:
import pickle

with open('data.pickle', 'wb') as handle:
    pickle.dump(DF, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [67]:
len(DF.data)

2283

In [71]:
numRounds = 0
for player, rounds in DF.data.items():
    numRounds += len(rounds)
print(f"Number of rounds: {numRounds}")

Number of rounds: 119802


In [72]:
INDEX_TO_NAME = {}
for index, player in enumerate(DF.data):
    INDEX_TO_NAME[index] = player



In [74]:
# Save the index to name mapping
with open('indexToName.pickle', 'wb') as handle:
    pickle.dump(INDEX_TO_NAME, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [77]:
# Print length of rounds for 1 player:
for player, rounds in DF.data.items():
    print(f"Player: {player} | Rounds: {len(rounds)}")
    break

Player: GregR | Rounds: 57


In [127]:
# Print number of players
print(f"Number of players: {len(DF.data.keys())}")

Number of players: 2283


In [78]:
# Calculate average number of rounds per player
totalRounds = 0
for player, rounds in DF.data.items():
    totalRounds += len(rounds)
print(f"Average number of rounds per player: {totalRounds/len(DF.data)}")

Average number of rounds per player: 52.47568988173456


In [80]:
# Make data more usable
POKER_DATA = {}
for player, rounds in DF.data.items():
    POKER_DATA[player] = []
    for round in rounds:
        POKER_DATA[player].append((round.strength, round.aggression))


In [81]:
with open('pokerData.pickle', 'wb') as handle:
    pickle.dump(POKER_DATA, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Now we construct a Mixture Model over this Poker Dataset

In [82]:
from collections import Counter
import random
import re
import matplotlib.pyplot as plt
from re import RegexFlag

In [83]:
# Get our dataset
with open('pokerData.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [128]:
# Print number of players
print(f"Number of players: {len(data.keys())}")

Number of players: 2283


In [112]:
class MixtureModel:
    '''
    We follow a modelling approach very similar to LDA, a topic model designed for text documents.
 
    This model allows us to extract playing styles for each player from a dataset of plays. 
    Each style can be described by a list of parameters from most to least involved in the players style. 
    Then, each player can be connected to these style, to determine
    how representative that player is of that style, and also which style does he/she favor more.
    '''

    def __init__(self, K, max_iteration):
        """
        K represents the number of styles we want to extract from the dataset.
        max_iteration represents the maximum number of iterations we want to run the algorithm for.
        """
        self.K = K
        self.max_iteration = max_iteration

    def sample_from_weights(self, weights):
        '''
        We use this function to randomly choose an index based some set of weights.
        Return the index of the first weight that is more or the same as a random sampled number.
        '''
        total = sum(weights)
        rnd = total * random.random()  # uniform between 0 and total
        for i, w in enumerate(weights):
            rnd -= w  # return the smallest i such that
            if rnd <= 0: return i  # sum(weights[:(i+1)]) >= rnd

    def p_topic_given_document(self, topic, d, alpha=0.1):
        '''
        P(topic|d,Alpha)
        The fraction of words in document d
        that are assigned to topic (plus some smoothing)
        '''    
        return ((self.document_topic_counts[d][topic] + alpha) / 
                (self.document_lengths[d] + self.K * alpha))

    def p_word_given_topic(self, word, topic, beta=0.1):
        '''
        P(word|topic,Beta)
        The fraction of words assigned to topic
        that equal word (plus some smoothing)
        '''    
        return ((self.topic_word_counts[topic][word] + beta) / 
                (self.topic_counts[topic] + self.W * beta))

    def topic_weight(self, d, word, topic):
        '''
        P(topic|word,Alpha,Beta) = P(topic|d,Alpha) * P(word|topic,Beta)
        Given a document and a word in that document,
        return the weight for the k-th topic
        '''    
        return self.p_word_given_topic(word, topic) * self.p_topic_given_document(topic, d)

    def choose_new_topic(self, d, word):
        return self.sample_from_weights([self.topic_weight(d, word, k)
                            for k in range(self.K)])
    
    def gibbs_sample(self, document_topics, documents):
        '''
        Gibbs sampling https://en.wikipedia.org/wiki/Gibbs_sampling.
        '''
        iterations = 0
        for _ in range(self.max_iteration):
            start = time.time()
            for d in range(self.D):
                for i, (word, topic) in enumerate(zip(documents[d],
                                                      document_topics[d])):        
                    # remove this word / topic from the counts
                    # so that it doesn't influence the weights
                    self.document_topic_counts[d][topic] -= 1
                    self.topic_word_counts[topic][word] -= 1
                    self.topic_counts[topic] -= 1
                    self.document_lengths[d] -= 1
        
                    # choose a new topic based on the weights
                    new_topic = self.choose_new_topic(d, word)
                    document_topics[d][i] = new_topic
        
                    # and now add it back to the counts
                    self.document_topic_counts[d][new_topic] += 1
                    self.topic_word_counts[new_topic][word] += 1
                    self.topic_counts[new_topic] += 1
                    self.document_lengths[d] += 1
                
            iterations += 1
            if iterations % 50 == 0:
                end = time.time()
                print(f"Iteration: {iterations}")
                print("Average time per iteration: ", (end-start)/50)
                print("Estimated time remaining: ", (end-start)/50 * (self.max_iteration - iterations))

    def run(self, documents):  
        # How many times each topic is assigned to each document.
        self.document_topic_counts = [Counter()
                             for _ in documents]
        
        # How many times each word is assigned to each topic.
        self.topic_word_counts = [Counter() for _ in range(self.K)]
        # The total number of words assigned to each topic.
        self.topic_counts = [0 for _ in range(self.K)]
        # The total number of words contained in each document.
        self.document_lengths = [len(d) for d in documents]        
        self.distinct_words = set(word for document in documents for word in document)
        # The number of distinct words
        self.W = len(self.distinct_words)
        for i in self.distinct_words:
            print("Example of a word: ", i)
            break
        
        print("Number of distinct words: ", self.W)
        # The number of documents
        self.D = len(documents)      
        print("Number of players: ", self.D)
        # document_topics is a Collection that assign a topic (number between 0 and K-1) to each word in each document.
        # For example: document_topic[3][4] -> [4 document][id of topic assigned to 5 word]
        # This collection defines each document's distribution over topics, and
        # implicitly defines each topic's distribution over words.
        document_topics = [[random.randrange(self.K) for word in document]
                           for document in documents]
        
        for d in range(self.D):
            for word, topic in zip(documents[d], document_topics[d]):
                self.document_topic_counts[d][topic] += 1
                self.topic_word_counts[topic][word] += 1
                self.topic_counts[topic] += 1
        
        self.gibbs_sample(document_topics, documents)
        
        return(self.topic_word_counts, self.document_topic_counts)

    def plot_words_clouds_topic(self, topic_names, plt):
        for topic in range(self.K):
            data = []   
            text = ""
            for word, count in self.topic_word_counts[topic].most_common():
                if count > 1: 
                    data.append(str(word)) 
            text = ' '.join(data)
            # Generate a word cloud image
            wordcloud = WordCloud().generate(text)  
            plt.figure()
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.title("Topic #" + str(topic_names[topic]))
            plt.show()     

    
    


In [110]:
# Set random seed
random.seed(0)
K = 3 
max_iteration = 300

import numpy as np

# Convert data to list
mixtureData = list(data.values())

In [111]:
lda = MixtureModel(K, max_iteration)
lda.run(mixtureData) 
lda.plot_words_clouds_topic(data.keys(), plt) 

Example of a word:  (4, 0)
Number of distinct words:  47
Number of players:  2283
Iteration: 50
Average time per iteration:  0.016253142356872557
Estimated time remaining:  4.06328558921814
Iteration: 100
Average time per iteration:  0.016283836364746094
Estimated time remaining:  3.2567672729492188
Iteration: 150
Average time per iteration:  0.016137642860412596
Estimated time remaining:  2.420646429061889
Iteration: 200
Average time per iteration:  0.016097378730773926
Estimated time remaining:  1.6097378730773926
Iteration: 250
Average time per iteration:  0.01603353977203369
Estimated time remaining:  0.8016769886016845
Iteration: 300
Average time per iteration:  0.01596564769744873
Estimated time remaining:  0.0


TypeError: sequence item 0: expected str instance, tuple found

In [134]:
with open('lda_k3.pickle', 'wb') as handle:
    pickle.dump(lda, handle, protocol=pickle.HIGHEST_PROTOCOL)

PicklingError: Can't pickle <class '__main__.MixtureModel'>: it's not the same object as __main__.MixtureModel