# Kaggle Learn-and-compete `Poker Rule Induction`

## Data Description

You are provided with 25,010 poker hands in train.csv and 1,000,000 in test.csv. Each hand consists of five cards with a given suit and rank, drawn from a standard deck of 52. Suits and ranks are represented as ordinal categories:

```
S1 “Suit of card #1”
Ordinal (1-4) representing {Hearts, Spades, Diamonds, Clubs}
C1 “Rank of card #1”
Numerical (1-13) representing (Ace, 2, 3, ... , Queen, King)

...

S5 “Suit of card #5”
C5 “Rank of card #5”
```

Each row in the training set has the accompanying class label for the poker hand it comprises. The hands are omitted from the test set and must be predicted by participants. Hands are classified into the following ordinal categories:


```
0: Nothing in hand; not a recognized poker hand 
1: One pair; one pair of equal ranks within five cards
2: Two pairs; two pairs of equal ranks within five cards
3: Three of a kind; three equal ranks within five cards
4: Straight; five cards, sequentially ranked with no gaps
5: Flush; five cards with the same suit
6: Full house; pair + different rank three of a kind
7: Four of a kind; four equal ranks within five cards
8: Straight flush; straight + flush
9: Royal flush; {Ace, King, Queen, Jack, Ten} + flush
```
Note that the Straight flush and Royal flush hands are not representative of
the true domain because they have been over-sampled. The straight flush
is 14.43 times more likely to occur in the training set, while the royal flush is 129.82 times more likely.

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from functools import reduce
from itertools import permutations

#### Read data

In [10]:
test_dataset = pd.read_csv('test.csv', delimiter=',')

In [11]:
test_dataset

Unnamed: 0,id,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5
0,1,1,10,2,2,3,3,3,8,1,1
1,2,2,13,3,5,3,7,4,6,1,4
2,3,1,3,1,11,2,8,2,1,2,4
3,4,1,6,3,3,4,7,1,8,3,11
4,5,2,10,3,4,1,6,2,12,2,6
5,6,1,4,3,10,2,11,2,6,1,7
6,7,1,10,3,8,1,4,3,11,3,9
7,8,2,11,3,8,1,1,1,11,2,3
8,9,3,4,1,1,1,3,3,5,3,6
9,10,3,12,2,1,1,3,1,2,3,10


In [12]:
train_dataset = pd.read_csv('train.csv', delimiter=',')

In [13]:
data = train_dataset[['S1', 'C1','S2', 'C2','S3', 'C3','S4', 'C4','S5', 'C5']]
labels = train_dataset[['hand']]

In [14]:
data[0:10]

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5
0,4,9,2,1,2,2,4,7,2,8
1,1,4,3,6,1,12,3,11,2,7
2,1,11,4,1,3,7,4,11,2,1
3,2,9,2,4,3,6,1,9,4,9
4,1,8,2,4,2,11,2,2,2,1
5,2,5,1,5,2,13,2,3,3,13
6,3,10,4,6,1,4,2,13,4,5
7,4,10,3,1,2,13,4,2,4,7
8,3,2,4,10,3,3,4,4,1,9
9,2,7,3,8,4,8,2,13,2,12


In [15]:
labels_vect = tf.keras.utils.to_categorical(labels)

#### Data augmentation

In [8]:
cards = ['C1','C2','C3','C4','C5']
suits = ['S1','S2','S3','S4','S5']

def augment_data (data):
    huge_data = pd.concat([data[cards], data[suits]], axis=1)
    combinations = list(permutations(cards, 5))[1:]
    
    for c in combinations:
        shuffled = data[cards].reindex(combinations[1], axis=1)
        concated = pd.concat([shuffled, data[suits]], axis=1)
        huge_data = pd.concat([huge_data, concated], axis=0)
        
    huge_data.index = list(range(huge_data.shape[0]))
    
    return huge_data

In [9]:
huge_data = augment_data(data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()


In [10]:
huge_data.shape

(3001200, 10)

#### One hot encoding

In [11]:
def data_hotencode(data, columns, encoder, mode='transform'):
    encoded = None
    if (mode == 'fit'):
        encoded = encoder.fit_transform(data[columns])
    elif (mode == 'transform'):
        encoded = encoder.transform(data[columns])
    return pd.concat([data.drop(columns, axis=1), pd.DataFrame(encoded.toarray())], axis=1)

def hand_hotencode(hand, columns, encoder):
    df = pd.DataFrame(data=[hand], columns=['S1', 'C1','S2', 'C2','S3', 'C3','S4', 'C4','S5', 'C5'])
    return data_hotencode(df, columns, encoder, 'transform')

In [12]:
encoder = OneHotEncoder()
columns_to_hotencode = ['S1', 'S2', 'S3', 'S4', 'S5']
data_encoded = data_hotencode(huge_data, columns_to_hotencode, encoder, 'fit')
print(data_encoded.shape)

data_encoded.head()

(3001200, 25)


Unnamed: 0,C1,C2,C3,C4,C5,0,1,2,3,4,...,10,11,12,13,14,15,16,17,18,19
0,9,1,2,7,8,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,4,6,12,11,7,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,11,1,7,11,1,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,9,4,6,9,9,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,8,4,11,2,1,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Normalization

In [13]:
def normalize(data, columns, mean, std):
    data_normalized = (data[columns] - mean) / std
    to_return = pd.concat([data.drop(columns, axis=1), data_normalized], axis=1)
    to_return.columns = list(range(to_return.shape[1]))
    return to_return

In [14]:
mean = data_encoded[cards].mean().mean()
std = pd.Series(data_encoded[cards].values.flatten()).std()
data_normalized = normalize(data_encoded, cards, mean, std)
data_normalized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.537111,-1.596142,-1.329485,0.003798,0.270454
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.796172,-0.262859,1.337081,1.070424,0.003798
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.070424,-1.596142,0.003798,1.070424,-1.596142
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.537111,-0.796172,-0.262859,0.537111,0.537111
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.270454,-0.796172,1.070424,-1.329485,-1.596142


In [15]:
data_normalized.to_csv('./data_aug_norm.csv')

In [2]:
data_normalized = pd.read_csv('./data_aug_norm.csv').drop('Unnamed: 0', axis=1)

#### Test function

In [5]:
def calc_hand_label(hand):    
    def f(hand):
        suits_hist = reduce(lambda d,x: {**d, **{x: (d.get(x, 0) + 1)}}, hand[:,0], {})
        ranks_hist = reduce(lambda d,x: {**d, **{x: (d.get(x, 0) + 1)}}, hand[:,1], {})
        
        if len(ranks_hist.values()) < 5:
            if len(ranks_hist.values()) == 2:
                if max(list(ranks_hist.values())) == 4:
                    return 7
    #                 print('7: Four of a kind; four equal ranks within five cards')
                else:
                    return 6
    #                 print('6 Full house; pair + different rank three of a kind')
            elif len(ranks_hist.values()) == 3:
                if max(list(ranks_hist.values())) == 3:
                    return 3
    #                 print('3: Three of a kind; three equal ranks within five cards')
                else:
                    return 2
    #                 print('2: Two pairs; two pairs of equal ranks within five cards')
            else:
                return 1
    #             print('1: One pair; one pair of equal ranks within five cards')
        else:
            if len(suits_hist.values()) == 1:
                if max(list(ranks_hist.keys())) -  min(list(ranks_hist.keys())) == 4:
                    if max(list(ranks_hist.keys())) == 13:
                        return 9
    #                     print('9: Royal flush; {Ace, King, Queen, Jack, Ten} + flush')
                    else:
                        return 8
    #                     print('8: Straight flush; straight + flush')
                else:
                    return 5
    #                 print('5: Flush; five cards with the same suit')
            elif max(list(ranks_hist.keys())) -  min(list(ranks_hist.keys())) == 4:
                return 4    
    #             print('4: Straight; five cards, sequentially ranked with no gaps')
            else:
                return 0
                print('0: Nothing in hand; not a recognized poker hand ')
                
    hand = np.array(hand).reshape(5,2)
    res1 = f(hand)
    
    hand[:,1] = list(map(lambda x: 13 if (x == 1) else x-1, hand[:,1]))
    res2 = f(hand)
    
    if res1 == 9 and res2 == 8:
        return res2
    else:
        return max(res1, res2)

#### Encode cards

In [6]:
def card_encode(s, c):
    cards=["A","2","3","4","5","6","7","8","9","10","J","Q","K"]
    suits="♥♦♣♠"
    return cards[c - 1] + suits[s - 1]

def hand_encode(hand):
    return ', '.join(map(lambda x: card_encode(hand[x*2], hand[x*2+1]),range(5)))

print(hand_encode([4,9,2,1,2,2,4,7,2,8]))
print(hand_encode([2,9,2,4,3,6,1,9,4,9]))

9♠, A♦, 2♦, 7♠, 8♦
9♦, 4♦, 6♣, 9♥, 9♠


### Solutions

1. Neural network
2. Random forest

Ousiders:
 - Genetic algorithm
 - Decision tree

### Neural Network

In [16]:
from time import time
from keras.callbacks import TensorBoard

model = keras.Sequential()
#input
model.add(keras.layers.Dense(64, activation='relu', input_shape=(25,)))
model.add(keras.layers.Dropout(0.1))
#hidden
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dropout(0.05))
model.add(keras.layers.Dense(16, activation='relu'))
# Add a softmax layer with 10 output units:
model.add(keras.layers.Dense(10, activation='softmax'))

model.compile(optimizer=tf.train.AdamOptimizer(0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

model.fit(np.array(data_normalized), np.array(labels_vect), epochs=1000, batch_size=512)

ValueError: Input arrays should have the same number of samples as target arrays. Found 3001200 input samples and 25010 target samples.

In [33]:
y_test = test_dataset.drop(['id'], axis=1).apply(calc_hand_label, axis=1)

In [39]:
model.save('./model-sort-166')



### Predictions

In [34]:
test_dataset_noid = test_dataset.drop(['id'], axis=1)

In [66]:
test_sample = test_dataset_noid
test_data_sample_encoded_1 = normalize(
    sort(
        data_hotencode(test_sample, columns_to_hotencode, encoder),
        columns_to_sort,
        False,
    ), 
    columns_to_normalize,
    mean,
    std
)

test_data_sample_encoded_2 = normalize(
    sort(
        data_hotencode(test_sample, columns_to_hotencode, encoder),
        columns_to_sort,
        True,
    ), 
    columns_to_normalize,
    mean,
    std
)

In [1]:
model1 = keras.models.load_model('./model-sort-166')
model1.compile(optimizer=tf.train.AdamOptimizer(0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model2 = keras.models.load_model('./model-sort-174')
model2.compile(optimizer=tf.train.AdamOptimizer(0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

NameError: name 'keras' is not defined

In [6]:
test_hand = [1,9,1,10,1,11,1,12,1,1]
print(hand_encode(test_hand))
prediction = model.predict(np.array(normalize(hand_hotencode(test_hand, columns_to_hotencode, encoder), columns_to_normalize, scaler)))
print(prediction[0])
plt.plot(prediction[0])

9♥, 10♥, J♥, Q♥, A♥


NameError: name 'model' is not defined

In [87]:
test_data_sample_predicted_1_10 = model1.predict(np.array(test_data_sample_encoded_1))
test_data_sample_predicted_1 = np.array(list(map(np.argmax, test_data_sample_predicted_1_10)))
test_data_sample_predicted_probs_1 = np.array(list(map(np.max, test_data_sample_predicted_1_10)))
test_data_sample_predicted_2_10 = model2.predict(np.array(test_data_sample_encoded_2))
test_data_sample_predicted_2 = np.array(list(map(np.argmax, test_data_sample_predicted_2_10)))
test_data_sample_predicted_probs_2 = np.array(list(map(np.max, test_data_sample_predicted_2_10)))

In [103]:
preds = np.concatenate(
    (
        test_data_sample_predicted_1.reshape(len(test_data_sample_predicted_1), 1),
        test_data_sample_predicted_2.reshape(len(test_data_sample_predicted_2), 1)
    ),
    axis=1
)

probs = np.concatenate(
    (
        test_data_sample_predicted_probs_1.reshape(len(test_data_sample_predicted_probs_1), 1),
        test_data_sample_predicted_probs_2.reshape(len(test_data_sample_predicted_probs_2), 1)
    ),
    axis=1
)

indices = np.array(list(map(np.argmax, probs))).reshape(1000000,1)

In [104]:
indices.shape

(1000000, 1)

In [114]:
test_data_sample_predicted = []
for p, index in zip(preds, indices):
    test_data_sample_predicted.append(p[index[0]])

In [116]:
#test_data_sample_predicted = np.maximum(test_data_sample_predicted_1, test_data_sample_predicted_2)
#test_data_sample_predicted = test_data_sample_predicted_2

test_data_sample_labels = y_test
df = pd.DataFrame({
    'predicted_1': test_data_sample_predicted_1,
    'predicted_2': test_data_sample_predicted_2,
    'predicted_probs_1': test_data_sample_predicted_probs_1,
    'predicted_probs_2': test_data_sample_predicted_probs_2,
    'predicted': test_data_sample_predicted, 
    'real': test_data_sample_labels
})
df = pd.concat([test_sample, df], axis=1)
wrong_answers = df[df['predicted'] != df['real']]
print(wrong_answers.shape)
wrong_answers

(24, 16)


Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,predicted_1,predicted_2,predicted_probs_1,predicted_probs_2,predicted,real
14657,4,1,1,5,3,4,3,3,1,2,4,0,0.996941,0.999503,0,4
153085,4,4,1,1,4,2,1,3,1,5,4,0,0.997092,0.998551,0,4
189896,2,12,4,12,2,11,3,12,1,12,6,6,0.843938,0.700288,6,7
325202,2,8,2,9,3,9,4,9,1,9,3,7,0.911571,0.635526,3,7
333453,1,11,1,10,2,11,4,11,3,11,6,6,0.542889,0.630375,6,7
473784,3,3,4,5,3,1,3,2,3,4,4,0,0.991499,0.997622,0,4
511090,1,13,3,13,3,11,4,13,2,13,3,6,0.470374,0.514752,6,7
519829,3,1,3,2,4,3,3,4,3,5,4,0,0.99114,0.999485,0,4
561476,4,11,3,11,1,10,2,11,1,11,6,7,0.982283,0.537851,6,7
583292,4,5,1,4,3,3,1,1,2,2,4,0,0.998962,0.999341,0,4


In [109]:
wrong_answers['real'].value_counts()

7    15
4     8
8     1
Name: real, dtype: int64

In [117]:
wrong_answers.drop(['predicted_1', 'predicted_2', 'predicted_probs_1', 'predicted_probs_2', 'predicted', 'real'], axis=1).apply(lambda h: hand_encode(np.array(h)), axis=1)

14657      A♠, 5♥, 4♣, 3♣, 2♥
153085     4♠, A♥, 2♠, 3♥, 5♥
189896     Q♦, Q♠, J♦, Q♣, Q♥
325202     8♦, 9♦, 9♣, 9♠, 9♥
333453    J♥, 10♥, J♦, J♠, J♣
473784     3♣, 5♠, A♣, 2♣, 4♣
511090     K♥, K♣, J♣, K♠, K♦
519829     A♣, 2♣, 3♠, 4♣, 5♣
561476    J♠, J♣, 10♥, J♦, J♥
583292     5♠, 4♥, 3♣, A♥, 2♦
585044     A♥, A♦, A♣, A♠, K♣
628440     9♦, 6♠, 9♣, 9♠, 9♥
640832     K♣, K♥, K♦, K♠, Q♣
654165     K♣, K♥, K♠, K♦, Q♦
691555     5♠, A♥, 4♠, 2♥, 3♥
692475     Q♦, Q♠, Q♥, 9♠, Q♣
723435     A♠, 2♣, 5♣, 4♣, 3♠
763760     3♠, 2♣, A♣, 5♣, 4♠
819607     Q♣, Q♥, Q♠, 9♦, Q♦
845678     K♦, J♠, K♣, K♠, K♥
888126     9♥, 9♦, 9♣, 9♠, 8♠
912142    10♠, Q♠, 9♠, J♠, K♠
925292     K♦, K♥, K♠, Q♠, K♣
932343     7♥, 7♦, 7♠, 6♠, 7♣
dtype: object