## Data Description

You are provided with 25,010 poker hands in train.csv and 1,000,000 in test.csv. Each hand consists of five cards with a given suit and rank, drawn from a standard deck of 52. Suits and ranks are represented as ordinal categories:

```
S1 “Suit of card #1”
Ordinal (1-4) representing {Hearts, Spades, Diamonds, Clubs}
C1 “Rank of card #1”
Numerical (1-13) representing (Ace, 2, 3, ... , Queen, King)

...

S5 “Suit of card #5”
C5 “Rank of card #5”
```

Each row in the training set has the accompanying class label for the poker hand it comprises. The hands are omitted from the test set and must be predicted by participants. Hands are classified into the following ordinal categories:


```
0: Nothing in hand; not a recognized poker hand 
1: One pair; one pair of equal ranks within five cards
2: Two pairs; two pairs of equal ranks within five cards
3: Three of a kind; three equal ranks within five cards
4: Straight; five cards, sequentially ranked with no gaps
5: Flush; five cards with the same suit
6: Full house; pair + different rank three of a kind
7: Four of a kind; four equal ranks within five cards
8: Straight flush; straight + flush
9: Royal flush; {Ace, King, Queen, Jack, Ten} + flush
```
Note that the Straight flush and Royal flush hands are not representative of
the true domain because they have been over-sampled. The straight flush
is 14.43 times more likely to occur in the training set, while the royal flush is 129.82 times more likely.

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from functools import reduce
import random

## Utils

#### Test function

In [4]:
def calc_hand_label(hand):    
    def f(hand):
        suits_hist = reduce(lambda d,x: {**d, **{x: (d.get(x, 0) + 1)}}, hand[:,0], {})
        ranks_hist = reduce(lambda d,x: {**d, **{x: (d.get(x, 0) + 1)}}, hand[:,1], {})
        
        if len(ranks_hist.values()) < 5:
            if len(ranks_hist.values()) == 2:
                if max(list(ranks_hist.values())) == 4:
                    return 7
    #                 print('7: Four of a kind; four equal ranks within five cards')
                else:
                    return 6
    #                 print('6 Full house; pair + different rank three of a kind')
            elif len(ranks_hist.values()) == 3:
                if max(list(ranks_hist.values())) == 3:
                    return 3
    #                 print('3: Three of a kind; three equal ranks within five cards')
                else:
                    return 2
    #                 print('2: Two pairs; two pairs of equal ranks within five cards')
            else:
                return 1
    #             print('1: One pair; one pair of equal ranks within five cards')
        else:
            if len(suits_hist.values()) == 1:
                if max(list(ranks_hist.keys())) -  min(list(ranks_hist.keys())) == 4:
                    if max(list(ranks_hist.keys())) == 13:
                        return 9
    #                     print('9: Royal flush; {Ace, King, Queen, Jack, Ten} + flush')
                    else:
                        return 8
    #                     print('8: Straight flush; straight + flush')
                else:
                    return 5
    #                 print('5: Flush; five cards with the same suit')
            elif max(list(ranks_hist.keys())) -  min(list(ranks_hist.keys())) == 4:
                return 4    
    #             print('4: Straight; five cards, sequentially ranked with no gaps')
            else:
                return 0
                print('0: Nothing in hand; not a recognized poker hand ')
                
    hand = np.array(hand).reshape(5,2)
    res1 = f(hand)
    
    hand[:,1] = list(map(lambda x: 13 if (x == 1) else x-1, hand[:,1]))
    res2 = f(hand)
    
    if res1 == 9 and res2 == 8:
        return res2
    else:
        return max(res1, res2)

#### Encode cards

In [3]:
def card_encode(s, c):
    cards=["A","2","3","4","5","6","7","8","9","10","J","Q","K"]
    suits="♥♦♣♠"
    return cards[c - 1] + suits[s - 1]

def hand_encode(hand):
    return ', '.join(map(lambda x: card_encode(hand[x*2], hand[x*2+1]),range(5)))

print(hand_encode([4,9,2,1,2,2,4,7,2,8]))
print(hand_encode([2,9,2,4,3,6,1,9,4,9]))

9♠, A♦, 2♦, 7♠, 8♦
9♦, 4♦, 6♣, 9♥, 9♠


## Train data

In [4]:
def shuffle_df(data):
    data = data.copy()
    for index, row in data.iterrows():
        rank = -1
    
        row['S1'] = random.randint(1,4)
        row['S2'] = random.randint(1,4)
        row['S3'] = random.randint(1,4)
        row['S4'] = random.randint(1,4)
        row['S5'] = random.randint(1,4)
        row['C1'] = random.randint(1,13)
        row['C2'] = random.randint(1,13)
        row['C3'] = random.randint(1,13)
        row['C4'] = random.randint(1,13)
        row['C5'] = random.randint(1,13)
        row['hand'] = calc_hand_label(row[['S1', 'C1','S2', 'C2','S3', 'C3','S4', 'C4','S5', 'C5']])
    return data

In [5]:
suits = ['S1','S2','S3','S4','S5']
cards = ['C1','C2','C3','C4','C5']
distances = ['D1', 'D2', 'D3', 'D4', 'D5']

In [6]:
train_dataset = pd.read_csv('train.csv', delimiter=',')

In [7]:
train_dataset = train_dataset\
.append(shuffle_df(train_dataset),ignore_index=True)\
.append(shuffle_df(train_dataset),ignore_index=True)\
.append(shuffle_df(train_dataset),ignore_index=True)\

In [28]:
train_dataset.to_csv('./big_train_dataset.csv')

In [8]:
train_dataset = pd.read_csv('./big_train_dataset.csv').drop('Unnamed: 0', axis=1)

In [9]:
train_dataset.shape

(100040, 11)

In [10]:
data = train_dataset.drop('hand', axis=1)
train_labels = train_dataset['hand']

In [11]:
labels_vect = tf.keras.utils.to_categorical(train_labels)

#### Adding distance features

In [5]:
def get_distances(hand):
    distances_list = []
    for i, card in enumerate(hand):
        if (i < len(hand) - 1):
            distances_list.append(hand[i+1] - card)
        else:
            distances_list.append(13 - card + hand[0])
    return pd.Series(distances_list)

def with_distances(data):
    sorted = data[cards]
    sorted.values.sort()

    distances_df = sorted.apply(get_distances, axis=1)
    distances_df.columns = distances
    
    data_with_distances = pd.concat([data, distances_df], axis=1)
    return data_with_distances

In [13]:
data_with_distances = with_distances(data)
data_with_distances.head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,D1,D2,D3,D4,D5
0,4,9,2,1,2,2,4,7,2,8,1,5,1,1,5
1,1,4,3,6,1,12,3,11,2,7,2,1,4,1,5
2,1,11,4,1,3,7,4,11,2,1,0,6,4,0,3
3,2,9,2,4,3,6,1,9,4,9,2,3,0,0,8
4,1,8,2,4,2,11,2,2,2,1,1,2,4,3,3


#### One hot encoding

In [6]:
def one_hot_encoded(data, columns, encoder, mode='transform'):
    encoded = None
    if (mode == 'fit'):
        encoded = encoder.fit_transform(data[columns])
    elif (mode == 'transform'):
        encoded = encoder.transform(data[columns])
    return pd.concat([data.drop(columns, axis=1), pd.DataFrame(encoded.toarray())], axis=1)

In [15]:
encoder = OneHotEncoder()
data_encoded = one_hot_encoded(data_with_distances, suits, encoder, 'fit')
print(data_encoded.shape)

data_encoded.head()

(100040, 30)


Unnamed: 0,C1,C2,C3,C4,C5,D1,D2,D3,D4,D5,...,10,11,12,13,14,15,16,17,18,19
0,9,1,2,7,8,1,5,1,1,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,4,6,12,11,7,2,1,4,1,5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,11,1,7,11,1,0,6,4,0,3,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,9,4,6,9,9,2,3,0,0,8,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,8,4,11,2,1,1,2,4,3,3,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Normalization

In [7]:
def normalized(data, columns, mean, std):
    data_normalized = (data[columns] - mean) / std
    to_return = pd.concat([data_normalized, data.drop(columns, axis=1)], axis=1)
    return to_return

In [17]:
mean_c = data_encoded[cards].mean().mean()
std_c = pd.Series(data_encoded[cards].values.flatten()).std()
mean_d = data_encoded[distances].mean().mean()
std_d = pd.Series(data_encoded[distances].values.flatten()).std()

data_normalized = normalized(data_encoded, cards, mean_c, std_c)
data_normalized = normalized(data_normalized, distances, mean_d, std_d)
data_normalized.head()

Unnamed: 0,D1,D2,D3,D4,D5,C1,C2,C3,C4,C5,...,10,11,12,13,14,15,16,17,18,19
0,-0.748732,1.123098,-0.748732,-0.748732,1.123098,0.53438,-1.603162,-1.335969,-6e-06,0.267187,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-0.280775,-0.748732,0.655141,-0.748732,1.123098,-0.801584,-0.267199,1.335958,1.068765,-6e-06,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-1.21669,1.591056,0.655141,-1.21669,0.187183,1.068765,-1.603162,-6e-06,1.068765,-1.603162,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,-0.280775,0.187183,-1.21669,-1.21669,2.526971,0.53438,-0.801584,-0.267199,0.53438,0.53438,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.748732,-0.280775,0.655141,0.187183,0.187183,0.267187,-0.801584,1.068765,-1.335969,-1.603162,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [18]:
data_normalized.shape

(100040, 30)

## Neural Network

In [28]:
from time import time
from keras.callbacks import TensorBoard

model = keras.Sequential()
#input
model.add(keras.layers.Dense(120, activation='relu', input_shape=(30,)))
model.add(keras.layers.Dropout(0.2))
#hidden
model.add(keras.layers.Dense(240, activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(120, activation='relu'))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(120, activation='relu'))
model.add(keras.layers.Dense(60, activation='relu'))
# Add a softmax layer with 10 output units:
model.add(keras.layers.Dense(10, activation='softmax'))

model.compile(optimizer=tf.train.AdamOptimizer(0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

model.fit(np.array(data_normalized), np.array(labels_vect), epochs=300, batch_size=512)

Epoch 1/300

Epoch 2/300

Epoch 3/300

Epoch 4/300

Epoch 5/300

Epoch 6/300

Epoch 7/300

Epoch 8/300

Epoch 9/300

Epoch 10/300

Epoch 11/300

Epoch 12/300

Epoch 13/300

Epoch 14/300

Epoch 15/300

Epoch 16/300

Epoch 17/300

Epoch 18/300

Epoch 19/300

Epoch 20/300

Epoch 21/300

Epoch 22/300

Epoch 23/300

Epoch 24/300

Epoch 25/300

Epoch 26/300

Epoch 27/300

Epoch 28/300

Epoch 29/300

Epoch 30/300

Epoch 31/300

Epoch 32/300

Epoch 33/300

Epoch 34/300

Epoch 35/300

Epoch 36/300

Epoch 37/300

Epoch 38/300

Epoch 39/300

Epoch 40/300

Epoch 41/300

Epoch 42/300

Epoch 43/300

Epoch 44/300

Epoch 45/300

Epoch 46/300

Epoch 47/300

Epoch 48/300

Epoch 49/300

Epoch 50/300

Epoch 51/300

Epoch 52/300

Epoch 53/300

Epoch 54/300

Epoch 55/300

Epoch 56/300

Epoch 57/300

Epoch 58/300

Epoch 59/300

Epoch 60/300

Epoch 61/300

Epoch 62/300

Epoch 63/300

Epoch 64/300

Epoch 65/300

Epoch 66/300

Epoch 67/300

Epoch 68/300

Epoch 69/300

Epoch 70/300

Epoch 71/300

Epoch 72/300

E


Epoch 108/300

Epoch 109/300

Epoch 110/300

Epoch 111/300

Epoch 112/300

Epoch 113/300

Epoch 114/300

Epoch 115/300

Epoch 116/300

Epoch 117/300

Epoch 118/300

Epoch 119/300

Epoch 120/300

Epoch 121/300

Epoch 122/300

Epoch 123/300

Epoch 124/300

Epoch 125/300

Epoch 126/300

Epoch 127/300

Epoch 128/300

Epoch 129/300

Epoch 130/300

Epoch 131/300

Epoch 132/300

Epoch 133/300

Epoch 134/300

Epoch 135/300

Epoch 136/300

Epoch 137/300

Epoch 138/300

Epoch 139/300

Epoch 140/300

Epoch 141/300

Epoch 142/300

Epoch 143/300

Epoch 144/300

Epoch 145/300

Epoch 146/300

Epoch 147/300

Epoch 148/300

Epoch 149/300

Epoch 150/300

Epoch 151/300

Epoch 152/300

Epoch 153/300

Epoch 154/300

Epoch 155/300

Epoch 156/300

Epoch 157/300

Epoch 158/300

Epoch 159/300

Epoch 160/300

Epoch 161/300

Epoch 162/300

Epoch 163/300

Epoch 164/300

Epoch 165/300

Epoch 166/300

Epoch 167/300

Epoch 168/300

Epoch 169/300

Epoch 170/300

Epoch 171/300

Epoch 172/300

Epoch 173/300

Epoch 174


Epoch 212/300

Epoch 213/300

Epoch 214/300

Epoch 215/300

Epoch 216/300

Epoch 217/300

Epoch 218/300

Epoch 219/300

Epoch 220/300

Epoch 221/300

Epoch 222/300

Epoch 223/300

Epoch 224/300

Epoch 225/300

Epoch 226/300

Epoch 227/300

Epoch 228/300

Epoch 229/300

Epoch 230/300

Epoch 231/300

Epoch 232/300

Epoch 233/300

Epoch 234/300

Epoch 235/300

Epoch 236/300

Epoch 237/300

Epoch 238/300

Epoch 239/300

Epoch 240/300

Epoch 241/300

Epoch 242/300

Epoch 243/300

Epoch 244/300

Epoch 245/300

Epoch 246/300

Epoch 247/300

Epoch 248/300

Epoch 249/300

Epoch 250/300

Epoch 251/300

Epoch 252/300

Epoch 253/300

Epoch 254/300

Epoch 255/300

Epoch 256/300

Epoch 257/300

Epoch 258/300

Epoch 259/300

Epoch 260/300

Epoch 261/300

Epoch 262/300

Epoch 263/300

Epoch 264/300

Epoch 265/300

Epoch 266/300

Epoch 267/300

Epoch 268/300

Epoch 269/300

Epoch 270/300

Epoch 271/300

Epoch 272/300

Epoch 273/300

Epoch 274/300

Epoch 275/300

Epoch 276/300

Epoch 277/300

Epoch 278

<tensorflow.python.keras._impl.keras.callbacks.History at 0x7fc9a5a4c198>

In [36]:
model.save('./model-0')



## Test data

In [2]:
test_dataset = pd.read_csv('test.csv', delimiter=',')
test_data = test_dataset.drop('id', axis=1)

In [5]:
test_labels = test_data.apply(calc_hand_label, axis=1)

In [20]:
test_data_with_distances = with_distances(test_data)
test_data_encoded = one_hot_encoded(test_data_with_distances, suits, encoder)
test_data_normalized = normalized(test_data_encoded, cards, mean_c, std_c)
test_data_normalized = normalized(test_data_normalized, distances, mean_d, std_d)

In [30]:
test_data_normalized.to_csv('./test_data_with_distances.csv')

In [6]:
test_data_normalized = pd.read_csv('./test_data_with_distances.csv').drop('Unnamed: 0', axis=1)

In [7]:
test_data_normalized.head()

Unnamed: 0,D1,D2,D3,D4,D5,C1,C2,C3,C4,C5,...,10,11,12,13,14,15,16,17,18,19
0,-0.744891,-0.744891,1.117337,-0.279334,0.65178,0.803039,-1.331849,-1.064988,0.269317,-1.59871,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,-0.744891,-0.744891,-0.744891,1.582894,0.65178,1.603622,-0.531266,0.002456,-0.264405,-0.798127,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.279334,-0.744891,0.65178,0.186223,0.186223,-1.064988,1.0699,0.269317,-1.59871,-0.798127,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.186223,-0.744891,-0.744891,0.186223,1.117337,-0.264405,-1.064988,0.002456,0.269317,1.0699,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.279334,-1.210448,0.65178,-0.279334,1.117337,0.803039,-0.798127,-0.264405,1.336761,-0.264405,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
test_data_normalized.shape

(1000000, 30)

## Predictions

In [9]:
model = keras.models.load_model('./model-0')
model.compile(optimizer=tf.train.AdamOptimizer(0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [10]:
test_data_sample_predicted_10 = model.predict(np.array(test_data_normalized))
test_data_sample_predicted = np.array(list(map(np.argmax, test_data_sample_predicted_10)))

In [11]:
predictions_data = pd.concat([
    test_dataset,
    pd.DataFrame({'predictions': test_data_sample_predicted}),
    pd.DataFrame({'labels': test_labels}),
], axis=1)

In [13]:
predictions_data.shape

(1000000, 13)

In [12]:
wrong_answers = predictions_data[predictions_data['predictions'] != predictions_data['labels']]
wrong_answers.shape

(0, 13)

In [14]:
wrong_answers['labels'].value_counts()

Series([], Name: labels, dtype: int64)

In [15]:
wrong_answers

Unnamed: 0,id,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,predictions,labels


In [16]:
wrong_answers.drop(['id', 'predictions', 'labels'], axis=1).apply(lambda h: hand_encode(np.array(h)), axis=1)

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5


## Submission

In [31]:
submission = predictions_data[['id', 'predictions']]

In [32]:
submission = submission.rename({'predictions': 'hand'}, axis=1)
submission.index = submission['id']
submission = submission.drop('id', axis=1)
submission.head()

Unnamed: 0_level_0,hand
id,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,1


In [33]:
submission.to_csv('./submission.csv')