# How to use this

Run each cell from top to bottom. 
See README.md for more information.

In [7]:
# Init global infos
import tensorflow as tf
import numpy as np

from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Dense, Dropout, Activation

from tensorflow.keras.optimizers import SGD

from tensorflow.keras.utils import to_categorical

import pandas as pd

In [8]:
# Load the poker-hand training set. index_col=False tells pandas not to use
# the first column as the row index.
pokerdf = pd.read_csv('train.csv', index_col=False)

In [9]:
# Keep a handle on the column labels of the loaded frame.
pokercols = pokerdf.columns

In [10]:
# Input schema for the poker model: one (column name, allowed values) pair per
# feature column. Order matters: entry i describes feature column i.
# Suit columns S1..S5 list their four category codes; rank columns C1..C5 are
# tagged with the single marker "continuous".
inputs = tuple(
    spec
    for card in ('1', '2', '3', '4', '5')
    for spec in (
        ("S" + card, ('1', '2', '3', '4')),
        ("C" + card, ("continuous",)),
    )
)

In [11]:
# Encoded width of each feature: a categorical column takes one slot per
# allowed value, a continuous column takes a single slot.
input_shape = [len(allowed_values) for _, allowed_values in inputs]
# Total number of values fed to the model per row.
input_dim = sum(input_shape)
print("input_shape:", input_shape)
print("input_dim:", input_dim)

input_shape: [4, 1, 4, 1, 4, 1, 4, 1, 4, 1]
input_dim: 25


In [12]:
# Class labels are the integers 0..9; the model gets one output unit per label.
outputs = tuple(range(10))
output_dim = len(outputs)
print("output_dim:", output_dim)
print()

output_dim: 10



In [13]:
# Helpful function 2    
    
def find_means_for_continuous_types(X):
    """Return the per-column mean of the float entries of a 2-D array.

    Only entries that are instances of ``float`` contribute to a column's
    mean; every other entry (for example strings or plain integers) is
    ignored.

    Args:
        X: 2-D NumPy array, indexed as ``X[:, col]``.

    Returns:
        list: one mean per column, in column order. A column without any
        float entry yields ``0.0``. An array with no rows yields ``[]``.
    """
    if len(X) == 0:
        # No rows: there is no X[0] to read the column count from.
        return []

    means = []
    for col in range(len(X[0])):
        total = 0.0
        n_floats = 0
        for value in X[:, col]:
            if isinstance(value, float):
                total += value
                n_floats += 1
        # Explicit guard for "no float in this column" instead of seeding the
        # counter with a tiny epsilon to avoid dividing by zero.
        means.append(total / n_floats if n_floats else 0.0)
    return means

## Casting card ranks to float and suits to str

In [14]:
# Rank columns become floats and suit columns become strings, so downstream
# code can tell continuous values from categorical ones by their Python type.
rank_columns = ['C1', 'C2', 'C3', 'C4', 'C5']
suit_columns = ['S1', 'S2', 'S3', 'S4', 'S5']
pokerdf[rank_columns] = pokerdf[rank_columns].astype('float64')
pokerdf[suit_columns] = pokerdf[suit_columns].astype('str')

In [22]:
# Helpful function 3
def prepare_data(raw_data, means, num_classes=10):
    """Turn raw rows into a numeric feature matrix and one-hot labels.

    Every column of ``raw_data`` except the last is treated as a feature; the
    last column is the class label. Features are encoded according to the
    module-level ``inputs`` / ``input_shape`` schema:

    * a feature whose ``input_shape`` entry is 1 is continuous and is scaled
      by ``1 / (2 * means[i])``;
    * any other feature is one-hot encoded against its allowed values in
      ``inputs[i][1]``.

    Args:
        raw_data: 2-D array, one row per sample, label in the last column.
        means: sequence indexed by feature column position. Only the entries
            for continuous columns are read. This argument is the one
            actually used; the module-level ``means`` is not consulted.
        num_classes: number of label classes for the one-hot encoding.
            Defaults to 10.

    Returns:
        tuple: ``(new_X, new_y)`` where ``new_X`` is the NumPy array of
        encoded rows and ``new_y`` is the result of ``to_categorical`` on the
        label column.
    """
    X = raw_data[:, :-1]
    y = raw_data[:, -1:]

    def _encode_row(row_values):
        """Encode one row of raw feature values as a flat list of floats."""
        encoded = []
        for i, features_of_this_type in enumerate(input_shape):
            if features_of_this_type == 1:
                # Continuous feature: dividing by twice the column mean maps
                # a value equal to the mean to 0.5. A zero mean makes this
                # division fail, so continuous columns need a non-zero mean.
                scale_factor = 1 / (2 * means[i])
                encoded.append(row_values[i] * scale_factor)
            else:
                # Categorical feature: one slot per allowed value, set to 1
                # for the value that matches this row and 0 elsewhere.
                allowed_values = inputs[i][1]
                for j in range(features_of_this_type):
                    encoded.append(1. if allowed_values[j] == row_values[i] else 0.)
        return encoded

    new_X = np.array([_encode_row(row) for row in X])

    new_y = to_categorical(y, num_classes=num_classes)

    return (new_X, new_y)

## Computing column means for the poker data

In [16]:
# Export the frame as a plain NumPy array (rows = samples, columns = the
# frame's columns in order) for the array-based helpers below.
hartrain = pokerdf.to_numpy()

In [17]:
# One mean per column of hartrain; columns without float values come back as 0.0.
means = find_means_for_continuous_types(hartrain)
print("Mean values for data types (if continuous):", means)

Mean values for data types (if continuous): [0.0, 6.995241903238704, 0.0, 7.014194322271091, 0.0, 7.014154338264694, 0.0, 6.9424630147940825, 0.0, 6.962734906037585, 0.0]


In [23]:
# Encode the raw rows into the model's input matrix and one-hot label matrix.
X_train, y_train = prepare_data(hartrain, means)

In [26]:
# Show one encoded training row to illustrate the input format.
print("Training data format example:")
print(X_train[4])  # row 4 is just an arbitrary example row

Training data format example:
[1.         0.         0.         0.         0.57181725 0.
 1.         0.         0.         0.2851361  0.         1.
 0.         0.         0.78412874 0.         1.         0.
 0.         0.1440411  0.         1.         0.         0.
 0.07181086]


In [39]:
# Init model: one hidden ReLU layer followed by a softmax classifier.

mid_dim = 100  # width of the hidden layer

model = Sequential()

model.add(Dense(mid_dim, input_dim=input_dim, activation='relu'))
# Softmax turns the output units into a probability distribution over the
# classes, which is what categorical cross-entropy on one-hot labels expects.
# A ReLU output layer can emit all zeros, which stalls training.
model.add(Dense(output_dim, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])


In [40]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 100)               2600      
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1010      
Total params: 3,610
Trainable params: 3,610
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Before training: report the shape of the training matrix.

print("(training_datas, dimension):", X_train.shape)

(training_datas, dimension): (25010, 25)


In [42]:
# Train for 10 epochs in mini-batches of 128, holding out 20% of the rows
# for validation.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x13e8ed190>