## Load the dataset

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load Data
data = pd.read_pickle('mallika_combined.pkl')

# Preprocess Data: Convert boards into 6x7x2 representation
def preprocess_data(data):
    boards = []
    labels = []
    for _, item in data.iterrows():
        board = np.array(item['board'])

        # Convert to 6x7x2 format
        if board.shape == (6, 7, 2):
            board_6x7x2 = board
        else:
            board_6x7x2 = np.zeros((6, 7, 2))
            board_6x7x2[:, :, 0] = (board == 1).astype(int)
            board_6x7x2[:, :, 1] = (board == -1).astype(int)

        label = item['recommended_column']
        boards.append(board_6x7x2)
        labels.append(label)

    boards = np.array(boards)
    labels = np.array(labels)

    # One-hot encode labels (7 possible moves)
    labels = tf.keras.utils.to_categorical(labels, num_classes=7)
    return boards, labels

# Get processed data
boards, labels = preprocess_data(data)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(boards, labels, test_size=0.2, random_state=42)

# Reshape Data for Transformer (Flatten the board)
num_samples, n, m, channels = X_train.shape  # (num_samples, 6, 7, 2)
X_train = X_train.reshape(num_samples, n * m, channels)
X_val = X_val.reshape(X_val.shape[0], n * m, channels)

# Debugging Step: Check shape before proceeding
print(f"X_train shape: {X_train.shape}")  # Expected: (num_samples, 42, 2)
print(f"y_train shape: {y_train.shape}")  # Expected: (num_samples, 7)


X_train shape: (128641, 42, 2)
y_train shape: (128641, 7)


In [7]:
import tensorflow as tf

class PositionalIndex(tf.keras.layers.Layer):
    def call(self, x):
        bs = tf.shape(x)[0]  # Extract batch size
        number_of_vectors = tf.shape(x)[1]  # Count the number of vectors (should be m*n)
        indices = tf.range(number_of_vectors)  # Index for each vector
        indices = tf.expand_dims(indices, 0)  # Reshape appropriately
        return tf.tile(indices, [bs, 1])  # Repeat for each batch


class ClassTokenIndex(tf.keras.layers.Layer):
    def call(self, x):
        bs = tf.shape(x)[0]  # Extract batch size
        number_of_vectors = 1  # We want just 1 vector for the class token
        indices = tf.range(number_of_vectors)  # Index for the vector
        indices = tf.expand_dims(indices, 0)  # Reshape appropriately
        return tf.tile(indices, [bs, 1])  # Repeat for each batch


## Transformer

In [8]:
def build_ViT(n,m,block_size,hidden_dim,num_layers,num_heads,key_dim,value_dim,mlp_dim,dropout_rate,num_classes):
    # n is number of rows of blocks
    # m is number of cols of blocks
    # block_size is number of pixels (with rgb) in each block
    inp = tf.keras.layers.Input(shape=(n*m,block_size))
    mid = tf.keras.layers.Dense(hidden_dim)(inp) # transform to vectors with different dimension
    # the positional embeddings
    inp2 = PositionalIndex()(inp)
    emb = tf.keras.layers.Embedding(input_dim=n*m, output_dim=hidden_dim)(inp2) # learned positional embedding for each of the n*m possible possitions
    mid = tf.keras.layers.Add()([mid, emb]) # for some reason, tf.keras.layers.Add causes an error, but + doesn't?
    # create and append class token to beginning of all input vectors
    tokenInd = ClassTokenIndex()(mid)
    token = tf.keras.layers.Embedding(input_dim=1, output_dim=hidden_dim)(tokenInd)
    mid = tf.keras.layers.Concatenate(axis=1)([token, mid])
    
    for l in range(num_layers): # how many Transformer Head layers are there?
        ln  = tf.keras.layers.LayerNormalization()(mid) # normalize
        mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,key_dim=key_dim,value_dim=value_dim)(ln,ln,ln) # self attention!
        add = tf.keras.layers.Add()([mid,mha]) # add and norm
        ln  = tf.keras.layers.LayerNormalization()(add)
        den = tf.keras.layers.Dense(mlp_dim,activation='gelu')(ln) # maybe should be relu...who knows...
        den = tf.keras.layers.Dropout(dropout_rate)(den) # regularization
        den = tf.keras.layers.Dense(hidden_dim)(den) # back to the right dimensional space
        den = tf.keras.layers.Dropout(dropout_rate)(den)
        mid = tf.keras.layers.Add()([den,add]) # add and norm again
    
    fl = mid[:,0,:] # just grab the class token for each image in batch
    ln = tf.keras.layers.LayerNormalization()(fl)
    clas = tf.keras.layers.Dense(num_classes,activation='softmax')(ln) # probability that the image is in each category
    mod = tf.keras.models.Model(inp,clas)
    mod.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    return mod

In [18]:
# Model hyperparameters
n = 6
m = 7
block_size = 2
hidden_dim = 32
num_layers = 4
num_heads = 4
key_dim = hidden_dim // num_heads  # Good practice for key_dim to be hidden_dim//num_heads
value_dim = key_dim * 2
mlp_dim = hidden_dim
dropout_rate = 0.1
num_classes = 7  # Output classes for classification

# Build the Transformer model
trans = build_ViT(n, m, block_size, hidden_dim, num_layers, num_heads, 
                  key_dim, value_dim, mlp_dim, dropout_rate, num_classes)

# Display model summary
trans.summary()


In [19]:
print(f"X_train shape: {X_train.shape}")  # Expected: (num_samples, 42, 2)
print(f"y_train shape: {y_train.shape}")  # Expected: (num_samples, 7)


X_train shape: (128641, 42, 2)
y_train shape: (128641, 7)


In [20]:
trans.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss="categorical_crossentropy", 
              metrics=["accuracy"])


In [22]:
history = trans.fit(X_train, y_train, 
                    validation_data=(X_val, y_val),
                    epochs=20,  # Adjust based on performance
                    batch_size=32)  # Tune based on memory and dataset size


Epoch 1/20
[1m4021/4021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 21ms/step - accuracy: 0.3441 - loss: 1.5459 - val_accuracy: 0.3871 - val_loss: 1.4516
Epoch 2/20
[1m4021/4021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 21ms/step - accuracy: 0.3982 - loss: 1.4412 - val_accuracy: 0.4174 - val_loss: 1.4113
Epoch 3/20
[1m4021/4021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 21ms/step - accuracy: 0.4174 - loss: 1.4115 - val_accuracy: 0.4280 - val_loss: 1.3935
Epoch 4/20
[1m4021/4021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m629s[0m 156ms/step - accuracy: 0.4258 - loss: 1.3959 - val_accuracy: 0.4253 - val_loss: 1.3938
Epoch 5/20
[1m4021/4021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m570s[0m 142ms/step - accuracy: 0.4328 - loss: 1.3841 - val_accuracy: 0.4378 - val_loss: 1.3758
Epoch 6/20
[1m4021/4021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 21ms/step - accuracy: 0.4360 - loss: 1.3742 - val_accuracy: 0.4393 - val_loss: 1.3719


## Test

In [12]:
# Evaluate Models
def evaluate_models(model, X_val, y_val):
    loss, accuracy = model.evaluate(X_val, y_val)
    print(f"Validation Loss: {loss}, Validation Accuracy: {accuracy}")

evaluate_models(cnn_model, X_val, y_val)
evaluate_models(transformer_model, X_val, y_val)

# Further Testing Against MCTS (Placeholder)
def test_against_mcts(model):
    # Implement MCTS testing logic here
    pass
test_against_mcts(cnn_model)
test_against_mcts(transformer_model)


[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 686us/step - accuracy: 0.4775 - loss: 1.3569
Validation Loss: 1.3543078899383545, Validation Accuracy: 0.4690828025341034
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3771 - loss: 1.6158
Validation Loss: 1.6246517896652222, Validation Accuracy: 0.3711651563644409


## CNN

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

def build_improved_cnn():
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(6, 7, 2), padding="same"),
        layers.BatchNormalization(),
        layers.Conv2D(64, (3, 3), activation='relu', padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        
        layers.Conv2D(128, (3, 3), activation='relu', padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),

        layers.Flatten(),
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dropout(0.3),
        layers.Dense(7, activation='softmax')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy'])
    return model

cnn_model = build_improved_cnn()

# Train CNN
cnn_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=64)
