In [18]:
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from utils import read_dataset
import os
import pandas as pd

# Name of the dataset to load; it is passed to the project helper read_dataset below.
DATASET = 'connect-4'

# read_dataset comes from the local utils module; only its 'data' entry is used here.
data = read_dataset(DATASET)
# Wrap the raw records in a DataFrame. As the rendered output shows, the values are
# still byte strings at this point (b'b', b'o', b'x', b'win', ...).
df = pd.DataFrame(data['data'])
# Last expression of the cell: rendered as the cell output.
df

[ 0/1] Reading connect-4 dataset...


Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,...,a34,a35,a36,a37,a38,a39,a40,a41,a42,class
0,b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',...,b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'win'
1,b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',...,b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'win'
2,b'b',b'b',b'b',b'b',b'b',b'b',b'o',b'b',b'b',b'b',...,b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'win'
3,b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',...,b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'win'
4,b'o',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',...,b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'b',b'win'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67552,b'x',b'x',b'b',b'b',b'b',b'b',b'o',b'x',b'o',b'b',...,b'b',b'b',b'b',b'o',b'o',b'x',b'b',b'b',b'b',b'loss'
67553,b'x',b'x',b'b',b'b',b'b',b'b',b'o',b'b',b'b',b'b',...,b'b',b'b',b'b',b'o',b'x',b'o',b'o',b'x',b'b',b'draw'
67554,b'x',b'x',b'b',b'b',b'b',b'b',b'o',b'o',b'b',b'b',...,b'b',b'b',b'b',b'o',b'x',b'x',b'o',b'b',b'b',b'loss'
67555,b'x',b'o',b'b',b'b',b'b',b'b',b'o',b'b',b'b',b'b',...,b'b',b'b',b'b',b'o',b'x',b'o',b'x',b'x',b'b',b'draw'


In [35]:
# since we are still doing unsupervised methods (clustering), we will ignore labels y
X = df.loc[:, df.columns != 'class']
# Encode values as unicode strings instead of bytes. This is done column by column with
# Series.map, which gives the same element-wise result as DataFrame.applymap
# (deprecated since pandas 2.1) and works on every pandas version.
X = X.apply(lambda col: col.map(lambda value: value.decode('utf-8')))
# For all vars in X, the domain is ['b', 'o', 'x']
# However, we will check it programmatically.
# Also, even if the dataset is supposed to have no missing values, we will check it as well, just in case.
# Collect the distinct symbols of the whole frame in one vectorised pass instead of
# iterating over every row and cell in Python.
X_categories = set(pd.unique(X.to_numpy().ravel()))
# A bare `X_categories` expression in the middle of a cell is not displayed, so the check
# is enforced with an assertion: any unexpected symbol (including a missing-value marker)
# makes the cell fail loudly.
assert X_categories <= {'b', 'o', 'x'}, f'unexpected cell values: {X_categories}'
# {'b', 'o', 'x'}, so the domain is confirmed. Also, no missing values,
# because otherwise we would have None or others
# Recall that: 'x' means that we have a cell with a disk belonging to player 'x',
# 'o' means that we have a cell with a disk belonging to player 'o', and 'b' means that
# the cell is empty (blank).
X

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,...,a33,a34,a35,a36,a37,a38,a39,a40,a41,a42
0,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,b
1,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,b
2,b,b,b,b,b,b,o,b,b,b,...,b,b,b,b,b,b,b,b,b,b
3,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,b
4,o,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67552,x,x,b,b,b,b,o,x,o,b,...,b,b,b,b,o,o,x,b,b,b
67553,x,x,b,b,b,b,o,b,b,b,...,b,b,b,b,o,x,o,o,x,b
67554,x,x,b,b,b,b,o,o,b,b,...,b,b,b,b,o,x,x,o,b,b
67555,x,o,b,b,b,b,o,b,b,b,...,b,b,b,b,o,x,o,x,x,b


In [37]:
# Instead of one hot encoding, we will apply label encoder with [-1, 0, 1]. The reason why we will do it
# this way is that 'x' and 'o' are antagonists, and 'b' is the neutral value. So, there is some kind of natural
# order. This way, we can avoid the one hot encoding, which would increase the number of columns.
# Since all the variables have the same domain, we should be consistent with the encoding. For us, 'x'
# will always be encoded as '-1' and 'o' will always be encoded as '1'.
# X_encoded = X.apply(LabelEncoder().fit_transform)
# LabelEncoder sorts the classes alphabetically and maps them to the range [0, n_classes-1],
# so 'b' would be encoded as 0, 'o' as 1, and 'x' as 2, which is not the intended outcome for us.
# It has no additional parameters, so we will apply our own encoder:
# Built once at definition time instead of on every call: recode is applied
# element-wise to the whole feature frame, i.e. once per board cell.
_RECODE_MAP = {'x': -1, 'b': 0, 'o': 1}


def recode(x):
    """Map a board-cell symbol to its signed integer code.

    'x' -> -1, 'b' (blank) -> 0 and 'o' -> 1, so the two players get opposite
    signs and the empty cell sits between them.

    Args:
        x: one of the symbols 'x', 'b' or 'o'.

    Returns:
        The integer code -1, 0 or 1.

    Raises:
        KeyError: if ``x`` is not one of the three known symbols.
    """
    return _RECODE_MAP[x]
# Encode column by column with Series.map: same element-wise result as
# DataFrame.applymap, which is deprecated since pandas 2.1.
X_encoded = X.apply(lambda col: col.map(recode))

Unnamed: 0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,...,a33,a34,a35,a36,a37,a38,a39,a40,a41,a42
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67552,-1,-1,0,0,0,0,1,-1,1,0,...,0,0,0,0,1,1,-1,0,0,0
67553,-1,-1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,-1,1,1,-1,0
67554,-1,-1,0,0,0,0,1,1,0,0,...,0,0,0,0,1,-1,-1,1,0,0
67555,-1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,-1,1,-1,-1,0


In [41]:
# save the cleaned/encoded X as a CSV for later
# The file names are derived from DATASET so they stay in sync with the dataset that
# was actually loaded; for 'connect-4' the resulting paths are unchanged.
os.makedirs('datasets', exist_ok=True)  # to_csv does not create missing directories
X.to_csv(os.path.join('datasets', f'{DATASET}-clean.csv'))
X_encoded.to_csv(os.path.join('datasets', f'{DATASET}-clean-enc.csv'))