In [1]:
import pandas as pd
import numpy as np

In [2]:
# load the dataset
def load_dataset(filename):
    """Read a header-less CSV and split it into input and target arrays.

    Every column except the last is treated as an input and converted to
    strings; the last column is the target and is returned as a column
    vector.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pandas.read_csv`` (with ``header=None``).

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` has shape ``(n_rows, n_columns - 1)`` and a
        string dtype, and ``y`` has shape ``(n_rows, 1)``.
    """
    # Pull the raw values out of the DataFrame as a single NumPy array.
    values = pd.read_csv(filename, header=None).values
    # Inputs: all columns but the last, with every field formatted as a string.
    features = values[:, :-1].astype(str)
    # Target: the last column, reshaped into a 2-D column vector.
    target = values[:, -1].reshape(-1, 1)
    return features, target

In [3]:
# prepare input data
def prepare_inputs(X_train, X_test):
    """Label-encode every input column, learning the labels from the train set.

    A separate ``LabelEncoder`` is fitted on each column of ``X_train`` and
    then applied to the same column of both ``X_train`` and ``X_test``.

    Parameters
    ----------
    X_train, X_test : 2-D array
        Indexed as ``X[:, i]``; column count is taken from ``X_train.shape[1]``.

    Returns
    -------
    tuple of list
        ``(X_train_enc, X_test_enc)``; each list holds one encoded 1-D array
        per input column, in column order.
    """
    encoded_train, encoded_test = [], []
    for col in range(X_train.shape[1]):
        # The encoder only ever sees training values for this column.
        encoder = LabelEncoder().fit(X_train[:, col])
        encoded_train.append(encoder.transform(X_train[:, col]))
        encoded_test.append(encoder.transform(X_test[:, col]))
    return encoded_train, encoded_test

In [4]:
# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode the targets using classes learned from the training set.

    ``LabelEncoder`` expects a 1-D array. Passing an ``(n, 1)`` column vector
    still works but makes scikit-learn emit a ``DataConversionWarning``, so
    single-column 2-D targets are flattened before fitting/transforming.
    The encoded result is the same 1-D integer array either way.

    Parameters
    ----------
    y_train, y_test : array-like
        Target values, either 1-D or a single-column 2-D array.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_train_enc, y_test_enc)`` as 1-D arrays of encoded labels.
    """
    def _flatten_column(y):
        # Only the (n, 1) case is flattened; any other shape is handed to
        # LabelEncoder untouched so its own validation still applies.
        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] == 1:
            return y.ravel()
        return y

    y_train_1d = _flatten_column(y_train)
    y_test_1d = _flatten_column(y_test)

    le = LabelEncoder()
    le.fit(y_train_1d)
    y_train_enc = le.transform(y_train_1d)
    y_test_enc = le.transform(y_test_1d)
    return y_train_enc, y_test_enc

In [5]:
# Never truncate DataFrames when they are displayed: show all columns and rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
# Print NumPy arrays in full rather than summarising them with "...".
# (A finite alternative that was tried: threshold=1000.)
np.set_printoptions(threshold=np.inf, edgeitems=1000)

In [6]:
import keras

In [7]:
# Example of a learned embedding encoding for a neural network:
# every categorical input column gets its own Input + Embedding head, the
# embeddings are concatenated and fed to a small dense binary classifier.
from numpy import unique
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Embedding
# `concatenate` is exported by `keras.layers` itself; the older
# `keras.layers.merge` module path is absent from newer Keras releases.
from keras.layers import concatenate
from keras.utils import plot_model

# load the dataset
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets (fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data: one label-encoded array per input column
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# Make the targets 3-D. Each Embedding head outputs (batch, 1, 10), the
# concatenation keeps the middle axis, and Dense acts on the last axis, so
# the model's output is (batch, 1, 1) and the targets must match that shape.
y_train_enc = y_train_enc.reshape((len(y_train_enc), 1, 1))
y_test_enc = y_test_enc.reshape((len(y_test_enc), 1, 1))

# prepare each input head
in_layers = list()
em_layers = list()
for train_column in X_train_enc:
    # vocabulary size of this column = number of distinct encoded labels
    n_labels = len(unique(train_column))
    # one scalar category index per sample
    in_layer = Input(shape=(1,))
    # map each category index to a learned 10-dimensional vector
    em_layer = Embedding(n_labels, 10)(in_layer)
    # store layers
    in_layers.append(in_layer)
    em_layers.append(em_layer)

# concat all embeddings along the last (feature) axis
merge = concatenate(em_layers)
dense = Dense(10, activation='relu', kernel_initializer='he_normal')(merge)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=in_layers, outputs=output)
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# plot graph: MWB - Requires pydot and graphviz (which wants python 3.9)
#plot_model(model, show_shapes=True, to_file='embeddings.png')

# fit the keras model on the dataset (list of per-column arrays, one per input head)
model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2)
# evaluate the keras model on the held-out split
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

  return f(*args, **kwargs)


('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')
Epoch 1/20
12/12 - 2s - loss: 0.6917 - accuracy: 0.5707
Epoch 2/20
12/12 - 0s - loss: 0.6770 - accuracy: 0.7382
Epoch 3/20
12/12 - 0s - loss: 0.6618 - accuracy: 0.7330
Epoch 4/20
12/12 - 0s - loss: 0.6411 - accuracy: 0.7277
Epoch 5/20
12/12 - 0s - loss: 0.6180 - accuracy: 0.7277
Epoch 6/20
12/12 - 0s - loss: 0.5949 - accuracy: 0.7277
Epoch 7/20
12/12 - 0s - loss: 0.5765 - accuracy: 0.7277
Epoch 8/20
12/12 - 0s - loss: 0.5612 - accuracy: 0.7277
Epoch 9/20
12/12 - 0s - loss: 0.5496 - accuracy: 0.7277
Epoch 10/20
12/12 - 0s - loss: 0.5422 - accuracy: 0.7277
Epoch 11/20
12/12 - 0s - loss: 0.5317 - accuracy: 0.7277
Epoch 12/20
12/12 - 0s - loss: 0.5219 - accuracy: 0.7382
Epoch 13/20
12/12 - 0s - loss: 0.5141 - accuracy: 0.7539
Epoch 14/20
12/12 - 0s - loss: 0.5052 - accuracy: 0.7592
Epoch 15/20
12/12 - 0s - loss: 0.4984 - accuracy: 0.7644
Epoch 

In [8]:
# Inspect the label-encoded test inputs: a list holding one integer array per input column.
X_test_enc

[array([3, 3, 3, 2, 1, 1, 2, 1, 4, 3, 3, 2, 2, 2, 3, 2, 3, 4, 1, 1, 2, 3,
        1, 4, 3, 3, 3, 4, 4, 1, 2, 2, 3, 3, 4, 2, 3, 2, 4, 4, 2, 3, 4, 3,
        3, 2, 4, 4, 1, 1, 3, 4, 4, 3, 3, 2, 4, 3, 2, 3, 2, 4, 3, 3, 4, 3,
        4, 2, 3, 2, 1, 2, 2, 3, 2, 2, 2, 1, 4, 2, 1, 3, 4, 3, 2, 2, 3, 2,
        2, 2, 2, 4, 1, 3, 2]),
 array([2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2,
        2, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0,
        0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0,
        0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 1, 0, 0, 2, 2, 2, 0, 2,
        2, 2, 0, 0, 2, 0, 2]),
 array([ 4,  5,  7,  3,  0,  6,  5,  3,  2,  5,  3,  4,  1,  5,  5,  3,  5,
         2,  5,  3,  4,  4,  6,  1,  0,  4,  4,  5,  5,  5,  3,  3,  6,  3,
        10,  3,  5,  3,  3,  6,  5, 10,  3,  5,  5,  5,  2, 10,  5,  7,  4,
         8,  4,  4,  4,  4,  2,  4,  1,  1,  3,  5,  7,  6,  7,  5,  1,  5,
         5,  2,  3,  3,  3,  3,  4,  6, 10