In [1]:
#embedding_layer.ipynb 
# 
#by Joe Hahn
#joe.hahn@oracle.com
#27 July 2018
#
#train an embedding layer on the XO dataset

In [2]:
#set system parameters
#...the four geometry values below are handed unchanged to make_xo_df() whenever a dataset is generated
#...NOTE(review): their exact meaning is defined in helper_fns, which is not shown in this notebook
x_half_width = 0.5
radius = 3.5
box_half_width = 7.1
jitter = 1.0
#...seed for the numpy RandomState that is constructed in the training-data cell
rn_seed = 15
#...passed to make_xo_df() as its initial_id argument
initial_id = 0

In [3]:
#start time
#...record the wall-clock time now; the final cell subtracts it to report the notebook's total execution time
import time as tm
time_start = tm.time()

In [9]:
#generate balanced training data
N_dots = 26700
import numpy as np
rn_state = np.random.RandomState(seed=rn_seed)
from helper_fns import *
df = make_xo_df(N_dots, initial_id, x_half_width, radius, box_half_width, jitter)
df = rebalance_df(train)
print df.groupby('class')['id'].count()
train = df
train.tail(5)

class
B    3380
O    3380
X    3380
Name: id, dtype: int64


Unnamed: 0,id,x,y,r,angle,class,X_score,O_score,B_score
10135,18549,1.104946,-2.493079,2.726967,-1.153607,O,0.0,1.0,0.0
10136,20087,1.702426,0.807118,1.884063,0.442713,O,0.0,1.0,0.0
10137,19014,-0.333025,1.423073,1.46152,1.800677,X,1.0,0.0,0.0
10138,23069,0.191831,2.530519,2.537779,1.495134,O,0.0,1.0,0.0
10139,16853,3.571846,-2.792311,4.533772,-0.663517,X,1.0,0.0,0.0


In [11]:
#select x-features and y = target variables = class-scores
x_cols = ['x', 'y']
y_cols = ['O_score', 'X_score', 'B_score']
x = train[x_cols].values
y = train[y_cols].values
print 'x.shape = ', x.shape
print 'y.shape = ', y.shape
x

x.shape =  (10140, 2)
y.shape =  (10140, 3)


array([[-8.39027587,  1.92749158],
       [ 0.72665317, -0.87579893],
       [ 0.18988122, -1.97527298],
       ..., 
       [-0.33302456,  1.42307251],
       [ 0.19183067,  2.53051874],
       [ 3.57184635, -2.79231114]])

In [None]:
#model parameters
#...one model input per feature column and one model output per class-score column
N_input, N_output = len(x_cols), len(y_cols)
#...number of distinct integer tokens, with the embedding dimension set equal to it
vocab_size = embedding_dimension = 100

In [None]:
#rescale x so 0 <= x < vocab_size
x = x - x.min()
x /= x.max()
x *= vocab_size - 1
x = x.astype(int)
#pad x with zeros
max_length = vocab_size
from keras.preprocessing.sequence import pad_sequences
x = pad_sequences(x, maxlen=max_length, padding='post')
N_input = x.shape[1]
print 'x.min() = ', x.min()
print 'x.max() = ', x.max()
print 'vocab_size, embedding_dimension, N_input = ', vocab_size, embedding_dimension, N_input
print 'x.shape = ', x.shape
x_train = x
y_train = y
x_train

In [None]:
#generate validation data
#...generate XO data and rebalance
df = make_xo_df(N_dots, initial_id, x_half_width, radius, box_half_width, jitter)
idx_O = (df['class'] == 'O')
idx_X = (df['class'] == 'X')
idx_B = (df['class'] == 'B')
df = df[idx_O].append(df[idx_X]).append(df[idx_B].sample(n=idx_O.sum()))
df['ran_num'] = np.random.uniform(size=len(df))
df = df.sort_values('ran_num').reset_index(drop=True)
print df.groupby('class')['id'].count()
x = df[x_cols].values
y = df[y_cols].values
#rescale so 0<x<vocab_size
x = x - x.min()
x /= x.max()
x *= vocab_size - 1
x = x.astype(int)
#pad x with zeros
x = pad_sequences(x, maxlen=max_length, padding='post')
print 'x.min() = ', x.min()
print 'x.max() = ', x.max()
print 'vocab_size, embedding_dimension, N_input = ', vocab_size, embedding_dimension, N_input
print 'x.shape = ', x.shape
x_val = x
y_val = y
x_val

In [None]:
#helper that assembles and compiles a classifier whose only hidden layer is an embedding layer
def embedding_classifier(vocab_size, embedding_dimension, N_input, N_output):
    """Build and compile a keras Sequential model: Embedding -> Flatten -> Dense.

    vocab_size          -- first argument given to the Embedding layer
    embedding_dimension -- second argument given to the Embedding layer
    N_input             -- passed to the Embedding layer as input_length
    N_output            -- number of units in the final sigmoid Dense layer

    Returns the model, compiled with the adam optimizer, binary_crossentropy
    loss, and the 'acc' metric.
    """
    from keras.models import Sequential
    from keras.layers.embeddings import Embedding
    from keras.layers import Dense, Flatten
    #list the layers in stacking order, then add them one at a time
    layer_stack = [
        Embedding(vocab_size, embedding_dimension, input_length=N_input),
        Flatten(),
        Dense(N_output, activation='sigmoid'),
    ]
    classifier = Sequential()
    for layer in layer_stack:
        classifier.add(layer)
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return classifier

In [None]:
#build embedding classification model
print 'vocab_size = ', vocab_size
print 'embedding_dimension = ', embedding_dimension
print 'N_input = ', N_input
print 'N_output = ', N_output
print 'x.shape = ', x.shape
model = embedding_classifier(vocab_size, embedding_dimension, N_input, N_output)
model.summary()

In [None]:
#train a freshly-built classifier on the training data, with (x_val, y_val) supplied as validation data
N_training_epochs = 100
#...each batch holds one tenth of the training records (integer division)
batch_size = len(x_train)//10
model = embedding_classifier(vocab_size, embedding_dimension, N_input, N_output)
fit_history = model.fit(
    x_train, y_train,
    epochs=N_training_epochs,
    batch_size=batch_size,
    verbose=0,
    validation_data=(x_val, y_val))

In [None]:
#prep plots
#...draw matplotlib figures inline, beneath the cell that creates them
%matplotlib inline
import matplotlib.pyplot as plt
#...NOTE(review): rcParams is imported but is not used by any cell visible in this notebook
from matplotlib import rcParams
import seaborn as sns
#...apply seaborn styling with fonts scaled up by 1.5
sns.set(font_scale=1.5, font='DejaVu Sans')

In [None]:
#plot accuracy vs training epoch
fig, ax = plt.subplots(1,1, figsize=(15, 6))
xp = fit_history.epoch
yp = fit_history.history['loss']
ax.plot(xp, yp, label='training sample')
yp = fit_history.history['val_loss']
ax.plot(xp, yp, label='validation sample')
ax.set_title('classifier loss versus training epoch')
ax.set_ylabel('loss')
ax.set_xlabel('training epoch')
ax.legend(loc='lower left')
ax.set_ylim(0.95*np.min(yp[1:]), 1.05*np.max(yp[1:]))
plt.savefig('figs/mlp_loss.png')
print 'final validation loss = ', yp[-1]

In [None]:
#generate testing data
#...generate XO data and rebalance
df = make_xo_df(N_dots, initial_id, x_half_width, radius, box_half_width, jitter)
x = df[x_cols].values
y = df[y_cols].values
#rescale so 0<x<vocab_size
x = x - x.min()
x /= x.max()
x *= vocab_size - 1
x = x.astype(int)
#pad x with zeros
x = pad_sequences(x, maxlen=max_length, padding='post')
print 'x.min() = ', x.min()
print 'x.max() = ', x.max()
print 'vocab_size, embedding_dimension, N_input = ', vocab_size, embedding_dimension, N_input
print 'x.shape = ', x.shape
x_test = x
y_test = y
xy = df

In [None]:
#generate predicted class probabilities and their predicted X,O,B
y_pred = model.predict(x_test)
for idx in range(len(y_cols)):
    y_col = y_cols[idx]
    xy[y_col] = y_pred[:, idx]
idx = (xy.O_score > xy.X_score) & (xy.O_score > xy.B_score)
xy.loc[idx, 'class_pred'] = 'O'
idx = (xy.X_score > xy.B_score) & (xy.X_score > xy.O_score)
xy.loc[idx, 'class_pred'] = 'X'
idx = (xy.B_score > xy.O_score) & (xy.B_score > xy.X_score)
xy.loc[idx, 'class_pred'] = 'B'
loss, accuracy = model.evaluate(x, y, verbose=0)
print 'testing loss = ', loss
print 'testing accuracy = ', accuracy
xy.head()

In [None]:
#scatterplot of the test dots, marked by the class the model predicted for each, to reveal its decision boundaries
df = xy
xy_rng = (-6.5, 6.5)
sz = 30
f, ax = plt.subplots(figsize=(10,10))
#...one layer of square markers per predicted class, drawn in the order B, X, O
for lbl in ['B', 'X', 'O']:
    idx = (df['class_pred'] == lbl)
    xp, yp = df.loc[idx, 'x'], df.loc[idx, 'y']
    p = sns.regplot(xp, yp, ax=ax, fit_reg=False, marker='s', label=lbl,
                    scatter_kws={'alpha':0.6, 's': sz})
#...title, axis labels, and a common range for both axes
p = ax.set_title('MLP-inferred decision boundary')
p = ax.set_xlabel('x')
p = ax.set_ylabel('y')
p = ax.set_xlim(xy_rng)
p = ax.set_ylim(xy_rng)
#plt.savefig('figs/mlp_decision_boundary.png')

In [None]:
#done
#...report the wall-clock time elapsed since time_start was recorded in the start-time cell
time_stop = tm.time()
print 'execution time (minutes) = ', (time_stop - time_start)/60.0