In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
# load the dataset
def load_dataset(filename, cramer_coef, target, var_count, row_count):
    """Read the data CSV and return feature / target arrays.

    Parameters
    ----------
    filename : path of the data CSV. The first row is the header and the
        first column is used as the index; only the first ``row_count`` rows
        are read.
    cramer_coef : path of a tab-separated, header-less file whose first
        column lists feature names. The features are returned in that order.
    target : name of the target column in the data CSV.
    var_count : maximum number of (ordered) feature columns to keep.
    row_count : number of data rows to read.

    Returns
    -------
    (X, y) : ``X`` is the 2-D array of the selected feature values and ``y``
        is the target reshaped to a column of shape ``(n_rows, 1)``.
    """
    frame = pd.read_csv(filename, header=0, index_col=0, nrows=row_count)

    # Everything except the target column is a candidate feature.
    features = frame.drop(columns=target)

    # Re-order the features to follow the first column of the coefficient file.
    ranking = pd.read_csv(cramer_coef, sep='\t', header=None)
    ordered_names = ranking.iloc[:, 0].tolist()
    features = features[ordered_names]

    # Keep only the leading `var_count` features when there are more than that.
    if var_count < features.shape[1]:
        features = features.iloc[:, :var_count]

    # The target is returned as a 2-D column vector.
    labels = frame[target].values
    labels = labels.reshape((len(labels), 1))

    return features.values, labels

In [3]:
# prepare input data
from sklearn.preprocessing import LabelEncoder
def prepare_inputs(X_train, X_test):
    """Label-encode the train and test inputs column by column.

    For every column index of ``X_train`` a fresh ``LabelEncoder`` is fitted
    on the train and test values of that column concatenated together (MWB:
    this avoids the "missing field" error for values that occur only in the
    test split), and both splits are then transformed with it.

    Returns
    -------
    (train_columns, test_columns) : two lists holding one encoded array per
        input column, in column order.
    """
    train_columns, test_columns = [], []
    n_columns = X_train.shape[1]
    for col in range(n_columns):
        train_values = X_train[:, col]
        test_values = X_test[:, col]

        # Fit on the union of both splits so every value has a code.
        encoder = LabelEncoder()
        encoder.fit(np.concatenate((train_values, test_values), axis=0))

        train_columns.append(encoder.transform(train_values))
        test_columns.append(encoder.transform(test_values))
    return train_columns, test_columns

In [4]:
# MWB - Not necessary here: the target is already [0,1], so this helper is not called.
# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode the targets with an encoder fitted on the training targets.

    Returns the encoded training targets and the encoded test targets, in that
    order. The encoder is fitted on ``y_train`` only.
    """
    encoder = LabelEncoder()
    encoder.fit(y_train)
    encoded_train = encoder.transform(y_train)
    encoded_test = encoder.transform(y_test)
    return encoded_train, encoded_test

In [5]:
# Notebook display settings.
# pandas: no limit on displayed columns, at most 200 displayed rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)
# NumPy: never summarise arrays when printing (previously tried threshold=1000).
np.set_printoptions(threshold=np.inf, edgeitems=1000)

In [16]:
%%time
# example of learned embedding encoding for a neural network
from numpy import unique
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers.merge import concatenate
#from keras.utils import plot_model  MWB - errors
#from prettytable import PrettyTable
#from tabulate import tabulate
 
# load the dataset
# Load the first 2000 rows and keep the first 9 features in the order given
# by the Cramer coefficient file (a larger run used row_count=200000).
X, y = load_dataset(
    filename='../../../data/csl/CSL_tl_PI_binned.csv',
    cramer_coef='../../../data/csl/CramerTheil/Cramer_PI_Tl_coeff_ALL.csv',
    target='trans_loss',
    var_count=9,
    row_count=2000,
)
X.shape

CPU times: user 70.8 ms, sys: 1.59 ms, total: 72.4 ms
Wall time: 72.5 ms


(2000, 9)

In [14]:
# Hold out 30% of the rows as the test set. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)

# Integer-encode every input column (one encoded array per column).
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)

# MWB: the targets need no encoding, so prepare_targets() is not called and
# the raw target arrays are used directly.
y_train_enc = y_train
y_test_enc = y_test
print(len(y_train_enc))

[array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 3, 3, 0, 0, 0, 0,
       0, 0, 3, 0, 3, 1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 2, 0, 1, 0, 3, 3, 0, 4, 3, 0, 0, 0, 2, 3, 0, 4, 0, 0,
       0, 4, 0, 0, 3, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       4, 3, 4, 4, 0, 2, 3, 0, 0, 2, 0, 0, 0, 4, 0, 3, 0, 0, 0, 0, 0, 0,
       0, 4, 0, 0, 1, 0, 0, 1, 1, 0, 0, 3, 3, 3, 0, 3, 4, 0, 0, 0, 0, 4,
       0, 0, 0, 4, 3, 0, 0, 0, 0, 3, 1, 4, 0, 0, 3, 3, 4, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 1, 3, 3, 0, 0,
       0, 0, 4, 0, 3, 0, 1, 0, 0, 4, 1, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 3,
       4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 4,
       0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 0, 4, 3, 0, 3, 3, 0, 0, 0, 0, 4,
       3, 0, 0, 3, 4, 3, 0, 4, 4, 0, 0, 0, 0, 3, 0, 0, 3, 4, 0, 0, 0, 1,
       3, 0, 0, 0, 0, 4, 0, 4, 4, 0, 0, 1, 0, 0, 0, 0, 3, 0, 2, 0, 4, 3,
       0, 0, 1, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0

In [8]:
%%time
# Make the targets 3-D: (n_samples, 1, 1).
# NOTE(review): presumably needed because each Embedding keeps the length-1
# sequence axis, so the final Dense layer emits one value per (sample, step),
# and because compile() below uses sample_weight_mode='temporal' - confirm.
y_train_enc = y_train_enc.reshape((len(y_train_enc), 1, 1))
y_test_enc = y_test_enc.reshape((len(y_test_enc), 1, 1))

# Build one input head and one embedding per encoded input column.
in_layers = list()
em_layers = list()
for i in range(len(X_train_enc)):
    # Size of the embedding vocabulary = largest integer code + 1.
    # The encoders in prepare_inputs() are fitted on train AND test together,
    # so a code may be absent from the training column; counting only the
    # unique training codes can therefore be smaller than the largest code
    # and make the embedding lookup go out of range. Use the largest code
    # seen in either split instead (initial=0 keeps this safe for an empty
    # split, since codes are never negative).
    n_labels = int(max(np.max(X_train_enc[i], initial=0),
                       np.max(X_test_enc[i], initial=0))) + 1
    # define input layer: one integer code per sample
    in_layer = Input(shape=(1,))
    # define embedding layer: 10-dimensional vector per category
    em_layer = Embedding(n_labels, 10)(in_layer)
    # store layers
    in_layers.append(in_layer)
    em_layers.append(em_layer)

# concat all embeddings and classify with a small dense head
merge = concatenate(em_layers)
dense = Dense(10, activation='relu', kernel_initializer='he_normal')(merge)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=in_layers, outputs=output)

# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', sample_weight_mode='temporal', 
              metrics=['accuracy','Precision','Recall','AUC'])

# plot graph: MWB - Requires pydot and graphviz (which wants python 3.9)
#plot_model(model, show_shapes=True, to_file='embeddings.png')

# Candidate class weights (0 -> 0.51, 1 -> 18.0; earlier attempts used the
# dicts {0:0.51, 1:18.0} and {0:1, 1:36}). They are only printed here: the
# fit() call below does NOT pass them, so training is currently unweighted.
weights = np.zeros((10, 2))
weights[:,0] = 0.51
weights[:,1] = 18.0 
print(type(weights))
print(weights)

# fit the keras model on the dataset (unweighted; the weighted variant was
# model.fit(..., class_weight=weights))
model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2) 

# evaluate the keras model on the held-out split; the first returned value
# (the loss) is discarded
_, accuracy, prec, recall, auc = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print(f'Accuracy: {accuracy}; Prec: {prec}; Recall: {recall}, AUC: {auc}')

<class 'numpy.ndarray'>
[[ 0.51 18.  ]
 [ 0.51 18.  ]
 [ 0.51 18.  ]
 [ 0.51 18.  ]
 [ 0.51 18.  ]
 [ 0.51 18.  ]
 [ 0.51 18.  ]
 [ 0.51 18.  ]
 [ 0.51 18.  ]
 [ 0.51 18.  ]]
Epoch 1/20
8112/8112 - 47s - loss: 0.1887 - accuracy: 0.9437 - precision: 0.0571 - recall: 5.5241e-04 - auc: 0.7817
Epoch 2/20
8112/8112 - 41s - loss: 0.1822 - accuracy: 0.9442 - precision: 0.0000e+00 - recall: 0.0000e+00 - auc: 0.7963
Epoch 3/20
8112/8112 - 38s - loss: 0.1819 - accuracy: 0.9442 - precision: 0.0000e+00 - recall: 0.0000e+00 - auc: 0.7976
Epoch 4/20
8112/8112 - 37s - loss: 0.1814 - accuracy: 0.9442 - precision: 0.0000e+00 - recall: 0.0000e+00 - auc: 0.8001
Epoch 5/20
8112/8112 - 37s - loss: 0.1806 - accuracy: 0.9442 - precision: 0.0000e+00 - recall: 0.0000e+00 - auc: 0.8045
Epoch 6/20
8112/8112 - 37s - loss: 0.1796 - accuracy: 0.9442 - precision: 0.0000e+00 - recall: 0.0000e+00 - auc: 0.8088
Epoch 7/20
8112/8112 - 38s - loss: 0.1794 - accuracy: 0.9442 - precision: 1.0000 - recall: 1.3810e-04 - auc: 