In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
# load the dataset
def load_dataset(filename, cramer_coef, target, var_count, row_count):
    """Read the dataset CSV and return the selected feature columns and target.

    Parameters
    ----------
    filename : path of the dataset CSV. The first row is the header and the
        first column becomes the index.
    cramer_coef : path of a tab-separated, header-less file whose first
        column lists the feature names in the order they should appear.
    target : name of the column that is split off as the output variable.
    var_count : maximum number of leading feature columns to keep after
        the columns have been put into the ``cramer_coef`` order.
    row_count : maximum number of data rows to read from ``filename``.

    Returns
    -------
    (X, y) : ``X`` is a DataFrame holding the kept feature columns, ``y`` is
        the ``.values`` array of the target column.
    """
    frame = pd.read_csv(filename, header=0, index_col=0, nrows=row_count)
    features = frame.drop(columns=target)

    # Re-order the feature columns to follow the first column of the
    # coefficient file.
    ranking = pd.read_csv(cramer_coef, sep='\t', header=None)
    ordered_names = ranking.iloc[:, 0].tolist()
    features = features[ordered_names]

    # Keep only the first ``var_count`` columns when there are more.
    if features.shape[1] > var_count:
        features = features.iloc[:, :var_count]

    return features, frame[target].values

In [3]:
from sklearn.preprocessing import LabelEncoder
def encode_df(df):
    """Label-encode every column of ``df`` and return the encoded frame.

    Each column is encoded independently with its own freshly fitted
    ``LabelEncoder``.

    Parameters
    ----------
    df : pandas.DataFrame whose columns are to be encoded.

    Returns
    -------
    pandas.DataFrame with the same column names, column order and index as
    ``df``, holding the encoded values.
    """
    encoded = {col: LabelEncoder().fit_transform(df[col]) for col in df}
    # The encoders return bare arrays, so the index must be passed explicitly;
    # otherwise the result would get a fresh 0..n-1 index and could no longer
    # be aligned with the source frame by row label.
    return pd.DataFrame(encoded, index=df.index)

In [4]:
# prepare input data
from sklearn.preprocessing import LabelEncoder
def prepare_inputs(X_train, X_test):
    """Label-encode the train and test feature arrays column by column.

    For every column a separate ``LabelEncoder`` is fitted on the train and
    test values together, so a value that occurs only in the test split does
    not cause an error at transform time.

    Parameters
    ----------
    X_train, X_test : 2-D arrays indexed as ``X[:, i]``; the number of
        columns is taken from ``X_train``.

    Returns
    -------
    (X_train_enc, X_test_enc) : two lists with one encoded array per column.
    """
    train_columns, test_columns = [], []
    for idx in range(X_train.shape[1]):
        train_col = X_train[:, idx]
        test_col = X_test[:, idx]
        # Fit on both splits at once (see docstring).
        encoder = LabelEncoder().fit(np.concatenate((train_col, test_col), axis=0))
        train_columns.append(encoder.transform(train_col))
        test_columns.append(encoder.transform(test_col))
    return train_columns, test_columns

In [5]:
# MWB - Not necessary - already [0,1]
# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode the targets with an encoder fitted on ``y_train`` only.

    Returns the encoded train targets and the encoded test targets, in that
    order.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

In [6]:
# Notebook display settings: no limit on displayed DataFrame columns, up to
# 200 displayed rows, and no size threshold for abbreviating NumPy arrays.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)
np.set_printoptions(threshold=np.inf, edgeitems=1000)

In [7]:
%%time
# example of learned embedding encoding for a neural network
from numpy import unique
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers.merge import concatenate
#from keras.utils import plot_model  MWB - errors
#from prettytable import PrettyTable
#from tabulate import tabulate
 
# load the dataset
X, y = load_dataset('/MFMDatasets/MFM_bopf/data/csl/CSL_tl_PI_binned.csv', 
                    '/MFMDatasets/MFM_bopf/data/csl/CramerTheil/Cramer_PI_Tl_coeff_ALL.csv',
                    'trans_loss', 5, 200000)
#                    'trans_loss', 9, 200000)
X.shape

CPU times: user 5.04 s, sys: 1.2 s, total: 6.24 s
Wall time: 17 s


(185413, 5)

In [8]:
# Encode features using LabelEncoder
X_enc_df = encode_df(X)

# Print the distinct encoded values of each feature column.
for col in X_enc_df:
    print(unique(X_enc_df[col]))

[0 1 2 3 4]
[0 1 2 3 4 5 6 7 8 9]
[0 1]
[0 1 2 3 4 5]
[0 1 2 3 4]


In [9]:
# split into train and test sets
# Stratified on y, 30% held out for testing, fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X_enc_df, y, stratify=y, test_size=0.30, random_state=1)

# NumPy versions of each split.
# NOTE(review): these four arrays are rebuilt in a later cell after
# undersampling, so the copies made here are overwritten before use.
X_train_enc = np.array(X_train)
X_test_enc = np.array(X_test)
y_train_enc = np.array(y_train)
y_test_enc = np.array(y_test)
print(X_train_enc.shape)
print(y_train_enc.shape)

(129789, 5)
(129789,)


In [10]:
# split into train and test sets
# NOTE(review): same call with the same arguments as the split in the
# previous cell; this cell looks redundant - confirm and remove.
X_train, X_test, y_train, y_test = train_test_split(X_enc_df, y, stratify=y, test_size=0.30, random_state=1)


In [11]:
# under_samp comes from the project-local module stat_mwb (not shown here).
from stat_mwb import under_samp

#X_res, y_res = under_samp(X_train_enc, y_train_enc)
# Only the training split is passed to under_samp; X_test / y_test are untouched.
# NOTE(review): X_train / y_train are rebound to the result, so re-running
# this cell without re-running the split feeds the previous result back in.
X_train, y_train= under_samp(X_train, y_train)

under_method = RAND
target = None

In under_samp(): X.shape = (129789, 5); y.shape = (129789,)



In [12]:
# Rebuild the NumPy arrays from the current X_train / y_train (rebound by the
# under_samp call above) and the unchanged test split, then show the shapes.
X_train_enc = np.array(X_train)
X_test_enc = np.array(X_test)
y_train_enc = np.array(y_train)
y_test_enc = np.array(y_test)
print(X_train_enc.shape)
print(y_train_enc.shape)

(14482, 5)
(14482,)


## Undersample training data

In [13]:
from stat_mwb import under_samp

#X_res, y_res = under_samp(X_train_enc, y_train_enc)
# NOTE(review): X_train_enc / y_train_enc were built from data that already
# went through under_samp in an earlier cell (recorded input shape here is
# (14482, 5)); this second call looks redundant - confirm.
X_train_enc, y_train_enc = under_samp(X_train_enc, y_train_enc)

under_method = RAND
target = None

In under_samp(): X.shape = (14482, 5); y.shape = (14482,)



In [14]:
# Shape of the training features and the count of each target value.
print(X_train_enc.shape)
print(np.bincount(y_train_enc))

(14482, 5)
[7241 7241]


In [15]:
# make output 3d
# Reshape both target arrays to (n_samples, 1, 1).
# NOTE(review): presumably needed to match the output shape of the model
# built further down - confirm.
y_train_enc = y_train_enc.reshape((len(y_train_enc), 1, 1))
y_test_enc = y_test_enc.reshape((len(y_test_enc), 1, 1))
# Show the new shape and the first three reshaped training targets.
print(y_train_enc.shape)
y_train_enc[:3]

(14482, 1, 1)


array([[[0]],

       [[0]],

       [[0]]])

In [16]:
# Print the distinct encoded values left in each training column.
for col in range(X_train_enc.shape[1]):
    print(unique(X_train_enc[:,col]))

[0 1 2 3 4]
[0 1 2 3 4 5 6 7 9]
[0 1]
[0 1 2 3 4 5]
[0 1 2 3 4]


In [17]:
# Sanity check: (rows, columns) of the training feature array.
X_train_enc.shape

(14482, 5)

In [18]:
# Build one Input layer and one Embedding layer per feature column.
in_layers = list()
em_layers = list()
for col in range(X_train_enc.shape[1]):
    # Size the embedding table from the largest integer code in either split,
    # not from the number of distinct training values: undersampling can
    # remove a category from the training data, and the distinct-value count
    # then understates the largest index the layer has to look up.
    max_code = int(max(X_train_enc[:, col].max(), X_test_enc[:, col].max()))
    # Embedding's input_dim is "maximum integer index + 1".
    n_labels = max_code + 1
    # define input layer
    in_layer = Input(shape=(1,))
    # define embedding layer (10-dimensional vector per category)
    em_layer = Embedding(n_labels, 10)(in_layer)
    # store layers
    in_layers.append(in_layer)
    em_layers.append(em_layer)

In [19]:
# transpose input data to lists
X_train_encl = []
X_test_encl = []
for col in range(X_train_enc.shape[1]):
    X_train_encl.append(X_train_enc[..., [col]])
    X_test_encl.append(X_test_enc[..., [col]])
X_train_encl[0].shape

(14482, 1)

In [20]:
# Print the distinct encoded values in each training column.
# NOTE(review): identical to an earlier cell; looks like a leftover check.
for col in range(X_train_enc.shape[1]):
    print(unique(X_train_enc[:,col]))

[0 1 2 3 4]
[0 1 2 3 4 5 6 7 9]
[0 1]
[0 1 2 3 4 5]
[0 1 2 3 4]


In [21]:
# Debug output: number of input/embedding layers, and the type and length of
# the per-column training input list.
print(len(in_layers))
print(len(em_layers))
print(type(em_layers))
print(type(X_train_encl))
print(len(X_train_encl))

5
5
<class 'list'>
<class 'list'>
5


In [22]:
%%time
# Build, compile, train and evaluate the embedding model in one timed cell.
# NOTE(review): no random seed is set anywhere visible, so the recorded
# metrics will differ from run to run.
# concat all embeddings
merge = concatenate(em_layers)
dense = Dense(10, activation='relu', kernel_initializer='he_normal')(merge)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=in_layers, outputs=output)
# compile the keras model
# NOTE(review): no sample_weight is passed to fit() below, so
# sample_weight_mode='temporal' looks unused - confirm and drop.
model.compile(loss='binary_crossentropy', optimizer='adam', sample_weight_mode='temporal', 
              metrics=['accuracy','Precision','Recall','AUC'])
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC'])

# plot graph: MWB - Requires pydot and graphviz (which wants python 3.9)
#plot_model(model, show_shapes=True, to_file='embeddings.png')

#weights = {0:0.51, 1:18.0}
#weights = {0:1, 1:36}
#weights = np.zeros((10, 2))
#weights[:,0] = 0.51
#weights[:,1] = 18.0 
#print(type(weights))
#print(weights)

# fit the keras model on the dataset
# The inputs are the per-column lists, one array per Input layer.
model.fit(X_train_encl, y_train_enc, epochs=20, batch_size=16, verbose=2) 
#model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2) 
#model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2, 
#          class_weight=weights)
# evaluate the keras model
# The result is unpacked as the loss (discarded) followed by the four
# metrics in the order given to compile().
_, accuracy, prec, recall, auc = model.evaluate(X_test_encl, y_test_enc, verbose=0)
#_, accuracy, prec, recall, auc = model.evaluate(X_test_enc, y_test_enc, verbose=0)
#print('Accuracy: %.2f' % (accuracy*100))
print(f'Accuracy: {accuracy}; Prec: {prec}; Recall: {recall}, AUC: {auc}')

Epoch 1/20
906/906 - 8s - loss: 0.5678 - accuracy: 0.7079 - precision: 0.6679 - recall: 0.8272 - auc: 0.7600
Epoch 2/20
906/906 - 4s - loss: 0.5410 - accuracy: 0.7214 - precision: 0.6592 - recall: 0.9166 - auc: 0.7841
Epoch 3/20
906/906 - 5s - loss: 0.5393 - accuracy: 0.7203 - precision: 0.6588 - recall: 0.9140 - auc: 0.7858
Epoch 4/20
906/906 - 5s - loss: 0.5383 - accuracy: 0.7208 - precision: 0.6588 - recall: 0.9156 - auc: 0.7871
Epoch 5/20
906/906 - 4s - loss: 0.5384 - accuracy: 0.7210 - precision: 0.6586 - recall: 0.9177 - auc: 0.7867
Epoch 6/20
906/906 - 4s - loss: 0.5385 - accuracy: 0.7217 - precision: 0.6592 - recall: 0.9180 - auc: 0.7861
Epoch 7/20
906/906 - 4s - loss: 0.5381 - accuracy: 0.7211 - precision: 0.6587 - recall: 0.9177 - auc: 0.7874
Epoch 8/20
906/906 - 4s - loss: 0.5378 - accuracy: 0.7210 - precision: 0.6592 - recall: 0.9152 - auc: 0.7877
Epoch 9/20
906/906 - 4s - loss: 0.5379 - accuracy: 0.7212 - precision: 0.6590 - recall: 0.9169 - auc: 0.7880
Epoch 10/20
906/906