In [1]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix

# Seed value
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)

import pandas as pd

import raha

del seed_value

1) Input: We load both datasets (dirty and clean) as dirty_table and clean_table.

In [2]:
# Load Data
data='Tax'

dirty_table = pd.read_csv('./datasets/' + data + '/dirty.csv', sep=",", header="infer", encoding="utf-8", dtype=str, keep_default_na=False, low_memory=False)
clean_table = pd.read_csv('./datasets/' + data + '/clean.csv', sep=",", header="infer", encoding="utf-8", dtype=str, keep_default_na=False, low_memory=False)

2) Structure Transformation: Next we rename the column names in the dirty_table to have identical names with the clean dataset. We need this to combine the information of both datasets and create a new one (df). Also we add tid as sequence number for every row. At the end we cute the strings after 100 characters (numcharmax).

In [3]:
# Structure dirty_table and clean_table equal? (names of columns can be different)
Tablestructure_equal = True

In [4]:
# Rename the different columnames
cols_clean_table = list(clean_table.columns.values)
cols_dirty_table = list(dirty_table.columns.values)

if sorted(cols_clean_table) == sorted(cols_dirty_table): 
    print ("The lists are identical")
else : 
    print ("The lists are not identical")
    if Tablestructure_equal == True:
        print ("The dirty and clean have the same structure. We use the columnames from clean for dirty.") 
        dirty_table.columns = cols_clean_table

del cols_clean_table, cols_dirty_table, Tablestructure_equal

The lists are identical


In [5]:
# Add id_
clean_table.insert(0, 'id_', clean_table.index)
clean_table = clean_table.set_index('id_')

dirty_table.insert(0, 'id_', dirty_table.index)
dirty_table = dirty_table.set_index('id_')

dirty_table = dirty_table.replace(r'^\s*$', np.nan, regex=True)
dirty_table = dirty_table.fillna('')
clean_table = clean_table.replace(r'^\s*$', np.nan, regex=True)
clean_table = clean_table.fillna('')

# Generate table attribute with information about columns
attribute = pd.DataFrame(clean_table.columns.to_numpy(), columns = ['name'])
measurer = np.vectorize(len)
attribute['maxnumchar1'] = measurer(dirty_table.astype(str)).max(axis=0)
attribute['maxnumchar']=np.where(attribute['maxnumchar1']>128, 128, attribute['maxnumchar1'])

del measurer

3) Merge: Next we combine the two tables in the dataset df where every cell of the dirty_table / clean_table is saved in the columns value_x / value_y, respectively. For the models we need an attribute value, i.e. a label, which includes 0 (correct) or 1 (wrong). We get this value when comparing value_x and value_y.

In [6]:
# Produce datasets which transformed the table in rows
clean_row=clean_table.unstack().reset_index()
clean_row['Sort'] = clean_row.index
clean_row = clean_row.rename(columns={'level_0':'attribute','level_1':'id_',0:'value'}).sort_values(by=['id_','Sort'])
clean_row=clean_row.reset_index(drop=True).drop(columns='Sort')

dirty_row=dirty_table.unstack().reset_index()
dirty_row['Sort'] = dirty_row.index
dirty_row = dirty_row.rename(columns={'level_0':'attribute','level_1':'id_',0:'value'}).sort_values(by=['id_','Sort'])
dirty_row=dirty_row.reset_index(drop=True).drop(columns='Sort')

# Produce datasets for M2
X_roh = dirty_table
y = clean_table != dirty_table

y = y.astype(int)

del dirty_table, clean_table

In [7]:
# Merge datasets together
df = pd.merge(dirty_row, clean_row, on=['id_', "attribute"])

# Show rows which are empty (1)
df['empty1'] = np.where(df['value_x'] == '', 1, 0)

# Compare content of dirty and clean dataset
df['value'] = np.where(df['value_x'] == df['value_y'], 0, 1)

# Concatenate attributename and value_x (dirty)
df['concat'] = df['attribute'] + '_' + df.value_x.fillna('')

df['length'] = df.value_x.fillna('').str.len()

del dirty_row, clean_row

In [8]:
# Save all possible 'id_' in a dataset
ID_Alle = df.groupby(['id_'], as_index=False)['value'].sum()

In [9]:
df.head()

Unnamed: 0,attribute,id_,value_x,value_y,empty1,value,concat,length
0,f_name,0,Pengyuan,Pengyuan,0,0,f_name_Pengyuan,8
1,l_name,0,Zendler,Zendler,0,0,l_name_Zendler,7
2,gender,0,F,F,0,0,gender_F,1
3,area_code,0,508,508,0,0,area_code_508,3
4,phone,0,744-9007,744-9007,0,0,phone_744-9007,8


In [10]:
df.dtypes

attribute    object
id_           int64
value_x      object
value_y      object
empty1        int32
value         int32
concat       object
length        int64
dtype: object

In [11]:
# Print properties (length and number of errors per column)
num_error_col=0
for attr in attribute['name']:
    df2 = df[df['attribute']==attr]
    maxnumchar = attribute.loc[attribute['name']==attr]['maxnumchar'].to_numpy()[0]
    maxnumchar1 = attribute.loc[attribute['name']==attr]['maxnumchar1'].to_numpy()[0]
    summe = np.sum(df2.value)
    attribute.loc[attribute['name'] == attr, 'error'] = int(summe)
    print(attr)
    print('Max lenght: ' + str(maxnumchar1) + ' --> ' + str(maxnumchar))
    print('Number of errors: ' + str(summe))
    print('')
    if summe > 0:
        num_error_col+=1

print(str(num_error_col) + '/' + str(len(attribute)) + ' faulty attributes')
del attr, df2, summe, num_error_col, maxnumchar, maxnumchar1

f_name
Max lenght: 17 --> 17
Number of errors: 272

l_name
Max lenght: 18 --> 18
Number of errors: 694

gender
Max lenght: 1 --> 1
Number of errors: 0

area_code
Max lenght: 3 --> 3
Number of errors: 0

phone
Max lenght: 8 --> 8
Number of errors: 0

city
Max lenght: 27 --> 27
Number of errors: 200

state
Max lenght: 4 --> 4
Number of errors: 600

zip
Max lenght: 5 --> 5
Number of errors: 31311

marital_status
Max lenght: 1 --> 1
Number of errors: 200

has_child
Max lenght: 1 --> 1
Number of errors: 200

salary
Max lenght: 6 --> 6
Number of errors: 0

rate
Max lenght: 9 --> 9
Number of errors: 87342

single_exemp
Max lenght: 5 --> 5
Number of errors: 200

married_exemp
Max lenght: 5 --> 5
Number of errors: 0

child_exemp
Max lenght: 4 --> 4
Number of errors: 200

10/15 faulty attributes


In [12]:
attribute

Unnamed: 0,name,maxnumchar1,maxnumchar,error
0,f_name,17,17,272.0
1,l_name,18,18,694.0
2,gender,1,1,0.0
3,area_code,3,3,0.0
4,phone,8,8,0.0
5,city,27,27,200.0
6,state,4,4,600.0
7,zip,5,5,31311.0
8,marital_status,1,1,200.0
9,has_child,1,1,200.0


4) Dictionary Generation: Before we can feed the data into a neural network, we need to transform the data types from character to numeric character embedding. We produce a value dictionary (char_index) which contains an index for each character in value_x.

For the ETSB-RNN we also need an attribute dictionary (attribute_index) which includes an index for each attribute.

In [13]:
maxlen = np.max(attribute['maxnumchar'])
print("Maximum value_x length: ", maxlen)
Summe = df.groupby('value')['id_'].count()
print(Summe)
print()
print('Error Rate:'+ str(round(100/(Summe[0]+Summe[1])*Summe[1],2)))

del Summe

Maximum value_x length:  27
value
0    2878781
1     121219
Name: id_, dtype: int64

Error Rate:4.04


In [14]:
# Tokenizer character
tk_char = tf.keras.preprocessing.text.Tokenizer(num_words=False, lower=False, char_level=True)
tk_char.fit_on_texts(df.value_x)
print("Number of characters: " + str(len(tk_char.word_index)))
print(tk_char.word_index)

Number of characters: 69
{'0': 1, '5': 2, '3': 3, '2': 4, '1': 5, '4': 6, '7': 7, '6': 8, '8': 9, '9': 10, 'M': 11, 'N': 12, 'a': 13, 'S': 14, 'e': 15, 'A': 16, 'i': 17, '-': 18, '.': 19, 'n': 20, 'E': 21, 'r': 22, 'o': 23, 'L': 24, 'O': 25, 'R': 26, 'Y': 27, 'I': 28, 'T': 29, 'F': 30, 'l': 31, 's': 32, 'C': 33, 'u': 34, 'h': 35, 't': 36, 'D': 37, 'H': 38, 'K': 39, 'G': 40, 'W': 41, 'B': 42, 'd': 43, 'P': 44, 'm': 45, 'g': 46, 'k': 47, 'V': 48, 'U': 49, ' ': 50, 'c': 51, 'y': 52, 'b': 53, 'v': 54, 'J': 55, 'z': 56, 'p': 57, 'f': 58, 'w': 59, 'j': 60, 'Z': 61, 'X': 62, 'Q': 63, 'x': 64, 'q': 65, "'": 66, '*': 67, ';': 68, '/': 69}


In [15]:
# Tokenizer attribute
tk_attr = tf.keras.preprocessing.text.Tokenizer(num_words=False, filters='', lower=False, char_level=False, split="nosplit")
tk_attr.fit_on_texts(df.attribute)
print("Number of attributs: " + str(len(tk_attr.word_index)))
print(tk_attr.word_index)

Number of attributs: 15
{'f_name': 1, 'l_name': 2, 'gender': 3, 'area_code': 4, 'phone': 5, 'city': 6, 'state': 7, 'zip': 8, 'marital_status': 9, 'has_child': 10, 'salary': 11, 'rate': 12, 'single_exemp': 13, 'married_exemp': 14, 'child_exemp': 15}


In [16]:
# Example
print('Characters: ' + df.value_x[3])
print(np.array(tf.keras.preprocessing.sequence.pad_sequences(tk_char.texts_to_sequences([df.value_x[3]]), maxlen=maxlen, padding='post')))
print()
print('Attribut: ' + df.attribute[3])
print(tk_attr.texts_to_sequences([df.attribute[3]]))
print()
print('Value: ' + str(df.value[3]))
print(tf.keras.utils.to_categorical([df.value[3]], num_classes=2))

Characters: 508
[[2 1 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

Attribut: area_code
[[4]]

Value: 0
[[1. 0.]]


5) Generate a Train- and Testset

In [17]:
# Number of tupels for training
n = 20

train = df[df['id_'].isin(ID_Alle['id_'])]
train_ID = ID_Alle['id_']

train_ID_Rest = ID_Alle['id_']
train_Rest = df[df['id_'].isin(train_ID_Rest)]

train_ID_Manuel_List = []

# Iterate for choosing the next observation
for i in range(0,n):
    # For prefering empty value_x we have to compute the number of this
    empty = train_Rest.groupby(['id_'])['empty1'].agg('sum')
    count = train_Rest.groupby(['id_']).size().to_frame()
    count['empty1'] = empty
    count = count.sort_values(by=[0,'empty1'], ascending=False)
    count.reset_index(inplace=True)
    count = count[count[0]==count[0].max()]
    count = count[count['empty1']==count['empty1'].max()]
    train_ID_Manuel_List.append(count.sample(1, random_state=1)['id_'])
    train_ID_Manuel = pd.Series(train_ID_Manuel_List)
    train_Manuel = df[df['id_'].isin(train_ID_Manuel)]
    train_Rest = train_Rest[~train_Rest.concat.isin(train_Manuel.concat)]
    train_ID_Rest = train_ID[~train_ID.isin(train_ID_Manuel)]

print('Number of train-tupels: ' + str(len(train_ID_Manuel)))

# The records which we dont need for training we use for the testing
test_ID = train_ID_Rest.copy()
test = df[df['id_'].isin(test_ID)]

del i, count, train_ID_Manuel_List, empty, train_ID, train, train_ID_Rest, train_Rest

Number of train-tupels: 20


In [18]:
# Transform the text to numbers
X_train_Manuel=np.array(tf.keras.preprocessing.sequence.pad_sequences(tk_char.texts_to_sequences(train_Manuel.value_x), maxlen=maxlen, padding='post'))
X_train_Manuel_attribute=np.array(tk_attr.texts_to_sequences(train_Manuel.attribute))
Y_train_Manuel=tf.keras.utils.to_categorical(train_Manuel.value, num_classes=2)

X_test=np.array(tf.keras.preprocessing.sequence.pad_sequences(tk_char.texts_to_sequences(test.value_x), maxlen=maxlen, padding='post'))
X_test_attribute=np.array(tk_attr.texts_to_sequences(test.attribute))
Y_test=tf.keras.utils.to_categorical(test.value, num_classes=2)

#X_train_Rest=np.array(tf.keras.preprocessing.sequence.pad_sequences(tk_char.texts_to_sequences(train_Rest.value_x), maxlen=maxlen, padding='post'))
#X_train_Rest_attribute=np.array(tk_attr.texts_to_sequences(train_Rest.attribute))
#Y_train_Rest=tf.keras.utils.to_categorical(train_Rest.value, num_classes=2)

In [19]:
# New random examples 1 (specific length)
#Y_rand_obs=Y_rand_obs = np.zeros((new_example*len(tk_attr.word_index),2))
#Y_rand_obs[:,1]=1

#i=0
#for attr in attribute['name']:
#    X_rand_obs_attribute_1 = np.zeros((new_example,1))
#    X_rand_obs_attribute_1[:,0]=np.array(tk_attr.texts_to_sequences([attr]))
#    maxnumchar = attribute.loc[attribute['name']==attr]['maxnumchar'].to_numpy()[0]
#    X_rand_obs_1=np.ndarray.round(np.random.rand(new_example,maxnumchar)*len(tk_char.word_index),)
#    X_rand_obs_1=np.array(tf.keras.preprocessing.sequence.pad_sequences(X_rand_obs_1, maxlen=maxlen, padding='post'))

#    if i==0:
#        X_rand_obs_attribute=X_rand_obs_attribute_1
#        X_rand_obs=X_rand_obs_1
#    else:
#        X_rand_obs_attribute=np.append(X_rand_obs_attribute,X_rand_obs_attribute_1,axis=0)
#        X_rand_obs=np.append(X_rand_obs,X_rand_obs_1,axis=0)
#    i=+1


In [20]:
# New random examples 2
#X_rand_obs=np.ndarray.round(np.random.rand(new_example*len(tk_attr.word_index),X_train_Manuel_1.shape[1])*len(tk_char.word_index),)
#X_rand_obs_attribute=X_train_Manuel_attribute_1[0:X_rand_obs.shape[0]]
#Y_rand_obs = np.zeros((new_example*len(tk_attr.word_index),2))
#Y_rand_obs[:,1]=1

#X_rand_obs_attribute_1=X_train_Manuel_attribute_1[0:len(tk_attr.word_index)]
#X_rand_obs_attribute=X_rand_obs_attribute_1.copy()
#for i in range(1,new_example):
#    X_rand_obs_attribute=np.append(X_rand_obs_attribute,X_rand_obs_attribute_1,axis=0)

In [21]:
#X_train_Manuel=np.append(X_train_Manuel_1,X_rand_obs,axis=0)
#X_train_Manuel_attribute=np.append(X_train_Manuel_attribute_1,X_rand_obs_attribute,axis=0)
#Y_train_Manuel=np.append(Y_train_Manuel_1,Y_rand_obs,axis=0)

In [22]:
# Number of correct (0) and wrong (1) data in the trainset
np.ndarray.sum(Y_train_Manuel,axis=0)

array([293.,   7.], dtype=float32)

Define model TSB-RNN and ETSB-RNN.

In [27]:
# Parameter for models
n_classes = 2
ver=1

# Hyperparameter
n_epochs = 120
batch_size=round(attribute.shape[0]*5)
#batch_size=round(attribute.shape[0]*n/4)
#batch_size=round(X_train_Manuel.shape[0])

emb_dim_char = round(len(tk_char.word_index)+1)
emb_dim_attr = round(len(tk_attr.word_index)+1)
rnn_dim = 64
rnn_dim_att = 8

In [28]:
# Define TSB-RNN
tf.keras.backend.clear_session()
checkpoint_path = 'checkpoint/' + data + 'p11/checkpoint_p11_m0'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=ver
)

inputA = tf.keras.Input(shape=(maxlen,))

a = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(inputA)

x = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim, return_sequences=True))(a)
x = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim, return_sequences=False))(x)
x = tf.keras.layers.Dense(round(rnn_dim/2), activation="relu")(x)
x = tf.keras.layers.BatchNormalization()(x)
z = tf.keras.layers.Dense(n_classes, activation='softmax')(x)

model = tf.keras.models.Model(inputs=inputA, outputs=z)

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 27)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 27, 70)            4900      
_________________________________________________________________
bidirectional (Bidirectional (None, 27, 128)           17280     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               24704     
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
batch_normalization (BatchNo (None, 32)                128       
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66    

In [29]:
# Train TSB-RNN
#log = model.fit(X_train_Manuel, Y_train_Manuel, shuffle=False, batch_size=batch_size, epochs=n_epochs, validation_data=(X_test, Y_test), callbacks=[checkpoint], verbose=ver)
log = model.fit(X_train_Manuel, Y_train_Manuel, shuffle=False, batch_size=batch_size, epochs=n_epochs, callbacks=[checkpoint], verbose=ver)

Epoch 1/120

Epoch 00001: loss improved from inf to 0.79342, saving model to checkpoint/p11/Tax\checkpoint_p11_m0
Epoch 2/120

Epoch 00002: loss improved from 0.79342 to 0.68542, saving model to checkpoint/p11/Tax\checkpoint_p11_m0
Epoch 3/120

Epoch 00003: loss improved from 0.68542 to 0.61988, saving model to checkpoint/p11/Tax\checkpoint_p11_m0
Epoch 4/120

Epoch 00004: loss improved from 0.61988 to 0.58684, saving model to checkpoint/p11/Tax\checkpoint_p11_m0
Epoch 5/120

Epoch 00005: loss improved from 0.58684 to 0.56924, saving model to checkpoint/p11/Tax\checkpoint_p11_m0
Epoch 6/120

Epoch 00006: loss improved from 0.56924 to 0.55179, saving model to checkpoint/p11/Tax\checkpoint_p11_m0
Epoch 7/120

Epoch 00007: loss improved from 0.55179 to 0.53867, saving model to checkpoint/p11/Tax\checkpoint_p11_m0
Epoch 8/120

Epoch 00008: loss improved from 0.53867 to 0.51102, saving model to checkpoint/p11/Tax\checkpoint_p11_m0
Epoch 9/120

Epoch 00009: loss improved from 0.51102 to 0.48

In [30]:
# Define ETSB-RNN
tf.keras.backend.clear_session()
checkpoint_path1 = 'checkpoint/' + data + 'p11/checkpoint_p11_m1'
checkpoint1 = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path1,
    monitor='loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=ver
)

inputA = tf.keras.Input(shape=(maxlen,))
inputB = tf.keras.Input(shape=(1,))

a = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(inputA)
b = tf.keras.layers.Embedding(emb_dim_attr,emb_dim_attr)(inputB)

x = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim, return_sequences=True))(a)
x = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim, return_sequences=False))(x)
x = tf.keras.models.Model(inputs=inputA, outputs=x)

y = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim_att, return_sequences=True))(b)
y = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim_att, return_sequences=False))(y)
y = tf.keras.models.Model(inputs=inputB, outputs=y)

combined = tf.keras.layers.concatenate([x.output, y.output])
combined = tf.keras.layers.Dense(round(rnn_dim/2), activation="relu")(combined)
combined = tf.keras.layers.BatchNormalization()(combined)
z = tf.keras.layers.Dense(n_classes, activation='softmax')(combined)

model1 = tf.keras.models.Model(inputs=[x.input, y.input], outputs=z)
model1.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model1.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 27)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 27, 70)       4900        input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 16)        256         input_2[0][0]                    
____________________________________________________________________________________________

In [31]:
# Train ETSB-RNN
#log1 = model1.fit(x=[X_train_Manuel,X_train_Manuel_attribute], y=Y_train_Manuel, shuffle=False, batch_size=batch_size, epochs=n_epochs, validation_data=([X_test,X_test_attribute], Y_test), callbacks=[checkpoint1], verbose=ver)
log1 = model1.fit(x=[X_train_Manuel,X_train_Manuel_attribute], y=Y_train_Manuel, shuffle=False, batch_size=batch_size, epochs=n_epochs, callbacks=[checkpoint1], verbose=ver)

Epoch 1/120

Epoch 00001: loss improved from inf to 0.79377, saving model to checkpoint/p11/Tax\checkpoint_p11_m1
Epoch 2/120

Epoch 00002: loss improved from 0.79377 to 0.68173, saving model to checkpoint/p11/Tax\checkpoint_p11_m1
Epoch 3/120

Epoch 00003: loss improved from 0.68173 to 0.63719, saving model to checkpoint/p11/Tax\checkpoint_p11_m1
Epoch 4/120

Epoch 00004: loss improved from 0.63719 to 0.60697, saving model to checkpoint/p11/Tax\checkpoint_p11_m1
Epoch 5/120

Epoch 00005: loss improved from 0.60697 to 0.59126, saving model to checkpoint/p11/Tax\checkpoint_p11_m1
Epoch 6/120

Epoch 00006: loss improved from 0.59126 to 0.57154, saving model to checkpoint/p11/Tax\checkpoint_p11_m1
Epoch 7/120

Epoch 00007: loss improved from 0.57154 to 0.55191, saving model to checkpoint/p11/Tax\checkpoint_p11_m1
Epoch 8/120

Epoch 00008: loss improved from 0.55191 to 0.52739, saving model to checkpoint/p11/Tax\checkpoint_p11_m1
Epoch 9/120

Epoch 00009: loss improved from 0.52739 to 0.50

In [32]:
# Load best weights
model.load_weights(checkpoint_path)
model1.load_weights(checkpoint_path1)

# Evaluate with testsets
scores = model.evaluate(X_test, Y_test)
print('model')
print(str(model.metrics_names[0])+': '+str(scores[0]))
print(str(model.metrics_names[1])+': '+str(scores[1]))

scores1 = model1.evaluate([X_test,X_test_attribute], Y_test)
print('model1')
print(str(model1.metrics_names[0])+': '+str(scores1[0]))
print(str(model1.metrics_names[1])+': '+str(scores1[1]))

model
loss: 0.04310808330774307
accuracy: 0.9964406490325928
model1
loss: 0.031671322882175446
accuracy: 0.9970313906669617


In [141]:
# Plot results over epochs ETSB-RNN
#plt.plot(log.history['loss'], label='Training')
#plt.plot(log.history['val_loss'], label='Testing')
#plt.legend()
#plt.grid()

In [142]:
# Plot results over epochs ETSB-RNN
#plt.plot(log1.history['loss'], label='Training')
#plt.plot(log1.history['val_loss'], label='Testing')
#plt.legend()
#plt.grid()

In [33]:
# Predict testdataset
Y_test_disc = np.argmax(Y_test, axis=1)

Y_pred = model.predict(X_test)
Y_pred_disc = np.argmax(Y_pred, axis=1)

Y_pred1 = model1.predict([X_test,X_test_attribute])
Y_pred_disc1 = np.argmax(Y_pred1, axis=1)

In [34]:
# TSB-RNN
confusion_matrix(Y_test_disc, Y_pred_disc)

array([[2870532,    7956],
       [   2721,  118491]], dtype=int64)

In [35]:
# ETSB-RNN
confusion_matrix(Y_test_disc, Y_pred_disc1)

array([[2872149,    6339],
       [   2566,  118646]], dtype=int64)

In [36]:
# Number of errors (1)
test.groupby('value')['value_x'].count()

value
0    2878488
1     121212
Name: value_x, dtype: int64

In [37]:
# Measures TSB-RNN
Summe = test.groupby('value')['value_x'].count()
print('Error Rate: '+ str(round(100/(Summe[0]+Summe[1])*Summe[1],2)))
loss = scores[0]
print('Loss: {:.4f}'.format(loss))
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y_test_disc, Y_pred_disc)
print('Accuracy: {:.2f}%'.format(accuracy*100))
# precision tp / (tp + fp)
precision = precision_score(Y_test_disc, Y_pred_disc)
print('Precision: {:.2f}%'.format(precision*100))
# recall: tp / (tp + fn)
recall = recall_score(Y_test_disc, Y_pred_disc)
print('Recall: {:.2f}%'.format(recall*100))
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y_test_disc, Y_pred_disc)
print('F1 score: {:.2f}%'.format(f1*100))

Error Rate: 4.04
Loss: 0.0431
Accuracy: 99.64%
Precision: 93.71%
Recall: 97.76%
F1 score: 95.69%


In [38]:
# Measures ETSB-RNN
Summe = test.groupby('value')['value_x'].count()
print('Error Rate: '+ str(round(100/(Summe[0]+Summe[1])*Summe[1],2)))
loss = scores1[0]
print('Loss: {:.4f}'.format(loss))
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y_test_disc, Y_pred_disc1)
print('Accuracy: {:.2f}%'.format(accuracy*100))
# precision tp / (tp + fp)
precision = precision_score(Y_test_disc, Y_pred_disc1)
print('Precision: {:.2f}%'.format(precision*100))
# recall: tp / (tp + fn)
recall = recall_score(Y_test_disc, Y_pred_disc1)
print('Recall: {:.2f}%'.format(recall*100))
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y_test_disc, Y_pred_disc1)
print('F1 score: {:.2f}%'.format(f1*100))

Error Rate: 4.04
Loss: 0.0317
Accuracy: 99.70%
Precision: 94.93%
Recall: 97.88%
F1 score: 96.38%


In [39]:
# Generate dataset with results from TSB-RNN (Model 0) and ETSB-RNN (Model 1)
df1 = test.copy()
df1['M0_pred'] = np.round(1-Y_pred[:,0],2)
df1['M1_pred'] = np.round(1-Y_pred1[:,0],2)
df1['M0_pred_disc'] = Y_pred_disc
df1['M1_pred_disc'] = Y_pred_disc1
df1['M0'] = np.where(df1['M0_pred_disc'] != df1['value'],1,0)
df1['M1'] = np.where(df1['M1_pred_disc'] != df1['value'],1,0)
df1['M0_M1'] = np.where((df1['M0_pred_disc'] != df1['value']) & (df1['M1_pred_disc'] != df1['value']),1,0)