In [1]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix

# Seed value shared by all random number generators below, so that repeated
# runs of the notebook start from the same random state
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): the interpreter reads PYTHONHASHSEED when it starts, so setting
# it here presumably only reaches child processes - confirm whether the kernel
# itself has to be launched with the variable already set.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)

import pandas as pd

import raha

# The seed is only needed during set-up; remove it from the notebook namespace
del seed_value

1) Input: We load both datasets (dirty and clean) as dirty_table and clean_table.

In [2]:
# Load Data
# `data` names the dataset folder below ./datasets/ and is reused later for the checkpoint paths
data = 'beers'

# dirty.csv and clean.csv are parsed with identical options: every column is
# read as text (dtype=str) and pandas' default NaN markers are switched off
# (keep_default_na=False).
dirty_table, clean_table = (
    pd.read_csv(
        f'./datasets/{data}/{version}.csv',
        sep=",",
        header="infer",
        encoding="utf-8",
        dtype=str,
        keep_default_na=False,
        low_memory=False,
    )
    for version in ('dirty', 'clean')
)

2) Structure Transformation: Next we rename the columns of dirty_table so that they are identical to those of the clean dataset. We need this to combine the information of both datasets into a new one (df). We also add id_ as a sequence number for every row. Finally, we cap the string length at 128 characters (maxnumchar).

In [3]:
# Flag: do dirty_table and clean_table share the same structure? (the column names may still differ)
# When it is True, the renaming step below copies the column names of clean_table onto dirty_table.
Tablestructure_equal = True

In [4]:
# Rename the different columnames
# The clean table is the reference; dirty_table receives its column names when
# the two name sets differ but the structure is declared equal.
cols_clean_table = list(clean_table.columns.values)
cols_dirty_table = list(dirty_table.columns.values)

if sorted(cols_clean_table) == sorted(cols_dirty_table):
    print ("The lists are identical")
else:
    print ("The lists are not identical")
    if Tablestructure_equal:
        # The renaming is purely positional, so it is only meaningful when both
        # tables have the same number of columns. Fail early with a clear message.
        if len(cols_clean_table) != len(cols_dirty_table):
            raise ValueError(
                "Cannot copy the column names: clean has "
                + str(len(cols_clean_table)) + " columns but dirty has "
                + str(len(cols_dirty_table)) + "."
            )
        print ("The dirty and clean have the same structure. We use the columnames from clean for dirty.")
        dirty_table.columns = cols_clean_table

# Temporary names are removed to keep the notebook namespace small
del cols_clean_table, cols_dirty_table, Tablestructure_equal

The lists are not identical
The dirty and clean have the same structure. We use the columnames from clean for dirty.


In [5]:
# Add id_
def _index_by_id_and_blank(table):
    """Index `table` by a new 'id_' column and normalise blank cells.

    'id_' is a copy of the current row index. Cells that are empty or contain
    only whitespace are replaced by the empty string ''.
    """
    table.insert(0, 'id_', table.index)
    return table.set_index('id_').replace(r'^\s*$', np.nan, regex=True).fillna('')


clean_table = _index_by_id_and_blank(clean_table)
dirty_table = _index_by_id_and_blank(dirty_table)
del _index_by_id_and_blank

# Generate table attribute with information about columns:
#   name        - column name (taken from the clean table)
#   maxnumchar1 - longest value of that column in the dirty table, in characters
#   maxnumchar  - the same length, capped at 128
attribute = pd.DataFrame({'name': clean_table.columns.to_numpy()})
attribute['maxnumchar1'] = np.vectorize(len)(dirty_table.astype(str)).max(axis=0)
attribute['maxnumchar'] = attribute['maxnumchar1'].clip(upper=128)

3) Merge: Next we combine the two tables into the dataset df, where every cell of dirty_table / clean_table is stored in the column value_x / value_y, respectively. For the models we need a label column, value, which is 0 (correct) or 1 (wrong). We obtain it by comparing value_x and value_y.

In [6]:
# Produce datasets which transformed the table in rows
def _cells_as_rows(table):
    """Reshape `table` so that every cell becomes one row.

    The result has the columns attribute (source column name), id_ (source row)
    and value (cell content). Rows are ordered by id_ and, within one id_, by
    the position produced by unstack(), which is tracked in the temporary
    'Sort' column.
    """
    return (
        table.unstack()
        .reset_index()
        .assign(Sort=lambda frame: frame.index)
        .rename(columns={'level_0':'attribute','level_1':'id_',0:'value'})
        .sort_values(by=['id_','Sort'])
        .reset_index(drop=True)
        .drop(columns='Sort')
    )


clean_row = _cells_as_rows(clean_table)
dirty_row = _cells_as_rows(dirty_table)
del _cells_as_rows

# Produce datasets for M2
# X_roh is the dirty table itself; y marks every cell that differs from the
# clean table with 1 and every identical cell with 0.
X_roh = dirty_table
y = (clean_table != dirty_table).astype(int)

del dirty_table, clean_table

In [7]:
# Merge datasets together: value_x holds the dirty cell, value_y the clean cell
df = dirty_row.merge(clean_row, on=['id_', "attribute"])

df = df.assign(
    # 1 when the dirty cell is empty, else 0
    empty1=df['value_x'].eq('').astype(int),
    # Label: 0 when dirty and clean content agree, 1 when they differ
    value=df['value_x'].ne(df['value_y']).astype(int),
    # Attribute name and dirty value joined by '_'
    concat=df['attribute'] + '_' + df['value_x'].fillna(''),
    # Number of characters of the dirty value
    length=df['value_x'].fillna('').str.len(),
)

del dirty_row, clean_row

In [8]:
# Save all possible 'id_' in a dataset (one row per id_, with the number of wrong cells of that tuple in 'value')
ID_Alle = df[['id_', 'value']].groupby('id_', as_index=False).sum()

In [9]:
# Preview the first rows of the merged cell-level dataset
df.head()

Unnamed: 0,attribute,id_,value_x,value_y,empty1,value,concat,length
0,index,0,1,1,0,0,index_1,1
1,id,0,1436,1436,0,0,id_1436,4
2,beer-name,0,Pub Beer,Pub Beer,0,0,beer-name_Pub Beer,8
3,style,0,American Pale Lager,American Pale Lager,0,0,style_American Pale Lager,19
4,ounces,0,12.0 oz,12,0,1,ounces_12.0 oz,7


In [10]:
# Check the column types of the merged dataset
df.dtypes

attribute    object
id_           int64
value_x      object
value_y      object
empty1        int64
value         int64
concat       object
length        int64
dtype: object

In [11]:
# Print properties (length and number of errors per column)
def _report_attribute_errors(cells, columns):
    """Store the number of wrong cells per attribute and print a summary.

    Parameters
    ----------
    cells : DataFrame with the columns 'attribute' and 'value' (1 = wrong cell).
    columns : DataFrame with the columns 'name', 'maxnumchar1' and 'maxnumchar'.
        It receives a new integer column 'error'.
    """
    # One grouped pass over the cells instead of filtering once per attribute
    errors_per_attribute = cells.groupby('attribute')['value'].sum()
    # Writing the whole column at once keeps 'error' an integer column; filling
    # it row by row through .loc first creates a NaN-filled float column.
    columns['error'] = columns['name'].map(errors_per_attribute).fillna(0).astype(int)

    for name, raw_length, capped_length, n_errors in zip(
        columns['name'], columns['maxnumchar1'], columns['maxnumchar'], columns['error']
    ):
        print(name)
        print('Max lenght: ' + str(raw_length) + ' --> ' + str(capped_length))
        print('Number of errors: ' + str(n_errors))
        print('')

    # Number of attributes that contain at least one wrong cell
    num_error_col = int((columns['error'] > 0).sum())
    print(str(num_error_col) + '/' + str(len(columns)) + ' faulty attributes')


_report_attribute_errors(df, attribute)
del _report_attribute_errors

index
Max lenght: 4 --> 4
Number of errors: 0

id
Max lenght: 4 --> 4
Number of errors: 0

beer-name
Max lenght: 52 --> 52
Number of errors: 0

style
Max lenght: 35 --> 35
Number of errors: 0

ounces
Max lenght: 18 --> 18
Number of errors: 2410

abv
Max lenght: 21 --> 21
Number of errors: 693

ibu
Max lenght: 3 --> 3
Number of errors: 1005

brewery_id
Max lenght: 3 --> 3
Number of errors: 0

brewery-name
Max lenght: 35 --> 35
Number of errors: 0

city
Max lenght: 21 --> 21
Number of errors: 127

state
Max lenght: 2 --> 2
Number of errors: 127

5/11 faulty attributes


In [12]:
# Show the per-column metadata (name, value lengths, number of errors)
attribute

Unnamed: 0,name,maxnumchar1,maxnumchar,error
0,index,4,4,0.0
1,id,4,4,0.0
2,beer-name,52,52,0.0
3,style,35,35,0.0
4,ounces,18,18,2410.0
5,abv,21,21,693.0
6,ibu,3,3,1005.0
7,brewery_id,3,3,0.0
8,brewery-name,35,35,0.0
9,city,21,21,127.0


4) Dictionary Generation: Before we can feed the data into a neural network, we need to transform the data types from character to numeric character embedding. We produce a value dictionary (char_index) which contains an index for each character in value_x.

For the ETSB-RNN we also need an attribute dictionary (attribute_index) which includes an index for each attribute.

In [13]:
# Longest capped value length over all attributes; used below as the padded sequence length
maxlen = attribute['maxnumchar'].max()
print("Maximum value_x length: ", maxlen)

# Number of cells per label (0 = correct, 1 = wrong) and the share of wrong cells in percent
Summe = df.groupby('value')['id_'].count()
print(Summe)
print()
print(f"Error Rate:{round(100/(Summe[0]+Summe[1])*Summe[1],2)}")

del Summe

Maximum value_x length:  52
value
0    22148
1     4362
Name: id_, dtype: int64

Error Rate:16.45


In [14]:
# Tokenizer character
# Character-level vocabulary built from the dirty values; case is preserved
# (lower=False). num_words=None is the documented way to keep the whole
# vocabulary (False only worked because it is falsy).
tk_char = tf.keras.preprocessing.text.Tokenizer(num_words=None, lower=False, char_level=True)
tk_char.fit_on_texts(df.value_x)
print("Number of characters: " + str(len(tk_char.word_index)))
print(tk_char.word_index)

Number of characters: 86
{' ': 1, 'e': 2, 'n': 3, 'r': 4, 'a': 5, '0': 6, 'o': 7, 'i': 8, '1': 9, 'A': 10, 'l': 11, '.': 12, '2': 13, 't': 14, 'm': 15, 'B': 16, '5': 17, '6': 18, 's': 19, 'c': 20, 'g': 21, 'C': 22, '9': 23, 'y': 24, '4': 25, 'u': 26, '3': 27, 'w': 28, 'P': 29, 'p': 30, 'd': 31, '7': 32, '8': 33, 'h': 34, 'z': 35, 'I': 36, 'S': 37, 'N': 38, '/': 39, 'k': 40, 'M': 41, 'b': 42, 'O': 43, 'R': 44, 'W': 45, 'L': 46, 'T': 47, 'H': 48, 'D': 49, 'v': 50, '%': 51, 'F': 52, 'G': 53, 'f': 54, '(': 55, ')': 56, 'E': 57, 'V': 58, 'K': 59, "'": 60, 'Z': 61, 'x': 62, '-': 63, 'U': 64, 'Y': 65, 'X': 66, 'J': 67, 'ö': 68, '&': 69, 'q': 70, 'j': 71, 'ä': 72, 'Q': 73, '’': 74, '#': 75, '!': 76, 'è': 77, ':': 78, ',': 79, '°': 80, 'é': 81, '™': 82, 'í': 83, 'ü': 84, '‘': 85, '?': 86}


In [15]:
# Tokenizer attribute
# One index per attribute name. filters='' disables character filtering and
# lower=False preserves case.
# NOTE(review): split="nosplit" is presumably a separator that never occurs, so
# that each attribute name stays a single token - confirm.
# num_words=None is the documented way to keep the whole vocabulary (False only
# worked because it is falsy).
tk_attr = tf.keras.preprocessing.text.Tokenizer(num_words=None, filters='', lower=False, char_level=False, split="nosplit")
tk_attr.fit_on_texts(df.attribute)
print("Number of attributs: " + str(len(tk_attr.word_index)))
print(tk_attr.word_index)

Number of attributs: 11
{'index': 1, 'id': 2, 'beer-name': 3, 'style': 4, 'ounces': 5, 'abv': 6, 'ibu': 7, 'brewery_id': 8, 'brewery-name': 9, 'city': 10, 'state': 11}


In [16]:
# Example: how the cell with row label 3 of df is encoded for the networks
example_text = df.at[3, 'value_x']
example_attribute = df.at[3, 'attribute']
example_label = df.at[3, 'value']

# Dirty value -> character indices, zero-padded at the end up to maxlen
print('Characters: ' + example_text)
example_sequence = tk_char.texts_to_sequences([example_text])
print(np.array(tf.keras.preprocessing.sequence.pad_sequences(example_sequence, maxlen=maxlen, padding='post')))
print()
# Attribute name -> attribute index
print('Attribut: ' + example_attribute)
print(tk_attr.texts_to_sequences([example_attribute]))
print()
# Label -> one-hot vector with two classes
print('Value: ' + str(example_label))
print(tf.keras.utils.to_categorical([example_label], num_classes=2))

del example_text, example_attribute, example_label, example_sequence

Characters: American Pale Lager
[[10 15  2  4  8 20  5  3  1 29  5 11  2  1 46  5 21  2  4  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]]

Attribut: style
[[4]]

Value: 0
[[1. 0.]]


5) Generate a Train- and Testset

In [17]:
# Number of tupels for training
n = 20

train_ID = ID_Alle['id_']

# Pool of cells whose attribute/value combination (column 'concat') is not yet
# covered by a selected tuple. It starts with all cells.
train_Rest = df[df['id_'].isin(train_ID)]

train_ID_Manuel_List = []

# Iterate for choosing the next observation.
# Each round picks the tuple with the most cells left in the pool; ties are
# broken in favour of the tuple with the most empty cells, and remaining ties
# by a draw with a fixed random_state.
for i in range(0,n):
    if train_Rest.empty:
        # Every attribute/value combination is already covered, so there is
        # nothing left to rank. Stop instead of failing inside sample().
        print('No uncovered cells left after ' + str(i) + ' tupels; stopping early.')
        break
    # For prefering empty value_x we have to compute the number of this
    empty = train_Rest.groupby(['id_'])['empty1'].agg('sum')
    count = train_Rest.groupby(['id_']).size().to_frame()
    count['empty1'] = empty
    count = count.sort_values(by=[0,'empty1'], ascending=False)
    count.reset_index(inplace=True)
    count = count[count[0]==count[0].max()]
    count = count[count['empty1']==count['empty1'].max()]
    # .iloc[0] extracts the scalar explicitly; calling int() on a one-element
    # Series is deprecated in pandas.
    train_ID_Manuel_List.append(int(count.sample(1, random_state=1)['id_'].iloc[0]))
    train_ID_Manuel = pd.Series(train_ID_Manuel_List)
    train_Manuel = df[df['id_'].isin(train_ID_Manuel)]
    # Drop every cell whose attribute/value combination now occurs in the training tuples
    train_Rest = train_Rest[~train_Rest.concat.isin(train_Manuel.concat)]

# All ids that were not selected (only the final value is needed, so it is computed once)
train_ID_Rest = train_ID[~train_ID.isin(train_ID_Manuel)]

print('Number of train-tupels: ' + str(len(train_ID_Manuel)))

# The records which we dont need for training we use for the testing
test_ID = train_ID_Rest.copy()
test = df[df['id_'].isin(test_ID)]

del i, count, train_ID_Manuel_List, empty, train_ID, train_ID_Rest, train_Rest

Number of train-tupels: 20


In [18]:
# Transform the text to numbers
def _encode_cells(cells):
    """Encode the rows of `cells` for the networks.

    Returns a triple:
      - character indices of value_x, zero-padded at the end up to maxlen,
      - attribute indices,
      - one-hot labels with two classes.
    """
    char_sequences = tk_char.texts_to_sequences(cells.value_x)
    padded_chars = np.array(tf.keras.preprocessing.sequence.pad_sequences(char_sequences, maxlen=maxlen, padding='post'))
    attribute_ids = np.array(tk_attr.texts_to_sequences(cells.attribute))
    labels = tf.keras.utils.to_categorical(cells.value, num_classes=2)
    return padded_chars, attribute_ids, labels


X_train_Manuel, X_train_Manuel_attribute, Y_train_Manuel = _encode_cells(train_Manuel)
X_test, X_test_attribute, Y_test = _encode_cells(test)
del _encode_cells

#X_train_Rest=np.array(tf.keras.preprocessing.sequence.pad_sequences(tk_char.texts_to_sequences(train_Rest.value_x), maxlen=maxlen, padding='post'))
#X_train_Rest_attribute=np.array(tk_attr.texts_to_sequences(train_Rest.attribute))
#Y_train_Rest=tf.keras.utils.to_categorical(train_Rest.value, num_classes=2)

In [19]:
# New random examples 1 (specific length)
#Y_rand_obs=Y_rand_obs = np.zeros((new_example*len(tk_attr.word_index),2))
#Y_rand_obs[:,1]=1

#i=0
#for attr in attribute['name']:
#    X_rand_obs_attribute_1 = np.zeros((new_example,1))
#    X_rand_obs_attribute_1[:,0]=np.array(tk_attr.texts_to_sequences([attr]))
#    maxnumchar = attribute.loc[attribute['name']==attr]['maxnumchar'].to_numpy()[0]
#    X_rand_obs_1=np.ndarray.round(np.random.rand(new_example,maxnumchar)*len(tk_char.word_index),)
#    X_rand_obs_1=np.array(tf.keras.preprocessing.sequence.pad_sequences(X_rand_obs_1, maxlen=maxlen, padding='post'))

#    if i==0:
#        X_rand_obs_attribute=X_rand_obs_attribute_1
#        X_rand_obs=X_rand_obs_1
#    else:
#        X_rand_obs_attribute=np.append(X_rand_obs_attribute,X_rand_obs_attribute_1,axis=0)
#        X_rand_obs=np.append(X_rand_obs,X_rand_obs_1,axis=0)
#    i=+1


In [20]:
# New random examples 2
#X_rand_obs=np.ndarray.round(np.random.rand(new_example*len(tk_attr.word_index),X_train_Manuel_1.shape[1])*len(tk_char.word_index),)
#X_rand_obs_attribute=X_train_Manuel_attribute_1[0:X_rand_obs.shape[0]]
#Y_rand_obs = np.zeros((new_example*len(tk_attr.word_index),2))
#Y_rand_obs[:,1]=1

#X_rand_obs_attribute_1=X_train_Manuel_attribute_1[0:len(tk_attr.word_index)]
#X_rand_obs_attribute=X_rand_obs_attribute_1.copy()
#for i in range(1,new_example):
#    X_rand_obs_attribute=np.append(X_rand_obs_attribute,X_rand_obs_attribute_1,axis=0)

In [21]:
#X_train_Manuel=np.append(X_train_Manuel_1,X_rand_obs,axis=0)
#X_train_Manuel_attribute=np.append(X_train_Manuel_attribute_1,X_rand_obs_attribute,axis=0)
#Y_train_Manuel=np.append(Y_train_Manuel_1,Y_rand_obs,axis=0)

In [22]:
# Number of correct (0) and wrong (1) data in the trainset (column sums of the one-hot labels)
Y_train_Manuel.sum(axis=0)

array([188.,  32.], dtype=float32)

Define model TSB-RNN and ETSB-RNN.

In [23]:
# Parameter for models
n_classes = 2   # size of the softmax output layer
ver = 1         # verbosity level handed to the Keras callbacks and fit()

# Hyperparameter
n_epochs = 120
# Five times the number of attributes
batch_size = attribute.shape[0] * 5
#batch_size=round(attribute.shape[0]*n/4)
#batch_size=round(X_train_Manuel.shape[0])

# Vocabulary size plus one; the tokenizer indices start at 1 and 0 is the padding value
emb_dim_char = len(tk_char.word_index) + 1
emb_dim_attr = len(tk_attr.word_index) + 1
rnn_dim = 64      # units of each recurrent layer on the character input
rnn_dim_att = 8   # units of each recurrent layer on the attribute input

In [24]:
# Define TSB-RNN
# Reset the Keras backend state before building the network
tf.keras.backend.clear_session()
# Weights are written to a dataset-specific checkpoint path
checkpoint_path = 'checkpoint/' + data + 'p11/checkpoint_p11_m0'
# Save only the weights, and only when the monitored 'loss' improves
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=ver
)

# Input: one padded sequence of character indices of length maxlen per cell
inputA = tf.keras.Input(shape=(maxlen,))

# Character embedding; mask_zero=True treats index 0 (the padding value) as masked
a = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(inputA)

# Two stacked bidirectional simple RNN layers; the second returns only its final output
x = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim, return_sequences=True))(a)
x = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim, return_sequences=False))(x)
# Dense layer with half as many units as rnn_dim, followed by batch normalisation
x = tf.keras.layers.Dense(round(rnn_dim/2), activation="relu")(x)
x = tf.keras.layers.BatchNormalization()(x)
# Softmax output with one unit per class
z = tf.keras.layers.Dense(n_classes, activation='softmax')(x)

model = tf.keras.models.Model(inputs=inputA, outputs=z)

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 52)]              0         
                                                                 
 embedding (Embedding)       (None, 52, 87)            7569      
                                                                 
 bidirectional (Bidirectiona  (None, 52, 128)          19456     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              24704     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 batch_normalization (BatchN  (None, 32)               128   

In [25]:
# Train TSB-RNN
# The commented variant additionally passes the test set as validation data.
#log = model.fit(X_train_Manuel, Y_train_Manuel, shuffle=False, batch_size=batch_size, epochs=n_epochs, validation_data=(X_test, Y_test), callbacks=[checkpoint], verbose=ver)
log = model.fit(
    X_train_Manuel,
    Y_train_Manuel,
    shuffle=False,
    batch_size=batch_size,
    epochs=n_epochs,
    callbacks=[checkpoint],
    verbose=ver,
)

Epoch 1/120


2022-05-06 10:15:08.071938: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1: loss improved from inf to 0.62602, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 2/120
Epoch 2: loss improved from 0.62602 to 0.38394, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 3/120
Epoch 3: loss improved from 0.38394 to 0.31807, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 4/120
Epoch 4: loss improved from 0.31807 to 0.27853, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 5/120
Epoch 5: loss improved from 0.27853 to 0.25072, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 6/120
Epoch 6: loss improved from 0.25072 to 0.23525, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 7/120
Epoch 7: loss improved from 0.23525 to 0.20919, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 8/120
Epoch 8: loss improved from 0.20919 to 0.19089, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 9/120
Epoch 9: loss improved from 0.19089 to 0.18291, saving model to checkpoint/beersp11/checkpoi

Epoch 31: loss improved from 0.04430 to 0.04398, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 32/120
Epoch 32: loss improved from 0.04398 to 0.03908, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 33/120
Epoch 33: loss improved from 0.03908 to 0.03694, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 34/120
Epoch 34: loss improved from 0.03694 to 0.03480, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 35/120
Epoch 35: loss improved from 0.03480 to 0.03309, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 36/120
Epoch 36: loss improved from 0.03309 to 0.02959, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 37/120
Epoch 37: loss improved from 0.02959 to 0.02759, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 38/120
Epoch 38: loss improved from 0.02759 to 0.02623, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 39/120
Epoch 39: loss improved from 0.02623 to 0.02561, saving model to checkpo

Epoch 61: loss did not improve from 0.01147
Epoch 62/120
Epoch 62: loss did not improve from 0.01147
Epoch 63/120
Epoch 63: loss improved from 0.01147 to 0.01114, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 64/120
Epoch 64: loss improved from 0.01114 to 0.01081, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 65/120
Epoch 65: loss improved from 0.01081 to 0.01056, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 66/120
Epoch 66: loss improved from 0.01056 to 0.01046, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 67/120
Epoch 67: loss improved from 0.01046 to 0.01041, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 68/120
Epoch 68: loss did not improve from 0.01041
Epoch 69/120
Epoch 69: loss did not improve from 0.01041
Epoch 70/120
Epoch 70: loss did not improve from 0.01041
Epoch 71/120
Epoch 71: loss improved from 0.01041 to 0.01035, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 72/120
Epoch 72: loss impr

Epoch 93/120
Epoch 93: loss did not improve from 0.00929
Epoch 94/120
Epoch 94: loss did not improve from 0.00929
Epoch 95/120
Epoch 95: loss did not improve from 0.00929
Epoch 96/120
Epoch 96: loss did not improve from 0.00929
Epoch 97/120
Epoch 97: loss did not improve from 0.00929
Epoch 98/120
Epoch 98: loss did not improve from 0.00929
Epoch 99/120
Epoch 99: loss did not improve from 0.00929
Epoch 100/120
Epoch 100: loss did not improve from 0.00929
Epoch 101/120
Epoch 101: loss improved from 0.00929 to 0.00926, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 102/120
Epoch 102: loss improved from 0.00926 to 0.00923, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 103/120
Epoch 103: loss improved from 0.00923 to 0.00921, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 104/120
Epoch 104: loss improved from 0.00921 to 0.00917, saving model to checkpoint/beersp11/checkpoint_p11_m0
Epoch 105/120
Epoch 105: loss improved from 0.00917 to 0.00916, saving

In [26]:
# Define ETSB-RNN
# Reset the Keras backend state before building the second network
tf.keras.backend.clear_session()
checkpoint_path1 = 'checkpoint/' + data + 'p11/checkpoint_p11_m1'
# Save only the weights, and only when the monitored 'loss' improves
checkpoint1 = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path1,
    monitor='loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=ver
)

# Two inputs per cell: the padded character indices and the attribute index
inputA = tf.keras.Input(shape=(maxlen,))
inputB = tf.keras.Input(shape=(1,))

# mask_zero=True treats index 0 (the padding value) of the character input as masked
a = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(inputA)
b = tf.keras.layers.Embedding(emb_dim_attr,emb_dim_attr)(inputB)

# The two branches get their own names (char_branch / attr_branch) instead of
# x / y, so that the label frame `y` created during data preparation is not
# overwritten by a Keras model. The order of the layer and model constructions
# is unchanged.

# Character branch: two stacked bidirectional simple RNN layers
char_branch = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim, return_sequences=True))(a)
char_branch = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim, return_sequences=False))(char_branch)
char_branch = tf.keras.models.Model(inputs=inputA, outputs=char_branch)

# Attribute branch: the same layout with rnn_dim_att units
attr_branch = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim_att, return_sequences=True))(b)
attr_branch = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=rnn_dim_att, return_sequences=False))(attr_branch)
attr_branch = tf.keras.models.Model(inputs=inputB, outputs=attr_branch)

# Concatenate both branch outputs and classify
combined = tf.keras.layers.concatenate([char_branch.output, attr_branch.output])
combined = tf.keras.layers.Dense(round(rnn_dim/2), activation="relu")(combined)
combined = tf.keras.layers.BatchNormalization()(combined)
# Softmax output with one unit per class
z = tf.keras.layers.Dense(n_classes, activation='softmax')(combined)

model1 = tf.keras.models.Model(inputs=[char_branch.input, attr_branch.input], outputs=z)
model1.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model1.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 52)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 52, 87)       7569        ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 12)        144         ['input_2[0][0]']                
                                                                                            

In [27]:
# Train ETSB-RNN
# The commented variant additionally passes the test set as validation data.
#log1 = model1.fit(x=[X_train_Manuel,X_train_Manuel_attribute], y=Y_train_Manuel, shuffle=False, batch_size=batch_size, epochs=n_epochs, validation_data=([X_test,X_test_attribute], Y_test), callbacks=[checkpoint1], verbose=ver)
log1 = model1.fit(
    x=[X_train_Manuel, X_train_Manuel_attribute],
    y=Y_train_Manuel,
    shuffle=False,
    batch_size=batch_size,
    epochs=n_epochs,
    callbacks=[checkpoint1],
    verbose=ver,
)

Epoch 1/120
Epoch 1: loss improved from inf to 0.56441, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 2/120
Epoch 2: loss improved from 0.56441 to 0.31339, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 3/120
Epoch 3: loss improved from 0.31339 to 0.24508, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 4/120
Epoch 4: loss improved from 0.24508 to 0.21231, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 5/120
Epoch 5: loss improved from 0.21231 to 0.18707, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 6/120
Epoch 6: loss improved from 0.18707 to 0.16906, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 7/120
Epoch 7: loss improved from 0.16906 to 0.15185, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 8/120
Epoch 8: loss improved from 0.15185 to 0.14076, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 9/120
Epoch 9: loss improved from 0.14076 to 0.13176, saving model to checkpoint/beers

Epoch 31/120
Epoch 31: loss improved from 0.01783 to 0.01597, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 32/120
Epoch 32: loss improved from 0.01597 to 0.01433, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 33/120
Epoch 33: loss improved from 0.01433 to 0.01293, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 34/120
Epoch 34: loss improved from 0.01293 to 0.01168, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 35/120
Epoch 35: loss improved from 0.01168 to 0.01056, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 36/120
Epoch 36: loss improved from 0.01056 to 0.00955, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 37/120
Epoch 37: loss improved from 0.00955 to 0.00872, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 38/120
Epoch 38: loss improved from 0.00872 to 0.00810, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 39/120
Epoch 39: loss did not improve from 0.00810
Epoch 40/120
Ep

Epoch 61/120
Epoch 61: loss did not improve from 0.00080
Epoch 62/120
Epoch 62: loss improved from 0.00080 to 0.00069, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 63/120
Epoch 63: loss improved from 0.00069 to 0.00058, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 64/120
Epoch 64: loss improved from 0.00058 to 0.00051, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 65/120
Epoch 65: loss improved from 0.00051 to 0.00045, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 66/120
Epoch 66: loss improved from 0.00045 to 0.00041, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 67/120
Epoch 67: loss improved from 0.00041 to 0.00037, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 68/120
Epoch 68: loss improved from 0.00037 to 0.00035, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 69/120
Epoch 69: loss did not improve from 0.00035
Epoch 70/120
Epoch 70: loss improved from 0.00035 to 0.00031, saving model

Epoch 90/120
Epoch 90: loss improved from 0.00003 to 0.00003, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 91/120
Epoch 91: loss improved from 0.00003 to 0.00003, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 92/120
Epoch 92: loss improved from 0.00003 to 0.00002, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 93/120
Epoch 93: loss improved from 0.00002 to 0.00002, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 94/120
Epoch 94: loss improved from 0.00002 to 0.00002, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 95/120
Epoch 95: loss improved from 0.00002 to 0.00002, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 96/120
Epoch 96: loss improved from 0.00002 to 0.00001, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 97/120
Epoch 97: loss improved from 0.00001 to 0.00001, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 98/120
Epoch 98: loss improved from 0.00001 to 0.00001, saving mod

Epoch 118/120
Epoch 118: loss improved from 0.00000 to 0.00000, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 119/120
Epoch 119: loss improved from 0.00000 to 0.00000, saving model to checkpoint/beersp11/checkpoint_p11_m1
Epoch 120/120
Epoch 120: loss improved from 0.00000 to 0.00000, saving model to checkpoint/beersp11/checkpoint_p11_m1


In [28]:
# Load best weights (the checkpoints written during training)
model.load_weights(checkpoint_path)
model1.load_weights(checkpoint_path1)

# Evaluate with testsets
# TSB-RNN: character input only
scores = model.evaluate(X_test, Y_test)
print('model')
print(f'{model.metrics_names[0]}: {scores[0]}')
print(f'{model.metrics_names[1]}: {scores[1]}')

# ETSB-RNN: character and attribute input
scores1 = model1.evaluate([X_test,X_test_attribute], Y_test)
print('model1')
print(f'{model1.metrics_names[0]}: {scores1[0]}')
print(f'{model1.metrics_names[1]}: {scores1[1]}')

model
loss: 0.0506121851503849
accuracy: 0.9918980598449707
model1
loss: 0.07073335349559784
accuracy: 0.995131254196167


In [29]:
# Plot results over epochs TSB-RNN
#plt.plot(log.history['loss'], label='Training')
#plt.plot(log.history['val_loss'], label='Testing')
#plt.legend()
#plt.grid()

In [30]:
# Plot results over epochs ETSB-RNN
#plt.plot(log1.history['loss'], label='Training')
#plt.plot(log1.history['val_loss'], label='Testing')
#plt.legend()
#plt.grid()

In [31]:
# Predict testdataset
# True class per test cell (position of the 1 in the one-hot label)
Y_test_disc = Y_test.argmax(axis=1)

# TSB-RNN: class probabilities and the most probable class
Y_pred = model.predict(X_test)
Y_pred_disc = Y_pred.argmax(axis=1)

# ETSB-RNN: class probabilities and the most probable class
Y_pred1 = model1.predict([X_test,X_test_attribute])
Y_pred_disc1 = Y_pred1.argmax(axis=1)

In [32]:
# TSB-RNN: confusion matrix on the test cells
confusion_matrix(y_true=Y_test_disc, y_pred=Y_pred_disc)

array([[21869,    91],
       [  122,  4208]])

In [33]:
# ETSB-RNN: confusion matrix on the test cells
confusion_matrix(y_true=Y_test_disc, y_pred=Y_pred_disc1)

array([[21956,     4],
       [  124,  4206]])

In [34]:
# Number of errors (1) and correct cells (0) in the test set
test.groupby(by='value')['value_x'].count()

value
0    21960
1     4330
Name: value_x, dtype: int64

In [35]:
# Measures TSB-RNN
# All quantities are computed first and printed afterwards.
Summe = test.groupby('value')['value_x'].count()   # test cells per label
loss = scores[0]
accuracy = accuracy_score(Y_test_disc, Y_pred_disc)     # (tp + tn) / (p + n)
precision = precision_score(Y_test_disc, Y_pred_disc)   # tp / (tp + fp)
recall = recall_score(Y_test_disc, Y_pred_disc)         # tp / (tp + fn)
f1 = f1_score(Y_test_disc, Y_pred_disc)                 # 2 tp / (2 tp + fp + fn)

print(f'Error Rate: {round(100/(Summe[0]+Summe[1])*Summe[1],2)}')
print(f'Loss: {loss:.4f}')
print(f'Accuracy: {accuracy*100:.2f}%')
print(f'Precision: {precision*100:.2f}%')
print(f'Recall: {recall*100:.2f}%')
print(f'F1 score: {f1*100:.2f}%')

Error Rate: 16.47
Loss: 0.0506
Accuracy: 99.19%
Precision: 97.88%
Recall: 97.18%
F1 score: 97.53%


In [36]:
# Measures ETSB-RNN
# All quantities are computed first and printed afterwards.
Summe = test.groupby('value')['value_x'].count()   # test cells per label
loss = scores1[0]
accuracy = accuracy_score(Y_test_disc, Y_pred_disc1)     # (tp + tn) / (p + n)
precision = precision_score(Y_test_disc, Y_pred_disc1)   # tp / (tp + fp)
recall = recall_score(Y_test_disc, Y_pred_disc1)         # tp / (tp + fn)
f1 = f1_score(Y_test_disc, Y_pred_disc1)                 # 2 tp / (2 tp + fp + fn)

print(f'Error Rate: {round(100/(Summe[0]+Summe[1])*Summe[1],2)}')
print(f'Loss: {loss:.4f}')
print(f'Accuracy: {accuracy*100:.2f}%')
print(f'Precision: {precision*100:.2f}%')
print(f'Recall: {recall*100:.2f}%')
print(f'F1 score: {f1*100:.2f}%')

Error Rate: 16.47
Loss: 0.0707
Accuracy: 99.51%
Precision: 99.90%
Recall: 97.14%
F1 score: 98.50%


In [37]:
# Generate dataset with results from TSB-RNN (Model 0) and ETSB-RNN (Model 1)
# *_pred      : 1 minus the predicted probability of class 0, rounded to 2 decimals
# *_pred_disc : predicted class
df1 = test.assign(
    M0_pred=np.round(1-Y_pred[:,0],2),
    M1_pred=np.round(1-Y_pred1[:,0],2),
    M0_pred_disc=Y_pred_disc,
    M1_pred_disc=Y_pred_disc1,
)

# Flags: 1 where the prediction disagrees with the label, 0 otherwise
m0_wrong = df1['M0_pred_disc'].ne(df1['value'])
m1_wrong = df1['M1_pred_disc'].ne(df1['value'])
df1['M0'] = m0_wrong.astype(int)
df1['M1'] = m1_wrong.astype(int)
# 1 where both models disagree with the label
df1['M0_M1'] = (m0_wrong & m1_wrong).astype(int)
del m0_wrong, m1_wrong