In [1]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix

# Reproducibility: every random number generator used below is initialised
# with the same seed before any other work is done.
# Seed value
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so
# assigning it here presumably only affects child processes, not this
# kernel's own string hashing -- confirm and set it in the kernel
# environment if hash determinism matters.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
# (pandas `.sample()` calls without `random_state` draw from this global state)
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)

import pandas as pd
import time

# The seed has been consumed by all generators; drop it from the notebook
# namespace so later cells cannot depend on it by accident.
del seed_value

1) Input: We load both datasets (dirty and clean) as dirty_table and clean_table.

In [2]:
def Input(data):
    """Read the dirty and the clean version of one dataset.

    Both files are expected at ``./datasets/<data>/dirty.csv`` and
    ``./datasets/<data>/clean.csv``. Every cell is read as a string and
    pandas' default NA detection is switched off, so empty cells stay
    ``''`` and markers such as ``NA`` stay literal text.

    Parameters
    ----------
    data : str
        Name of the dataset sub-directory below ``./datasets/``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dirty_table, clean_table)``.
    """
    # Identical parsing options for both files so the tables stay comparable
    read_options = dict(sep=",", header="infer", encoding="utf-8", dtype=str, keep_default_na=False, low_memory=False)
    dataset_dir = './datasets/' + data

    dirty_table, clean_table = (
        pd.read_csv(dataset_dir + file_name, **read_options)
        for file_name in ('/dirty.csv', '/clean.csv')
    )
    return dirty_table, clean_table

2) Structure Transformation: Next we rename the columns of dirty_table so that they are identical to the column names of the clean dataset. We need this to combine the information of both datasets into a new one (df). We also add id_, a sequence number for every row, as the index of both tables. Finally we record the maximum string length of every column and cap it at 128 characters (maxnumchar); longer values are truncated to this length when they are later encoded.

In [3]:
def Structure(Tablestructure_equal, dirty_table, clean_table, max_chars=128):
    """Align the two tables and collect per-column length information.

    The function works on copies: the DataFrames passed in by the caller are
    left untouched, so the cell calling it can be re-run safely.

    Parameters
    ----------
    Tablestructure_equal : bool
        If true and the column names of the two tables differ, the dirty
        table takes over the column names of the clean table (by position).
    dirty_table, clean_table : pandas.DataFrame
        Tables as returned by ``Input``.
    max_chars : int, optional
        Upper bound for the per-column sequence length (``maxnumchar``).
        Defaults to 128, the value that was previously hard-coded.

    Returns
    -------
    dirty_table, clean_table : pandas.DataFrame
        Copies indexed by ``id_`` in which whitespace-only cells are ``''``.
    attribute : pandas.DataFrame
        One row per column with ``name``, ``maxnumchar1`` (longest dirty
        value) and ``maxnumchar`` (the same, capped at ``max_chars``).
    maxlen : int
        Largest ``maxnumchar`` over all columns.
    """
    cols_clean_table = list(clean_table.columns.values)
    cols_dirty_table = list(dirty_table.columns.values)

    # Copy before renaming columns so the caller's frame is not modified.
    dirty_table = dirty_table.copy()

    if sorted(cols_clean_table) == sorted(cols_dirty_table):
        print ("The lists are identical")
    else:
        print ("The lists are not identical")
        if Tablestructure_equal:
            print ("The dirty and clean have the same structure. We use the columnames from clean for dirty.")
            dirty_table.columns = cols_clean_table

    # Use the row number as index and call it id_ (rename_axis returns a new frame)
    clean_table = clean_table.rename_axis('id_')
    dirty_table = dirty_table.rename_axis('id_')

    # Normalise whitespace-only cells (and any missing value) to the empty string
    dirty_table = dirty_table.replace(r'^\s*$', np.nan, regex=True).fillna('')
    clean_table = clean_table.replace(r'^\s*$', np.nan, regex=True).fillna('')

    # Generate table attribute with information about columns
    attribute = pd.DataFrame(clean_table.columns.to_numpy(), columns = ['name'])
    measurer = np.vectorize(len)
    attribute['maxnumchar1'] = measurer(dirty_table.astype(str).to_numpy()).max(axis=0)
    attribute['maxnumchar'] = attribute['maxnumchar1'].clip(upper=max_chars)

    maxlen = np.max(attribute['maxnumchar'])
    print("Maximum value_x length: ", maxlen)

    return dirty_table, clean_table, attribute, maxlen

3) Merge: Next we combine the two tables in the dataset df where every cell of the dirty_table / clean_table is saved in the columns value_x / value_y, respectively. For the models we need an attribute value, i.e. a label, which includes 0 (correct) or 1 (wrong). We get this value when comparing value_x and value_y.

In [4]:
def _table_to_rows(table):
    """Turn a table into one row per cell, ordered by tuple and then by column."""
    rows = table.unstack().reset_index()
    # unstack() is column-major; remember that order to keep the column order per tuple
    rows['Sort'] = rows.index
    rows = rows.rename(columns={'level_0':'attribute','level_1':'id_',0:'value'}).sort_values(by=['id_','Sort'])
    return rows.reset_index(drop=True).drop(columns='Sort')


def Merge(dirty_table, clean_table):
    """Combine the dirty and the clean table into one cell-level dataset.

    Parameters
    ----------
    dirty_table, clean_table : pandas.DataFrame
        Identically labelled tables as returned by ``Structure``.

    Returns
    -------
    df : pandas.DataFrame
        One row per cell with the columns ``attribute``, ``id_``,
        ``value_x`` (dirty), ``value_y`` (clean), ``empty1`` (1 if the dirty
        value is an empty marker), ``value`` (label: 1 if dirty and clean
        differ), ``concat`` (attribute name + '_' + dirty value) and
        ``length`` (number of characters of the dirty value).
    X_roh : pandas.DataFrame
        The dirty table itself (raw input for the tuple-level model).
    y : pandas.DataFrame
        Table of 0/1 labels with the same shape as the input tables.
    """
    # Produce datasets which transformed the table in rows
    clean_row = _table_to_rows(clean_table)
    dirty_row = _table_to_rows(dirty_table)

    # Produce datasets for the tuple-level model
    X_roh = dirty_table
    y = (clean_table != dirty_table).astype(int)

    # Merge datasets together
    df = pd.merge(dirty_row, clean_row, on=['id_', "attribute"])

    # Show rows which are empty (1)
    df['empty1'] = df['value_x'].str.lower().isin(['', 'nan','n/a','n/n']).astype(int)

    # Compare content of dirty and clean dataset
    df['value'] = np.where(df['value_x'] == df['value_y'], 0, 1)

    # Concatenate attributename and value_x (dirty)
    df['concat'] = df['attribute'] + '_' + df['value_x']

    df['length'] = df['value_x'].str.len()

    Summe = df.groupby('value')['id_'].count()
    print(Summe)
    print()
    # Computed from counts instead of Summe[0]/Summe[1]: a dataset that
    # contains only correct (or only wrong) cells has a single group and
    # indexing the missing one raised a KeyError.
    n_cells = len(df)
    n_errors = int(df['value'].sum())
    error_rate = n_errors / n_cells if n_cells else 0.0
    print('Error Rate:'+ str(round(error_rate,2)))

    return df, X_roh, y

4) Dictionary Generation: Before we can feed the data into a neural network, we need to transform the data types from character to numeric character embedding. We produce a value dictionary (char_index) which contains an index for each character in value_x.

For the ETSB-RNN we also need an attribute dictionary (attribute_index) which includes an index for each attribute.

In [5]:
def Dictionary(attribute,df):
    """Fit the character and the attribute vocabulary on the merged dataset.

    Parameters
    ----------
    attribute : pandas.DataFrame
        Not used by this function; kept for the existing call signature.
    df : pandas.DataFrame
        Cell-level dataset with the columns ``value_x`` and ``attribute``.

    Returns
    -------
    tuple
        ``(tk_char, tk_attr)``: a case-sensitive character-level tokenizer
        fitted on ``df.value_x`` and a word-level tokenizer fitted on
        ``df.attribute``.
    """
    keras_text = tf.keras.preprocessing.text

    # Character vocabulary of the dirty values
    char_tokenizer = keras_text.Tokenizer(num_words=False, lower=False, char_level=True)
    char_tokenizer.fit_on_texts(df.value_x)
    char_vocabulary = char_tokenizer.word_index
    print("Number of characters: " + str(len(char_vocabulary)))
    print(char_vocabulary)

    # Attribute vocabulary; no filtering, and "nosplit" is used as separator,
    # presumably so that a column name is never split into several tokens
    attr_tokenizer = keras_text.Tokenizer(num_words=False, filters='', lower=False, char_level=False, split="nosplit")
    attr_tokenizer.fit_on_texts(df.attribute)
    attr_vocabulary = attr_tokenizer.word_index
    print("Number of attributs: " + str(len(attr_vocabulary)))
    print(attr_vocabulary)

    return char_tokenizer, attr_tokenizer

In [6]:
def attribute_extend(attribute,df,X_roh,y,char_tokenizer=None,drop_above=10000):
    """Add per-column statistics and build the tuple-level model input.

    Side effects (relied upon by later cells): ``attribute`` gains the
    columns ``error`` and ``numuniquechar``; ``df`` gains the column
    ``length_norm`` (cell length divided by the longest value of its column).

    Parameters
    ----------
    attribute : pandas.DataFrame
        Column description from ``Structure`` (``name``, ``maxnumchar1``,
        ``maxnumchar``). Modified in place.
    df : pandas.DataFrame
        Cell-level dataset from ``Merge``. Modified in place.
    X_roh : pandas.DataFrame
        Dirty table, one column per attribute.
    y : pandas.DataFrame
        0/1 label table with the same columns as ``X_roh``.
    char_tokenizer : optional
        Fitted character tokenizer. When omitted, the notebook-level
        ``tk_char`` is used, as before.
    drop_above : int, optional
        Columns whose longest value exceeds this many characters are left
        out of ``X`` and ``y`` (default 10000, the previous constant).

    Returns
    -------
    attribute : pandas.DataFrame
    X : pandas.DataFrame
        ``id_`` followed by the padded character indices of all kept
        columns, concatenated side by side.
    y : pandas.DataFrame
        ``id_`` followed by the labels of all kept columns.
    Drop_list : list
        Names of the columns that were left out.
    """
    if char_tokenizer is None:
        # Backward compatible fallback to the tokenizer created by Dictionary()
        char_tokenizer = tk_char

    num_error_col=0
    Drop_list = []
    encoded_columns = []
    for attr in attribute['name']:
        is_attr = attribute['name'] == attr
        maxnumchar = attribute.loc[is_attr, 'maxnumchar'].to_numpy()[0]
        maxnumchar1 = attribute.loc[is_attr, 'maxnumchar1'].to_numpy()[0]
        summe = np.sum(df.loc[df['attribute'] == attr, 'value'])
        attribute.loc[is_attr, 'error'] = int(summe)

        if maxnumchar1 > drop_above:
            Drop_list.append(attr)
        else:
            # Collect the blocks and concatenate once after the loop
            encoded_columns.append(
                tf.keras.preprocessing.sequence.pad_sequences(
                    char_tokenizer.texts_to_sequences(X_roh[attr].astype(str)),
                    maxlen=maxnumchar, padding='post'))

        tk_perCol = tf.keras.preprocessing.text.Tokenizer(num_words=False, lower=False, char_level=True)
        tk_perCol.fit_on_texts(X_roh[attr])
        attribute.loc[is_attr, 'numuniquechar'] = len(tk_perCol.word_index)
        # A column that is empty everywhere has maxnumchar1 == 0; dividing by
        # at least 1 keeps length_norm at 0 instead of producing NaN (0/0).
        df.loc[df['attribute'] == attr, 'length_norm'] = df['length']/max(maxnumchar1, 1)

        print(attr)
        print('Max lenght: ' + str(maxnumchar1) + ' --> ' + str(maxnumchar))
        print('Unique characters: ' + str(len(tk_perCol.word_index)))
        print('Number of errors: ' + str(summe))
        print('')
        if summe > 0:
            num_error_col+=1

    print(str(num_error_col) + '/' + str(len(attribute)) + ' faulty attributes')

    if encoded_columns:
        X = np.concatenate(encoded_columns, axis=1)
    else:
        # Every column was dropped: return an input without feature columns
        # instead of failing on an unbound variable.
        X = np.empty((len(X_roh), 0), dtype='int32')

    # Take id_ from the table index so that X and y always carry the same keys
    X = pd.DataFrame(X, index=X_roh.index).rename_axis('id_').reset_index()
    y = y.drop(columns=Drop_list).reset_index(level=0)

    return attribute, X, y, Drop_list

In [7]:
def _stacked_birnn(tensor, units):
    """Encode a sequence with two stacked bidirectional SimpleRNN layers.

    The first layer returns the full sequence, the second one only its final
    state.
    """
    tensor = tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=units, return_sequences=True))(tensor)
    return tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=units, return_sequences=False))(tensor)


def chose_Model(Mod):
    """Build and compile one of the error-detection networks.

    The result is published through the notebook globals ``model``,
    ``checkpoint`` and ``checkpoint_path``. The function reads the notebook
    globals ``data``, ``ver``, ``maxlen``, ``emb_dim_char``, ``emb_dim_attr``,
    ``rnn_dim``, ``rnn_dim_att``, ``n_classes``, ``opt`` and, for 'M2'/'M3',
    ``X`` and ``attribute``.

    Parameters
    ----------
    Mod : str
        'M0'    -- characters of the cell value only (TSB-RNN).
        'M1alt' -- cell value + attribute index (earlier ETSB-RNN variant).
        'M1'    -- cell value + attribute index + normalised length (ETSB-RNN).
        'M2'    -- one flat input holding the whole tuple, the cell value and
                   the attribute index (MTSB-RNN).
        'M3'    -- whole tuple in, one sigmoid output per attribute.

    Raises
    ------
    ValueError
        If ``Mod`` is not one of the names above. Previously an unknown name
        compiled whatever ``model`` was left over from an earlier call.
    """
    global model
    global checkpoint_path
    global checkpoint

    known_models = ('M0', 'M1alt', 'M1', 'M2', 'M3')
    if Mod not in known_models:
        raise ValueError('Unknown model ' + repr(Mod) + '; expected one of ' + ', '.join(known_models))

    tf.keras.backend.clear_session()

    checkpoint_path = 'checkpoint/' + data + '/p12/checkpoint_p12_' + Mod
    # Keep only the weights of the epoch with the lowest training loss
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        monitor='loss',
        save_best_only=True,
        save_weights_only=True,
        verbose=ver
    )

    if Mod=='M0':
        # Define TSB-RNN
        inputA = tf.keras.Input(shape=(maxlen,))

        a = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(inputA)

        x = _stacked_birnn(a, rnn_dim)
        x = tf.keras.layers.Dense(round(rnn_dim/2), activation="relu")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        z = tf.keras.layers.Dense(n_classes, activation='softmax')(x)

        model = tf.keras.models.Model(inputs=inputA, outputs=z)

    elif Mod=='M1alt':
        # Define ETSB-RNN
        inputA = tf.keras.Input(shape=(maxlen,))
        inputB = tf.keras.Input(shape=(1,))

        a = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(inputA)
        b = tf.keras.layers.Embedding(emb_dim_attr,emb_dim_attr)(inputB)

        # Each branch is wrapped in its own sub-model, as before
        x = tf.keras.models.Model(inputs=inputA, outputs=_stacked_birnn(a, rnn_dim))
        y = tf.keras.models.Model(inputs=inputB, outputs=_stacked_birnn(b, rnn_dim_att))

        combined = tf.keras.layers.concatenate([x.output, y.output])
        combined = tf.keras.layers.Dense(round(rnn_dim/2), activation="relu")(combined)
        combined = tf.keras.layers.BatchNormalization()(combined)
        z = tf.keras.layers.Dense(n_classes, activation='softmax')(combined)

        model = tf.keras.models.Model(inputs=[x.input, y.input], outputs=z)

    elif Mod=='M1':
        # Define ETSB-RNN new
        inputA = tf.keras.Input(shape=(maxlen,))
        a = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(inputA)
        a = _stacked_birnn(a, rnn_dim)

        inputB = tf.keras.Input(shape=(1,))
        b = tf.keras.layers.Embedding(emb_dim_attr,emb_dim_attr)(inputB)
        b = _stacked_birnn(b, rnn_dim_att)

        # Normalised value length as an additional scalar feature
        inputC = tf.keras.Input(shape=(1,))
        c = tf.keras.layers.Dense(round(rnn_dim), activation="relu")(inputC)
        c = tf.keras.layers.Dense(round(rnn_dim), activation="relu")(c)

        combined = tf.keras.layers.concatenate([a, b, c])
        combined = tf.keras.layers.Dense(round(rnn_dim/2), activation="relu")(combined)
        combined = tf.keras.layers.BatchNormalization()(combined)

        z = tf.keras.layers.Dense(n_classes, activation='softmax')(combined)

        model = tf.keras.models.Model(inputs=[inputA, inputB, inputC], outputs=z)

    elif Mod=='M2':
        # Define MTSB-RNN
        # The flat input is [attribute columns | cell value | attribute index].
        # X.shape[1] is the width of all attribute columns plus one (id_); that
        # extra position is the slot of the attribute index.
        # crop1 / crop2 are the number of positions cut off on the left / right.
        crop1=0
        width_1=X.shape[1]+maxlen
        crop2=width_1

        inputs = tf.keras.Input(shape=(width_1,))
        inputsIn = tf.keras.layers.Reshape((width_1,1))(inputs)
        all_inputs = []

        for _, row in attribute.iterrows():
            maxnumchar=row['maxnumchar']
            crop2-=maxnumchar

            In = tf.keras.layers.Cropping1D(cropping=(crop1,crop2))(inputsIn)
            In = tf.keras.layers.Reshape((maxnumchar,), input_shape=(maxnumchar,1))(In)
            In = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(In)
            In = _stacked_birnn(In, round((rnn_dim)))
            all_inputs.append(In)

            crop1+=maxnumchar

        crop2-=maxlen
        In_value = tf.keras.layers.Cropping1D(cropping=(crop1,crop2))(inputsIn)
        In_value = tf.keras.layers.Reshape((maxlen,), input_shape=(maxlen,1))(In_value)
        In_value = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(In_value)
        In_value = _stacked_birnn(In_value, round((rnn_dim)))
        all_inputs.append(In_value)
        crop1+=maxlen

        crop2-=1
        In_attr = tf.keras.layers.Cropping1D(cropping=(crop1,crop2))(inputsIn)
        In_attr = tf.keras.layers.Reshape((1,), input_shape=(1,1))(In_attr)
        In_attr = tf.keras.layers.Embedding(emb_dim_attr,emb_dim_attr)(In_attr)
        In_attr = _stacked_birnn(In_attr, round((rnn_dim_att)))
        all_inputs.append(In_attr)

        combined = tf.keras.layers.concatenate(all_inputs)
        combined = tf.keras.layers.Dense(round(rnn_dim), activation="relu")(combined)
        combined = tf.keras.layers.BatchNormalization()(combined)

        z = tf.keras.layers.Dense(n_classes, activation='softmax')(combined)

        model = tf.keras.models.Model(inputs=inputs, outputs=z)

    elif Mod=='M3':
        # Tuple-level model: the input is X without its id_ column
        crop1=0
        width_1=X.shape[1]-1
        crop2=width_1

        inputs = tf.keras.Input(shape=(width_1,))
        inputsIn = tf.keras.layers.Reshape((width_1,1))(inputs)
        all_inputs1 = []
        all_inputs2 = []
        all_outputs = []

        for _, row in attribute.iterrows():
            numuniquechar=int(row['numuniquechar']+1)
            maxnumchar=row['maxnumchar']
            crop2-=maxnumchar

            In = tf.keras.layers.Cropping1D(cropping=(crop1,crop2))(inputsIn)
            In = tf.keras.layers.Reshape((maxnumchar,), input_shape=(maxnumchar,1))(In)
            In = tf.keras.layers.Embedding(emb_dim_char,emb_dim_char,mask_zero=True)(In)

            # Branch 1: sized by the number of distinct characters of the column
            In1 = _stacked_birnn(In, round(numuniquechar/2))
            In1 = tf.keras.layers.Dense(numuniquechar, activation="relu")(In1)
            In1 = tf.keras.layers.Dropout(0.1)(In1)

            # Branch 2: sized by the global character vocabulary
            In2 = _stacked_birnn(In, round((emb_dim_char)/2))
            In2 = tf.keras.layers.Dense(emb_dim_char, activation="relu")(In2)
            In2 = tf.keras.layers.Dropout(0.1)(In2)

            all_inputs1.append(In1)
            all_inputs2.append(In2)

            crop1+=maxnumchar

        combined1 = tf.keras.layers.concatenate(all_inputs1)
        combined1 = tf.keras.layers.Dense(round(combined1.shape[1]/2), activation="relu")(combined1)
        combined1 = tf.keras.layers.Dropout(0.1)(combined1)

        combined2 = tf.keras.layers.concatenate(all_inputs2)
        combined2 = tf.keras.layers.Dense(emb_dim_char, activation="relu")(combined2)
        combined2 = tf.keras.layers.Dropout(0.1)(combined2)

        combined = tf.keras.layers.concatenate([combined1, combined2])

        # One independent sigmoid unit per attribute
        for _ in range(len(attribute)):
            all_outputs.append(tf.keras.layers.Dense(1, activation='sigmoid')(combined))

        combinedOut = tf.keras.layers.concatenate(all_outputs)

        model = tf.keras.Model(inputs=inputs,outputs=combinedOut)

    # NOTE(review): the same notebook-level `opt` instance is compiled into
    # every model; some Keras versions reject an optimizer that was already
    # used with another model -- confirm for the installed version.
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [8]:
def raha(num_sample):
    """Let Raha choose which tuples should be labelled.

    Runs Raha's strategies, feature generation and clustering on the dataset
    named by the notebook global ``data`` and then samples tuples until
    ``num_sample`` of them are labelled from the clean file.

    Parameters
    ----------
    num_sample : int
        Labelling budget, i.e. the number of tuples to label.

    Returns
    -------
    list
        The sampled tuple of every sampling step, in sampling order.

    Raises
    ------
    RuntimeError
        If the dataset has no ground truth. The loop below only makes
        progress when a sampled tuple gets labelled, so it would otherwise
        never reach the budget.
    """
    # Imported here so the notebook only needs Raha when this sampler is used.
    # Inside this function the name `raha` refers to the package.
    import raha

    app_1 = raha.Detection()
    app_1.LABELING_BUDGET = num_sample

    dataset_dictionary = {
        "name": data,
        "path": './datasets/' + data + '/dirty.csv',
        "clean_path": './datasets/' + data + '/clean.csv'
    }

    d = app_1.initialize_dataset(dataset_dictionary)

    app_1.run_strategies(d)

    app_1.generate_features(d)

    app_1.build_clusters(d)

    sampled_list = []

    while len(d.labeled_tuples) < app_1.LABELING_BUDGET:
        app_1.sample_tuple(d)
        if not d.has_ground_truth:
            raise RuntimeError('Dataset ' + str(data) + ' has no ground truth; sampled tuples cannot be labelled.')
        app_1.label_with_ground_truth(d)

        sampled_list.append(d.sampled_tuple)

    return sampled_list

In [9]:
def _labeled_split(train_ID_Manuel, train_Rest, train_ID):
    """Split the data for a given set of labelled tuple ids.

    Returns the cells of the labelled tuples, the candidate cells whose
    attribute/value combination is not covered by them, and the ids of all
    tuples that are not labelled. Reads the notebook globals ``df`` and
    ``Drop_list``.
    """
    train_Manuel = df[df['id_'].isin(train_ID_Manuel)]
    train_Manuel = train_Manuel[~train_Manuel['attribute'].isin(Drop_list)]
    train_Rest = train_Rest[~train_Rest['concat'].isin(train_Manuel['concat'])]
    train_ID_Rest = train_ID[~train_ID.isin(train_ID_Manuel)]
    return train_Manuel, train_Rest, train_ID_Rest


def _greedy_sample(n, seed, train_Rest, train_ID, fewest_cells, use_length):
    """Pick ``n`` tuples one after another ('DiverSet' and 'sev1').

    In every step the tuples are ranked by their number of not yet covered
    cells (most first, or fewest first if ``fewest_cells``) and then by their
    number of empty cells (most first); one of the best tuples is drawn with
    ``seed``. ``use_length`` adds the summed ``length_norm`` as a third sort
    key, which influences the order the draw is made from.
    """
    picked = []
    train_ID_Manuel = pd.Series(picked, dtype='int64')
    train_Manuel, train_Rest, train_ID_Rest = _labeled_split(train_ID_Manuel, train_Rest, train_ID)

    for _ in range(n):
        per_tuple = train_Rest.groupby(['id_'])
        # For prefering empty value_x we have to compute the number of this
        empty = per_tuple['empty1'].agg('sum')
        count = per_tuple.size().to_frame()
        count['empty1'] = empty
        sort_keys = [0, 'empty1']
        if use_length:
            count['length1'] = per_tuple['length_norm'].agg('sum')
            sort_keys.append('length1')
        count = count.sort_values(by=sort_keys, ascending=False)
        count.reset_index(inplace=True)

        wanted_cells = count[0].min() if fewest_cells else count[0].max()
        count = count[count[0] == wanted_cells]
        count = count[count['empty1'] == count['empty1'].max()]

        # .iloc[0] instead of int(Series), which newer pandas versions reject
        picked.append(int(count.sample(1, random_state=seed)['id_'].iloc[0]))
        train_ID_Manuel = pd.Series(picked)
        train_Manuel, train_Rest, train_ID_Rest = _labeled_split(train_ID_Manuel, train_Rest, train_ID)

    return train_ID_Manuel, train_Manuel, train_ID_Rest


def _encode_cells(cells):
    """Encode cell rows for the cell-level models.

    Returns the padded character indices, the attribute indices, the
    normalised lengths and the one-hot labels. Reads the notebook globals
    ``tk_char``, ``tk_attr`` and ``maxlen``.
    """
    values = tf.keras.preprocessing.sequence.pad_sequences(
        tk_char.texts_to_sequences(cells['value_x']), maxlen=maxlen, padding='post')
    attr_frame = pd.DataFrame(tk_attr.texts_to_sequences(cells['attribute']))
    lengths = cells['length_norm'].to_numpy()
    labels = tf.keras.utils.to_categorical(cells['value'], num_classes=2)
    return values, attr_frame, lengths, labels


def _tuple_context_input(cells, values, attr_frame):
    """Build the flat 'M2' input: whole tuple, cell value, attribute index.

    The inner merge keeps the order of the keys of ``X``; this matches the
    row order of ``cells`` as long as both are ordered by ``id_``.
    """
    value_frame = pd.DataFrame(values)
    value_frame['id_'] = np.array(cells['id_'])
    merged = pd.merge(X, value_frame, on=['id_'])
    merged['attr'] = attr_frame[0]
    return np.array(merged.drop(columns='id_'))


def Test(Iteration_n,end_n,mod,sample_technique,val):
    """Train and evaluate one model for growing numbers of labelled tuples.

    For every ``n`` in ``range(20, end_n+1, 10)`` the experiment is repeated
    ``Iteration_n`` times: ``n`` tuples are selected as training data, the
    model is built with ``chose_Model``, trained, reloaded from its best
    checkpoint and evaluated on all remaining tuples. Scores and run times
    are printed and written below
    ``./datasets/<data>/<mod>_<sample_technique>_results``.

    Reads the notebook globals ``df``, ``X``, ``y``, ``Drop_list``,
    ``tk_char``, ``tk_attr``, ``maxlen``, ``data``, ``batch_size``,
    ``batch_size_3``, ``n_epochs``, ``ver``, ``t1_time`` and, after
    ``chose_Model``, ``model``, ``checkpoint`` and ``checkpoint_path``.

    Parameters
    ----------
    Iteration_n : int
        Number of repetitions per training-set size.
    end_n : int
        Largest number of labelled tuples.
    mod : str
        'M0', 'M1', 'M2' or 'M3' (see ``chose_Model``).
    sample_technique : str
        'RahaSet', 'RandomSet', 'DiverSet' or 'sev1'.
    val : bool
        If True, the test set is passed as validation data and the learning
        curves are saved instead of the score tables ('M0' and 'M1' only).

    Raises
    ------
    ValueError
        For an unknown ``mod`` or ``sample_technique``, or if ``val`` is
        requested for a model without validation logging. These cases used
        to fail with a NameError in the middle of a run.
    """
    if mod not in ('M0', 'M1', 'M2', 'M3'):
        raise ValueError('Unknown model ' + repr(mod))
    if sample_technique not in ('RahaSet', 'RandomSet', 'DiverSet', 'sev1'):
        raise ValueError('Unknown sample technique ' + repr(sample_technique))
    log_curves = val == True
    if log_curves and mod not in ('M0', 'M1'):
        raise ValueError('Learning curves (val=True) are only available for M0 and M1')

    # Create the output directory first: the learning curves are written
    # during the very first iteration.
    results_dir = './datasets/' + data + '/' + mod + '_' + sample_technique + '_results'
    os.makedirs(results_dir, exist_ok=True)

    ID_Alle = df.groupby(['id_'], as_index=False)['value'].sum()
    result_tables = {}
    for n in range(20,end_n+1,10):
        acc = []
        pre = []
        rec = []
        F1 = []

        train_time = []
        test_time = []
        t_time = []

        curves = {}

        for Iteration in range(1,Iteration_n+1):
            # Start the stopwatch / counter
            t2_start = time.process_time()

            print('Test: ' + str(Iteration) + '/' + str(Iteration_n))
            train_ID = ID_Alle['id_']
            train_Rest = df[df['id_'].isin(train_ID)]
            train_Rest = train_Rest[~train_Rest['attribute'].isin(Drop_list)]

            if sample_technique == 'RahaSet':
                train_ID_Manuel = pd.Series(raha(n))
                train_Manuel, _, train_ID_Rest = _labeled_split(train_ID_Manuel, train_Rest, train_ID)
            elif sample_technique == 'RandomSet':
                train_ID_Manuel = pd.Series(ID_Alle.sample(n)['id_'])
                train_Manuel, _, train_ID_Rest = _labeled_split(train_ID_Manuel, train_Rest, train_ID)
            elif sample_technique == 'DiverSet':
                train_ID_Manuel, train_Manuel, train_ID_Rest = _greedy_sample(
                    n, Iteration, train_Rest, train_ID, fewest_cells=False, use_length=False)
            else:
                train_ID_Manuel, train_Manuel, train_ID_Rest = _greedy_sample(
                    n, Iteration, train_Rest, train_ID, fewest_cells=True, use_length=True)

            print('Number of train-tupels: ' + str(len(train_ID_Manuel)) + ' Sample technique: ' + str(sample_technique))

            # The records which we dont need for training we use for the testing
            test_ID = train_ID_Rest.copy()
            test = df[df['id_'].isin(test_ID)]
            test = test[~test['attribute'].isin(Drop_list)]

            # Transform the text to numbers
            X_train_1, train_attr_frame, X_train_length_1, Y_train = _encode_cells(train_Manuel)
            X_train_attribute_1 = np.array(train_attr_frame)
            print(np.ndarray.sum(Y_train,axis=0))

            X_test_1, test_attr_frame, X_test_length_1, Y_test = _encode_cells(test)
            X_test_attribute_1 = np.array(test_attr_frame)

            # Select the inputs of the requested model; the wide M2 / M3
            # matrices are only built when they are needed.
            fit_batch = batch_size
            if mod == 'M0':
                train_x, train_y = X_train_1, Y_train
                test_x, test_y = X_test_1, Y_test
            elif mod == 'M1':
                train_x, train_y = [X_train_1,X_train_attribute_1,X_train_length_1], Y_train
                test_x, test_y = [X_test_1,X_test_attribute_1,X_test_length_1], Y_test
            elif mod == 'M2':
                train_x, train_y = _tuple_context_input(train_Manuel, X_train_1, train_attr_frame), Y_train
                test_x, test_y = _tuple_context_input(test, X_test_1, test_attr_frame), Y_test
            else:
                train_x = np.array(X[X['id_'].isin(train_ID_Manuel)].drop(columns='id_'))
                train_y = np.array(y[y['id_'].isin(train_ID_Manuel)].drop(columns='id_'))
                test_x = np.array(X[X['id_'].isin(test_ID)].drop(columns='id_'))
                test_y = np.array(y[y['id_'].isin(test_ID)].drop(columns='id_'))
                fit_batch = batch_size_3

            chose_Model(mod)

            fit_options = dict(shuffle=True, batch_size=fit_batch, epochs=n_epochs, callbacks=[checkpoint], verbose=ver)
            if log_curves:
                fit_options['validation_data'] = (test_x, test_y)
            log = model.fit(x=train_x, y=train_y, **fit_options)
            time.sleep(3)

            # Stop the stopwatch / counter
            t2_stop = time.process_time()
            t2_time = t2_stop-t2_start

            # Start the stopwatch / counter
            t3_start = time.process_time()

            # Load best weights
            model.load_weights(checkpoint_path)

            # Evaluate with testsets
            scores = model.evaluate(test_x, test_y, verbose=ver)
            Y_pred = model.predict(test_x)
            if mod == 'M3':
                # One sigmoid per attribute: threshold and flatten to one label per cell
                Y_pred_disc = np.round(Y_pred)
                Y_pred_disc = Y_pred_disc.reshape((Y_pred_disc.shape[1]*Y_pred_disc.shape[0],1))
                Y_test_disc = test_y.reshape((test_y.shape[1]*test_y.shape[0],1))
                print(Y_test_disc.shape,Y_pred_disc.shape)
            else:
                Y_pred_disc = np.argmax(Y_pred, axis=1)
                Y_test_disc = np.argmax(test_y, axis=1)

            # Stop the stopwatch / counter
            t3_stop = time.process_time()
            t3_time = t3_stop-t3_start

            print('-----------------------------------------------------------------------------')
            loss = scores[0]
            print('Loss: {:.2f}'.format(loss))
            # accuracy: (tp + tn) / (p + n)
            accuracy = accuracy_score(Y_test_disc, Y_pred_disc)
            print('Accuracy: {:.2f}'.format(accuracy))
            # precision tp / (tp + fp)
            precision = precision_score(Y_test_disc, Y_pred_disc)
            print('Precision: {:.2f}'.format(precision))
            # recall: tp / (tp + fn)
            recall = recall_score(Y_test_disc, Y_pred_disc)
            print('Recall: {:.2f}'.format(recall))
            # f1: 2 tp / (2 tp + fp + fn)
            f1 = f1_score(Y_test_disc, Y_pred_disc)
            print('F1 score: {:.2f}'.format(f1))
            print()
            # t1_time is the data preparation time measured by the preparation cell
            print('Traintime in sec: ',round(t1_time+t2_time,0))
            print('Testtime in sec: ',round(t3_time,0))
            print('Totaltime in sec: ',round(t1_time+t2_time+t3_time,0))

            acc.append(round(accuracy,4))
            pre.append(round(precision,4))
            rec.append(round(recall,4))
            F1.append(round(f1,4))

            train_time.append(int(t1_time+t2_time))
            test_time.append(int(t3_time))
            t_time.append(int(t1_time+t2_time+t3_time))

            if log_curves:
                history = {
                    'test_loss': log.history['val_loss'],
                    'train_loss': log.history['loss'],
                    'test_accuracy': log.history['val_accuracy'],
                    'train_accuracy': log.history['accuracy'],
                }
                # One column per iteration; the files are rewritten after every iteration
                for key, values in history.items():
                    if Iteration == 1:
                        curves[key] = values
                    else:
                        curves[key] = np.column_stack([curves[key], values])
                    np.savetxt(results_dir + '/p12_' + key + str(n) + '.csv', curves[key], delimiter=',')

        # One column per training-set size n, one row per iteration
        measures = {
            'acc': acc, 'pre': pre, 'rec': rec, 'F1': F1,
            'train_time': train_time, 'test_time': test_time, 't_time': t_time,
        }
        for key, values in measures.items():
            if key not in result_tables:
                result_tables[key] = pd.DataFrame(values,columns=[n])
            else:
                result_tables[key][n] = values

        if log_curves:
            print('No measures saved!')
        else:
            for key, table in result_tables.items():
                table.to_csv(results_dir + '/p12_' + key + '.csv', index=False)

        print('-----------------------------------------------------------------------------')
        print('Average scores for ' + mod)
        print(f'> Accuracy: {round(np.mean(acc),2)} (+- {round(np.std(acc),2)})')
        print(f'> Precison: {round(np.mean(pre),2)} (+- {round(np.std(pre),2)})')
        print(f'> Recall: {round(np.mean(rec),2)} (+- {round(np.std(rec),2)})')
        print(f'> F1: {round(np.mean(F1),2)} (+- {round(np.std(F1),2)})')
        print(f'> Traintime in sec: {round(np.mean(train_time),0)} (+- {round(np.std(train_time),0)})')
        print(f'> Testtime in sec: {round(np.mean(test_time),0)} (+- {round(np.std(test_time),0)})')
        print(f'> Totaltime in sec: {round(np.mean(t_time),0)} (+- {round(np.std(t_time),0)})')
        print('-----------------------------------------------------------------------------')

In [10]:
# Data preparation: load the 'beers' dataset and run it through the
# preprocessing helpers defined earlier in this notebook. Every name bound
# here is a module-level global that later cells rely on.

# Start the stopwatch. time.process_time() counts CPU time (user + system)
# of this process and excludes time spent sleeping, so the value printed at
# the end is CPU seconds, not wall-clock seconds.
t1_start = time.process_time() 

# Dataset folder name; Input() reads ./datasets/<data>/dirty.csv and clean.csv.
data = 'beers'
dirty_table, clean_table = Input(data)
# First argument is Structure's Tablestructure_equal flag.
dirty_table, clean_table, attribute, maxlen = Structure(True,dirty_table,clean_table)
# Merge is defined outside this view; presumably X_roh holds the raw
# (not yet extended) inputs and y the labels -- TODO confirm.
df, X_roh, y = Merge(dirty_table, clean_table)
# Dictionary is defined outside this view; tk_char / tk_attr are later read
# through their .word_index attribute.
tk_char, tk_attr = Dictionary(attribute,df)

# Extend the attributes. Note that `attribute` and `y` are rebound here, so
# after this line they hold the values returned by attribute_extend.
attribute, X, y, Drop_list = attribute_extend(attribute,df,X_roh,y)

# Stop the stopwatch and compute the elapsed CPU time.
t1_stop = time.process_time()
t1_time = t1_stop-t1_start

print('Time in sec: ',t1_time)

The lists are not identical
The dirty and clean have the same structure. We use the columnames from clean for dirty.
Maximum value_x length:  52
value
0    22148
1     4362
Name: id_, dtype: int64

Error Rate:0.16
Number of characters: 86
{' ': 1, 'e': 2, 'n': 3, 'r': 4, 'a': 5, '0': 6, 'o': 7, 'i': 8, '1': 9, 'A': 10, 'l': 11, '.': 12, '2': 13, 't': 14, 'm': 15, 'B': 16, '5': 17, '6': 18, 's': 19, 'c': 20, 'g': 21, 'C': 22, '9': 23, 'y': 24, '4': 25, 'u': 26, '3': 27, 'w': 28, 'P': 29, 'p': 30, 'd': 31, '7': 32, '8': 33, 'h': 34, 'z': 35, 'I': 36, 'S': 37, 'N': 38, '/': 39, 'k': 40, 'M': 41, 'b': 42, 'O': 43, 'R': 44, 'W': 45, 'L': 46, 'T': 47, 'H': 48, 'D': 49, 'v': 50, '%': 51, 'F': 52, 'G': 53, 'f': 54, '(': 55, ')': 56, 'E': 57, 'V': 58, 'K': 59, "'": 60, 'Z': 61, 'x': 62, '-': 63, 'U': 64, 'Y': 65, 'X': 66, 'J': 67, 'ö': 68, '&': 69, 'q': 70, 'j': 71, 'ä': 72, 'Q': 73, '’': 74, '#': 75, '!': 76, 'è': 77, ':': 78, ',': 79, '°': 80, 'é': 81, '™': 82, 'í': 83, 'ü': 84, '‘': 85, '?':

In [11]:
# Parameters for the models. All names below are module-level globals; the
# code that consumes them is outside this view.
n_classes = 2
# presumably the Keras verbosity level (0 = silent) -- TODO confirm where it is used
ver=0

# Hyperparameters
n_epochs = 120
# Earlier batch-size formula, kept for reference (depends on a variable `n`):
#batch_size=round((attribute.shape[0]-len(Drop_list))*n/4)
# Five times (number of rows in `attribute` minus the number of entries in Drop_list).
batch_size=round((attribute.shape[0]-len(Drop_list))*5)
# Equal to the number of rows in X.
batch_size_3=round(X.shape[0])

# Earlier optimizer setting with a higher learning rate, kept for reference:
#opt = tf.keras.optimizers.RMSprop(learning_rate=0.005, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False)
# NOTE(review): this is one optimizer instance created at module level. If the
# model-building code reuses it across repeated runs, optimizer state would be
# shared between independently trained models -- confirm against the callers.
opt = tf.keras.optimizers.RMSprop(learning_rate=0.002, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False)

# Number of entries in each tokenizer's word_index plus one. The printed
# character dictionary starts at index 1, so the +1 leaves room for index 0.
# NOTE(review): despite the `emb_dim_` prefix these values are vocabulary
# sizes; check how the embedding layers actually use them.
emb_dim_char = round(len(tk_char.word_index)+1)
emb_dim_attr = round(len(tk_attr.word_index)+1)
# presumably the unit counts of the recurrent layers -- TODO confirm
rnn_dim = 64
rnn_dim_att = 8

In [12]:
# Evaluate model 'M0' with the 'DiverSet' sampling technique.
# Judging by the recorded output, the first two arguments are the number of
# repetitions (10) and the number of training tuples (20), and the final True
# selects the branch that prints 'No measures saved!' instead of writing the
# result CSVs. Test's signature is not visible here -- confirm the parameter
# meanings there. The recorded run took roughly 770 s per repetition.
Test(10,20,'M0','DiverSet',True)

Test: 1/10
Number of train-tupels: 20 Sample technique: DiverSet


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


[188.  32.]


2022-05-06 10:26:00.406900: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


-----------------------------------------------------------------------------
Loss: 0.05
Accuracy: 0.99
Precision: 1.00
Recall: 0.93
F1 score: 0.97

Traintime in sec:  754.0
Testtime in sec:  14.0
Totaltime in sec:  769.0
Test: 2/10
Number of train-tupels: 20 Sample technique: DiverSet
[188.  32.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.06
Accuracy: 0.99
Precision: 0.99
Recall: 0.94
F1 score: 0.97

Traintime in sec:  750.0
Testtime in sec:  15.0
Totaltime in sec:  765.0
Test: 3/10
Number of train-tupels: 20 Sample technique: DiverSet
[189.  31.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.07
Accuracy: 0.99
Precision: 0.98
Recall: 0.94
F1 score: 0.96

Traintime in sec:  746.0
Testtime in sec:  14.0
Totaltime in sec:  760.0
Test: 4/10
Number of train-tupels: 20 Sample technique: DiverSet
[192.  28.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.05
Accuracy: 0.99
Precision: 1.00
Recall: 0.94
F1 score: 0.97

Traintime in sec:  761.0
Testtime in sec:  15.0
Totaltime in sec:  776.0
Test: 5/10
Number of train-tupels: 20 Sample technique: DiverSet
[191.  29.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.05
Accuracy: 0.99
Precision: 0.99
Recall: 0.94
F1 score: 0.96

Traintime in sec:  769.0
Testtime in sec:  15.0
Totaltime in sec:  784.0
Test: 6/10
Number of train-tupels: 20 Sample technique: DiverSet
[192.  28.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.07
Accuracy: 0.99
Precision: 0.96
Recall: 0.94
F1 score: 0.95

Traintime in sec:  770.0
Testtime in sec:  15.0
Totaltime in sec:  785.0
Test: 7/10
Number of train-tupels: 20 Sample technique: DiverSet
[192.  28.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.07
Accuracy: 0.99
Precision: 0.98
Recall: 0.94
F1 score: 0.96

Traintime in sec:  756.0
Testtime in sec:  15.0
Totaltime in sec:  771.0
Test: 8/10
Number of train-tupels: 20 Sample technique: DiverSet
[192.  28.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.03
Accuracy: 0.99
Precision: 0.99
Recall: 0.95
F1 score: 0.97

Traintime in sec:  760.0
Testtime in sec:  14.0
Totaltime in sec:  774.0
Test: 9/10
Number of train-tupels: 20 Sample technique: DiverSet
[193.  27.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.06
Accuracy: 0.98
Precision: 0.97
Recall: 0.94
F1 score: 0.95

Traintime in sec:  768.0
Testtime in sec:  15.0
Totaltime in sec:  783.0
Test: 10/10
Number of train-tupels: 20 Sample technique: DiverSet
[190.  30.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.06
Accuracy: 0.99
Precision: 0.99
Recall: 0.94
F1 score: 0.97

Traintime in sec:  757.0
Testtime in sec:  15.0
Totaltime in sec:  772.0
No measures saved!
-----------------------------------------------------------------------------
Average scores for M0
> Accuracy: 0.99 (+- 0.0)
> Precison: 0.99 (+- 0.01)
> Recall: 0.94 (+- 0.0)
> F1: 0.96 (+- 0.01)
> Traintime in sec: 759.0 (+- 8.0)
> Testtime in sec: 14.0 (+- 0)
> Totaltime in sec: 773.0 (+- 8.0)
-----------------------------------------------------------------------------


In [13]:
# Evaluate model 'M1' with the 'DiverSet' sampling technique.
# Judging by the recorded output, the first two arguments are the number of
# repetitions (10) and the number of training tuples (20), and the final True
# selects the branch that prints 'No measures saved!' instead of writing the
# result CSVs. Test's signature is not visible here -- confirm the parameter
# meanings there. The recorded run took roughly 815 s per repetition.
Test(10,20,'M1','DiverSet',True)

Test: 1/10
Number of train-tupels: 20 Sample technique: DiverSet
[188.  32.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.07
Accuracy: 0.99
Precision: 1.00
Recall: 0.97
F1 score: 0.98

Traintime in sec:  798.0
Testtime in sec:  16.0
Totaltime in sec:  814.0
Test: 2/10
Number of train-tupels: 20 Sample technique: DiverSet
[188.  32.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.06
Accuracy: 0.99
Precision: 0.99
Recall: 0.97
F1 score: 0.98

Traintime in sec:  810.0
Testtime in sec:  16.0
Totaltime in sec:  827.0
Test: 3/10
Number of train-tupels: 20 Sample technique: DiverSet
[189.  31.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.18
Accuracy: 0.99
Precision: 1.00
Recall: 0.94
F1 score: 0.97

Traintime in sec:  811.0
Testtime in sec:  16.0
Totaltime in sec:  827.0
Test: 4/10
Number of train-tupels: 20 Sample technique: DiverSet
[192.  28.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.13
Accuracy: 0.99
Precision: 1.00
Recall: 0.94
F1 score: 0.97

Traintime in sec:  799.0
Testtime in sec:  16.0
Totaltime in sec:  815.0
Test: 5/10
Number of train-tupels: 20 Sample technique: DiverSet
[191.  29.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.10
Accuracy: 0.99
Precision: 0.99
Recall: 0.97
F1 score: 0.98

Traintime in sec:  804.0
Testtime in sec:  16.0
Totaltime in sec:  820.0
Test: 6/10
Number of train-tupels: 20 Sample technique: DiverSet
[192.  28.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.08
Accuracy: 1.00
Precision: 1.00
Recall: 0.97
F1 score: 0.99

Traintime in sec:  806.0
Testtime in sec:  16.0
Totaltime in sec:  823.0
Test: 7/10
Number of train-tupels: 20 Sample technique: DiverSet
[192.  28.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.06
Accuracy: 1.00
Precision: 1.00
Recall: 0.97
F1 score: 0.99

Traintime in sec:  795.0
Testtime in sec:  16.0
Totaltime in sec:  810.0
Test: 8/10
Number of train-tupels: 20 Sample technique: DiverSet
[192.  28.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.07
Accuracy: 0.99
Precision: 0.99
Recall: 0.97
F1 score: 0.98

Traintime in sec:  796.0
Testtime in sec:  16.0
Totaltime in sec:  812.0
Test: 9/10
Number of train-tupels: 20 Sample technique: DiverSet
[193.  27.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.11
Accuracy: 0.99
Precision: 0.99
Recall: 0.93
F1 score: 0.96

Traintime in sec:  785.0
Testtime in sec:  16.0
Totaltime in sec:  801.0
Test: 10/10
Number of train-tupels: 20 Sample technique: DiverSet
[190.  30.]


  train_Manuel_new = train_Manuel.append(train_Manuel_zus)


-----------------------------------------------------------------------------
Loss: 0.07
Accuracy: 0.99
Precision: 1.00
Recall: 0.97
F1 score: 0.98

Traintime in sec:  790.0
Testtime in sec:  16.0
Totaltime in sec:  806.0
No measures saved!
-----------------------------------------------------------------------------
Average scores for M1
> Accuracy: 0.99 (+- 0.0)
> Precison: 1.0 (+- 0.01)
> Recall: 0.96 (+- 0.02)
> F1: 0.98 (+- 0.01)
> Traintime in sec: 799.0 (+- 8.0)
> Testtime in sec: 16.0 (+- 0)
> Totaltime in sec: 815.0 (+- 8.0)
-----------------------------------------------------------------------------
