In [1]:
# Tensorflow / Keras
from tensorflow import keras # for building Neural Networks
from keras.models import Sequential # for creating a linear stack of layers for our Neural Network
from keras import Input # for instantiating a keras tensor
from keras.layers import Dense # for creating regular densely-connected NN layers.
import tensorflow as tf
from keras.models import load_model

# Data manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Sklearn
import sklearn # for model evaluation
from sklearn.model_selection import train_test_split # for splitting data into train and test samples
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Erstellung eigener Aktivierungsfunktion
from keras import backend as K
from sklearn.ensemble import RandomForestRegressor

import os
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.impute import KNNImputer
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

In [2]:
desired_cpu_usage = 0.90
num_threads = int(os.cpu_count() * desired_cpu_usage)

os.environ['OMP_NUM_THREADS'] = str(num_threads)
os.environ['TF_NUM_INTEROP_THREADS'] = str(num_threads)
os.environ['TF_NUM_INTRAOP_THREADS'] = str(num_threads)

tf.config.threading.set_inter_op_parallelism_threads(num_threads)
tf.config.threading.set_intra_op_parallelism_threads(num_threads)

In [11]:
class DataProcessor:
    def __init__(self):
        self.raw_data = None
        self.X = None
        self.Y = None
        self.new_df = None
        self.data_inpute = None
        self.result = None

    def import_data(self, sample_size = 4200):
        csv_files = []
        for filename in os.listdir():
            if filename == "Datasets":
                for csv_file in os.listdir(filename):
                    if csv_file.endswith('.csv'):
                        csv_files.append(csv_file)

        self.raw_data = pd.read_csv(os.path.join("Datasets", csv_files[1]))
        self.raw_data.drop(["column_a"], axis=1, inplace=True)
        # Auswahl von 4000 zufälligen Zeilen
        self.raw_data = self.raw_data.sample(n=sample_size, random_state=42)   
        self.Y = self.raw_data["y"] - 1
        self.X = self.raw_data.drop(["y"], axis=1)
        return self.raw_data, self.Y, self.X

    def gen_miss_values(self, p):
        shape = self.X.shape
        self.new_df = self.X.copy().astype(np.float64)
        missing = np.random.binomial(1, p, shape)
        self.new_df[missing.astype('bool')] = np.nan
        return self.new_df

    def inpute_data(self, model):
        if model == "mean":
            self.data_inpute = self.new_df.fillna(self.new_df.mean())
            self.Y.index = self.data_inpute.index
            self.data_inpute = pd.concat([self.data_inpute, self.Y], axis=1, sort=False)
            columns = self.data_inpute.columns.tolist()

            for i in range(len(columns) - 1):
                columns[i] = "col_" + str(i + 1)
            self.data_inpute.columns = columns
            self.data_inpute.columns = [*self.data_inpute.columns[:-1], 'Y']

            self.Y = self.data_inpute["Y"]
            self.X = self.data_inpute.drop(["Y"], axis=1)
            return self.data_inpute

        elif model == "MICE":
            imputer = IterativeImputer()
            self.data_inpute = pd.DataFrame(imputer.fit_transform(self.new_df), columns=self.new_df.columns)
            self.Y.index = self.data_inpute.index
            self.data_inpute = pd.concat([self.data_inpute, self.Y], axis=1, sort=False)

            columns = self.data_inpute.columns.tolist()

            for i in range(len(columns) - 1):
                columns[i] = "col_" + str(i + 1)
            self.data_inpute.columns = columns
            self.data_inpute.columns = [*self.data_inpute.columns[:-1], 'Y']

            self.Y = self.data_inpute["Y"]
            self.X = self.data_inpute.drop(["Y"], axis=1)

            return self.data_inpute

        elif model == "kNN":
            imputer = KNNImputer()
            self.data_inpute = pd.DataFrame(imputer.fit_transform(self.new_df), columns=self.new_df.columns)
            self.Y.index = self.data_inpute.index
            self.data_inpute = pd.concat([self.data_inpute, self.Y], axis=1, sort=False)

            columns = self.data_inpute.columns.tolist()

            for i in range(len(columns) - 1):
                columns[i] = "col_" + str(i + 1)
            self.data_inpute.columns = columns
            self.data_inpute.columns = [*self.data_inpute.columns[:-1], 'Y']

            self.Y = self.data_inpute["Y"]
            self.X = self.data_inpute.drop(["Y"], axis=1)

            return self.data_inpute
        
        elif model == "RF":
            imputer = MissForest(random_state=42, n_jobs=-1, criterion='squared_error')
            self.data_inpute = pd.DataFrame(imputer.fit_transform(self.new_df), columns=self.new_df.columns)
            self.Y.index = self.data_inpute.index
            self.data_inpute = pd.concat([self.data_inpute, self.Y], axis=1, sort=False)

            columns = self.data_inpute.columns.tolist()

            for i in range(len(columns) - 1):
                columns[i] = "col_" + str(i + 1)
            self.data_inpute.columns = columns
            self.data_inpute.columns = [*self.data_inpute.columns[:-1], 'Y']

            self.Y = self.data_inpute["Y"]
            self.X = self.data_inpute.drop(["Y"], axis=1)

            return self.data_inpute
    
    def model(self, model):
        if model == 0:
            return load_model("model_D1_01.h5")
        elif model == 1:
            return load_model("model_D1_02.h5")
        elif model == 2:
            return load_model("model_D1_03.h5")

    def evaluate(self, y_test, y_nan):
        y_true = y_test
        y_pred = y_nan
        conf_matrix = confusion_matrix(y_true, y_pred)

        # Compute the accuracy
        accuracy = accuracy_score(y_true, y_pred)

        # Compute the precision
        precision_scores = precision_score(y_true, y_pred, labels=range(len(conf_matrix)), average=None)

        # Compute the recall
        recall = recall_score(y_true, y_pred, labels=range(len(conf_matrix)), average=None)

        # Compute the F1-score
        f1 = f1_score(y_true, y_pred, labels=range(len(conf_matrix)), average=None)

        # Compute the average accuracy
        avg_accuracy = sum(precision_scores) / len(conf_matrix)

        # Compute the predicted probabilities
        y_scores = np.random.rand(len(y_true), len(np.unique(y_true)))

        # Compute the AUC for each class
        n_classes = conf_matrix.shape[0]
        auc_list = []
        for i in range(n_classes):
            auc_list.append(roc_auc_score(y_true == i, y_scores[:, i]))

        # Define the result dictionary
        self.result = {
            "confusion_matrix": conf_matrix,
            "accuracy": accuracy,
            "precision": precision_scores,
            "recall": recall,
            "f1_score": f1,
            "avg_accuracy": avg_accuracy,
            "auc": auc_list
        }
        return self.result
    def save_txt(self, filename, evaluate):
        with open(filename, 'w') as f:
            f.write(str(evaluate))

In [6]:
data_processor = DataProcessor()
load_data = data_processor.import_data()
test_size = 0.95  # Konstante Testgröße
for model_number in range(0, 3):  # Modelle 1-3 durchlaufen
    model_name = f"Model_{model_number + 1}"
    output_prefix = f"prediction_ref_mean_D1_{model_name}_"
    model = data_processor.model(model_number)
    
    for missing_rate in [0.3, 0.6, 0.9]:
        miss_data = data_processor.gen_miss_values(missing_rate)
        inpute_values = data_processor.inpute_data("mean")
        Y = inpute_values["Y"]
        X = inpute_values.drop(["Y"], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

        y_pred = model.predict(X_test)
        y_pred_class = np.argmax(y_pred, axis=1)
        y_test = y_test[X_test.index].values

        evaluate = data_processor.evaluate(y_test, y_pred_class) 
        print(evaluate)
        filename = output_prefix + str(missing_rate).replace(".", "_")
        data_processor.save_txt(filename, evaluate)

{'confusion_matrix': array([[622,  27,  29, 117,  24],
       [ 32, 160, 263,  79, 242],
       [ 10, 116, 306,  79, 267],
       [ 11,  19,  74, 486, 222],
       [  3,  51, 128, 169, 454]], dtype=int64), 'accuracy': 0.5082706766917293, 'precision': array([0.91740413, 0.42895442, 0.3825    , 0.52258065, 0.37551696]), 'recall': array([0.75946276, 0.20618557, 0.3933162 , 0.59852217, 0.56397516]), 'f1_score': array([0.83099532, 0.27850305, 0.3878327 , 0.55797933, 0.45084409]), 'avg_accuracy': 0.5253912309418822, 'auc': [0.5112933949263183, 0.5087247159655887, 0.48231807457253806, 0.5048501551615634, 0.4818811002661934]}
{'confusion_matrix': array([[393,  50,  46, 271,  59],
       [ 19,  61, 139, 102, 455],
       [  2,  44, 159,  89, 484],
       [  4,  33,  86, 260, 429],
       [  0,  29, 123,  73, 580]], dtype=int64), 'accuracy': 0.36416040100250624, 'precision': array([0.94019139, 0.28110599, 0.2875226 , 0.32704403, 0.28898854]), 'recall': array([0.47985348, 0.07860825, 0.20437018, 

  _warn_prf(average, modifier, msg_start, len(result))


{'confusion_matrix': array([[  0,   1,  12,  10, 796],
       [  0,   0,  10,   0, 766],
       [  0,   0,   8,   1, 769],
       [  0,   2,  12,   1, 797],
       [  0,   0,  12,   0, 793]], dtype=int64), 'accuracy': 0.20100250626566415, 'precision': array([0.        , 0.        , 0.14814815, 0.08333333, 0.20224433]), 'recall': array([0.        , 0.        , 0.01028278, 0.00123153, 0.98509317]), 'f1_score': array([0.        , 0.        , 0.01923077, 0.00242718, 0.33559035]), 'avg_accuracy': 0.08674516138173369, 'auc': [0.5046604819547109, 0.5091348898825371, 0.4969234906376154, 0.5079630743380444, 0.5111935801554258]}
 25/125 [=====>........................] - ETA: 0s

  _warn_prf(average, modifier, msg_start, len(result))


{'confusion_matrix': array([[  0,   0,   4,   9, 806],
       [  0,   0,   3,   0, 773],
       [  0,   0,   7,   0, 771],
       [  0,   0,   7,   1, 804],
       [  0,   0,   6,   0, 799]], dtype=int64), 'accuracy': 0.20225563909774436, 'precision': array([0.        , 0.        , 0.25925926, 0.1       , 0.20212497]), 'recall': array([0.        , 0.        , 0.00899743, 0.00123153, 0.99254658]), 'f1_score': array([0.        , 0.        , 0.0173913 , 0.00243309, 0.3358554 ]), 'avg_accuracy': 0.1122768455275412, 'auc': [0.47961051177702074, 0.5083714772355481, 0.507413555209097, 0.5201609278072462, 0.5108874089530699]}
  1/125 [..............................] - ETA: 1s

  _warn_prf(average, modifier, msg_start, len(result))


{'confusion_matrix': array([[  0,   0,   1,   0, 818],
       [  0,   0,   0,   0, 776],
       [  0,   0,   1,   0, 777],
       [  0,   0,   0,   0, 812],
       [  0,   0,   1,   0, 804]], dtype=int64), 'accuracy': 0.20175438596491227, 'precision': array([0.        , 0.        , 0.33333333, 0.        , 0.20165538]), 'recall': array([0.        , 0.        , 0.00128535, 0.        , 0.99875776]), 'f1_score': array([0.        , 0.        , 0.00256082, 0.        , 0.33555927]), 'avg_accuracy': 0.10699774266365689, 'auc': [0.5024599073794911, 0.5077243406744975, 0.5002781183671771, 0.5037089193872901, 0.5254631083202512]}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
data_processor = DataProcessor()
load_data = data_processor.import_data()
test_size = 0.95 # Konstante Testgröße
for model_number in range(0, 3):  # Modelle 1-3 durchlaufen
    model_name = f"Model_{model_number + 1}"
    output_prefix = f"prediction_ref_MICE_D1_{model_name}_"
    model = data_processor.model(model_number)
    
    for missing_rate in [0.3, 0.6, 0.9]:
        miss_data = data_processor.gen_miss_values(missing_rate)
        inpute_values = data_processor.inpute_data("MICE")
        Y = inpute_values["Y"]
        X = inpute_values.drop(["Y"], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

        y_pred = model.predict(X_test)
        y_pred_class = np.argmax(y_pred, axis=1)
        y_test = y_test[X_test.index].values

        evaluate = data_processor.evaluate(y_test, y_pred_class) 
        print(evaluate)
        filename = output_prefix + str(missing_rate).replace(".", "_")
        data_processor.save_txt(filename, evaluate)

{'confusion_matrix': array([[666,  38,  39,  69,   7],
       [ 26, 303, 325,  22, 100],
       [  7, 166, 477,  21, 107],
       [  8,  19,  42, 579, 164],
       [  0,  55, 126, 192, 432]], dtype=int64), 'accuracy': 0.6157894736842106, 'precision': array([0.94200849, 0.52151463, 0.47274529, 0.65571914, 0.53333333]), 'recall': array([0.81318681, 0.39046392, 0.61311054, 0.71305419, 0.53664596]), 'f1_score': array([0.87287025, 0.44657332, 0.53385562, 0.68318584, 0.53498452]), 'avg_accuracy': 0.625064176302234, 'auc': [0.49444350106601764, 0.4973232443112927, 0.5047676291029463, 0.5218415863991047, 0.5020997104049456]}
{'confusion_matrix': array([[544,  49,  36, 166,  24],
       [ 33, 114, 196, 102, 331],
       [  7,  82, 234, 101, 354],
       [  8,  31,  80, 399, 294],
       [  1,  49, 130, 153, 472]], dtype=int64), 'accuracy': 0.4418546365914787, 'precision': array([0.91736931, 0.35076923, 0.34615385, 0.43322476, 0.32      ]), 'recall': array([0.66422466, 0.14690722, 0.30077121, 0.



{'confusion_matrix': array([[181,  33,  86, 360, 159],
       [  6,   5,  60,  85, 620],
       [  0,   3,  60,  59, 656],
       [  1,   6,  90, 102, 613],
       [  0,   1,  51,  32, 721]], dtype=int64), 'accuracy': 0.2679197994987469, 'precision': array([0.96276596, 0.10416667, 0.17291066, 0.15987461, 0.26038281]), 'recall': array([0.22100122, 0.0064433 , 0.07712082, 0.12561576, 0.89565217]), 'f1_score': array([0.35948361, 0.01213592, 0.10666667, 0.14068966, 0.4034695 ]), 'avg_accuracy': 0.3320201409533474, 'auc': [0.4794630367005012, 0.4901802840664875, 0.5107261650558478, 0.4893258609839196, 0.5276098169798259]}




{'confusion_matrix': array([[ 91,  18, 120, 280, 310],
       [  2,   2,  43,  39, 690],
       [  0,   0,  36,  31, 711],
       [  0,   2,  66,  45, 699],
       [  0,   0,  36,   7, 762]], dtype=int64), 'accuracy': 0.23458646616541354, 'precision': array([0.97849462, 0.09090909, 0.11960133, 0.1119403 , 0.24022699]), 'recall': array([0.11111111, 0.00257732, 0.04627249, 0.05541872, 0.94658385]), 'f1_score': array([0.1995614 , 0.00501253, 0.06672845, 0.07413509, 0.38320342]), 'avg_accuracy': 0.30823446562094947, 'auc': [0.5146445061298419, 0.4990040351811341, 0.4824077127225347, 0.502634723948823, 0.4850941427693868]}
{'confusion_matrix': array([[ 11,   6,  67,  70, 665],
       [  0,   0,  14,   8, 754],
       [  0,   0,   8,   2, 768],
       [  1,   0,   8,   4, 799],
       [  0,   0,   7,   3, 795]], dtype=int64), 'accuracy': 0.2050125313283208, 'precision': array([0.91666667, 0.        , 0.07692308, 0.04597701, 0.21026184]), 'recall': array([0.01343101, 0.        , 0.01028278, 0

In [9]:
data_processor = DataProcessor()
load_data = data_processor.import_data()
test_size = 0.95  # Konstante Testgröße
for model_number in range(0, 3):  # Modelle 1-3 durchlaufen
    model_name = f"Model_{model_number + 1}"
    output_prefix = f"prediction_ref_kNN_D1_{model_name}_"
    model = data_processor.model(model_number)
    
    for missing_rate in [0.3, 0.6, 0.9]:
        miss_data = data_processor.gen_miss_values(missing_rate)
        inpute_values = data_processor.inpute_data("kNN")
        Y = inpute_values["Y"]
        X = inpute_values.drop(["Y"], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

        y_pred = model.predict(X_test)
        y_pred_class = np.argmax(y_pred, axis=1)
        y_test = y_test[X_test.index].values

        evaluate = data_processor.evaluate(y_test, y_pred_class) 
        print(evaluate)
        filename = output_prefix + str(missing_rate).replace(".", "_")
        data_processor.save_txt(filename, evaluate)

{'confusion_matrix': array([[631,  32,  44,  95,  17],
       [ 25, 248, 313,  35, 155],
       [  6, 156, 425,  33, 158],
       [  5,  39,  57, 467, 244],
       [  0,  62, 165, 125, 453]], dtype=int64), 'accuracy': 0.5573934837092732, 'precision': array([0.94602699, 0.46182495, 0.42330677, 0.61854305, 0.44109056]), 'recall': array([0.77045177, 0.31958763, 0.54627249, 0.57512315, 0.56273292]), 'f1_score': array([0.84925976, 0.37776085, 0.47699214, 0.5960434 , 0.49454148]), 'avg_accuracy': 0.5781584628464801, 'auc': [0.5040755873300812, 0.4960341835654578, 0.5139019166557286, 0.49422445569447593, 0.5003367883225913]}
{'confusion_matrix': array([[432,  47,  64, 216,  60],
       [ 13, 121, 253,  58, 331],
       [  1,  99, 247,  38, 393],
       [  2,  37, 112, 234, 427],
       [  0,  68, 185,  38, 514]], dtype=int64), 'accuracy': 0.3879699248120301, 'precision': array([0.96428571, 0.32526882, 0.28687573, 0.40068493, 0.29797101]), 'recall': array([0.52747253, 0.15592784, 0.31748072, 0

  _warn_prf(average, modifier, msg_start, len(result))


{'confusion_matrix': array([[  3,   9,  68,  34, 705],
       [  0,  11,  61,  20, 684],
       [  0,  11,  55,  29, 683],
       [  0,  14,  61,  23, 714],
       [  0,  12,  64,  28, 701]], dtype=int64), 'accuracy': 0.1987468671679198, 'precision': array([1.        , 0.19298246, 0.17799353, 0.17164179, 0.20103241]), 'recall': array([0.003663  , 0.01417526, 0.07069409, 0.02832512, 0.87080745]), 'f1_score': array([0.00729927, 0.02641056, 0.10119595, 0.04862579, 0.32665424]), 'avg_accuracy': 0.3487300361545884, 'auc': [0.515050351379585, 0.5043467208539958, 0.5020740827296097, 0.5004902082358084, 0.49196173835038076]}
{'confusion_matrix': array([[  1,   4,  40,  10, 764],
       [  0,   6,  33,   8, 729],
       [  0,   4,  54,  10, 710],
       [  0,   4,  53,  13, 742],
       [  0,   7,  52,   7, 739]], dtype=int64), 'accuracy': 0.2037593984962406, 'precision': array([1.        , 0.24      , 0.23275862, 0.27083333, 0.20059718]), 'recall': array([0.001221  , 0.00773196, 0.06940874, 0.

  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
data_processor = DataProcessor()
load_data = data_processor.import_data()
test_size = 0.95  # Konstante Testgröße
for model_number in range(0, 3):  # Modelle 1-3 durchlaufen
    model_name = f"Model_{model_number + 1}"
    output_prefix = f"prediction_ref_RF_D1_{model_name}_"
    model = data_processor.model(model_number)
    
    for missing_rate in [0.3, 0.6, 0.9]:
        miss_data = data_processor.gen_miss_values(missing_rate)
        inpute_values = data_processor.inpute_data("RF")
        Y = inpute_values["Y"]
        X = inpute_values.drop(["Y"], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

        y_pred = model.predict(X_test)
        y_pred_class = np.argmax(y_pred, axis=1)
        y_test = y_test[X_test.index].values

        evaluate = data_processor.evaluate(y_test, y_pred_class)
        print(evaluate)
        filename = output_prefix + str(missing_rate).replace(".", "_")
        data_processor.save_txt(filename, evaluate)

ValueError: Found array with 0 sample(s) (shape=(0, 177)) while a minimum of 1 is required.