In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import itertools
import winsound

data_folder = './data'

# Gesamtzeitmessung starten
start_time_total = time.time()

# Datensätze einlesen und sortieren
def read_and_sort_datasets(folder):
    datasets = []
    for filename in os.listdir(folder):
        if filename.endswith('.csv'):
            parts = filename.replace('.csv', '').split('_')
            k = int(parts[1][1:])
            l = int(parts[2][1:])
            datasets.append((k, l, filename))
    datasets.sort() 
    return datasets

sorted_datasets = read_and_sort_datasets(data_folder)
# print(sorted_datasets)

# Preprocessing-Funktion für Intervalle und Mengen
def preprocessing(df):
    def process_value(value):
        if pd.isna(value):
            return np.nan
        if isinstance(value, str):
            value = value.strip()
            if value.startswith('[') and value.endswith('['):
                try:
                    start, end = map(float, value[1:-1].split(','))
                    return (start + end) / 2
                except ValueError:
                    return np.nan
            elif value.startswith('{') and value.endswith('}'):
                try:
                    values = list(map(float, value[1:-1].split(',')))
                    return np.mean(values)
                except ValueError:
                    return np.nan
            else:
                try:
                    return float(value)
                except ValueError:
                    return np.nan
        return value
    
    for column in df.columns:
        df[column] = df[column].apply(process_value)
    return df

# Funktion zum Extrahieren des ersten numerischen Werts
def extract_numeric(value):
    value = str(value).strip()
    if value == "*":
        return np.nan
    elif value.startswith('[') and value.endswith('['):
        interval = value.strip('[').strip('[').split(', ')
        return int(interval[0])
    elif value.startswith('{') and value.endswith('}'):
        set_values = [int(v.strip()) for v in value.strip('{}').split(',')]
        return min(set_values)
    else:
        try:
            return int(value)
        except ValueError:
            return np.nan

# Anzahl plotten
def plot_cardio_count(df, output_dir):
    plt.figure(figsize=(12, 8))
    # sns.set(rc={'axes.facecolor':'c0c0c0', 'figure.facecolor':'lightblue'})
    plt.subplot(1, 2, 1)
    sns.barplot(x=df["cardio"].value_counts().index, y=df["cardio"].value_counts())
    plt.subplot(1, 2, 2)
    plt.pie(x=df["cardio"].value_counts(), autopct="%.1f%%", pctdistance=0.8,
            labels= df["cardio"].value_counts().index, shadow=True, explode=[0.05,0.05])
    plt.suptitle("Cardiovascular Disease", fontsize=16)
    plt.savefig(os.path.join(output_dir, 'cardio_count.png'))
    # plt.show()
    plt.close()

# Features plotten
def plot_column(df, column_name, output_dir):  
    plt.figure(figsize=(20, 7))

    # Extrahiere den ersten numerischen Wert und sortiere
    df[f'{column_name}_num'] = df[column_name].apply(extract_numeric)
    df_sorted = df.dropna(subset=[f'{column_name}_num']).sort_values(by=f'{column_name}_num')
    
    chart_num = df_sorted[column_name].value_counts().reindex(df_sorted[column_name].unique())
    chart = sns.barplot(x = chart_num.index, y = chart_num)
    
    for p in chart.patches:
        chart.annotate(format(p.get_height(), '.0f'), 
                       (p.get_x() + p.get_width() / 2., p.get_height()), 
                       ha = 'center', va = 'center', 
                       xytext = (0, 15), 
                       textcoords = 'offset points',
                       fontsize=8,
                       rotation=90
                      )

    chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
    plt.title(f'Distribution of {column_name}')
    plt.savefig(os.path.join(output_dir, f'{column_name}.png'))
    # plt.show()
    plt.close()

# Ergebnisse plotten

# Confusion Matrix
def plot_confusion_matrix(y_test, y_pred, classifier_name, output_dir):
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["0", "1"])
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix - {classifier_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(os.path.join(output_dir, f'confusion_matrix_{classifier_name}.png'))
    # plt.show()
    plt.close()
    
# ROC AUC Kurve
def plot_roc_auc(y_test, y_proba, roc_auc, classifier_name, output_dir):
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure()
    plt.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC AUC - {classifier_name}')
    plt.legend(loc='lower right')
    plt.savefig(os.path.join(output_dir, f'roc_auc_{classifier_name}.png'))
    # plt.show()
    plt.close()
    
# Feature Importances (nur für Random Forest und Gradient Boosting)
def plot_feature_importances(classifier, classifier_name, X_train, X, output_dir):
    if hasattr(classifier, 'feature_importances_'):
        importances = classifier.feature_importances_
        #indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(12, 6))
        plt.title(f'Feature Importances - {classifier_name}')
        bars = plt.bar(range(X_train.shape[1]), importances, align='center')
        plt.xticks(range(X_train.shape[1]), X.columns, rotation=90)
        plt.xlim([-1, X_train.shape[1]])

        for bar, importance in zip(bars, importances):
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2.0, yval, round(importance, 2), ha='center', va='bottom')
        
        plt.savefig(os.path.join(output_dir, f'feature_importances_{classifier_name}.png'))
        # plt.show()
        plt.close()

# Korrelations heatmap erstellen
def plot_correlation_heatmap(X, classifier_name, output_dir):
    plt.figure(figsize=(16, 10))
    correlation_matrix = X.corr()
    fig, ax = plt.subplots(figsize=(16, 10))
    im = ax.imshow(correlation_matrix, cmap='Blues')
    ax.set_xticks(np.arange(len(X.columns)))
    ax.set_yticks(np.arange(len(X.columns)))
    ax.set_xticklabels(X.columns, rotation=45)
    ax.set_yticklabels(X.columns)
    # Korrelationswerte ausgeben
    for i in range(len(X.columns)):
        for j in range(len(X.columns)):
            text = ax.text(j, i, f"{correlation_matrix.iloc[i, j]:.2f}", ha="center", va="center", color="black")
    plt.colorbar(im)
    plt.title("Correlation Heatmap")
    plt.savefig(os.path.join(output_dir, f'correlation_heatmap_{classifier_name}.png'))
    # plt.show()
    plt.close()


def classify_and_evaluate(df, classifier_name, clf, output_dir):
    target_column = 'cardio'
    if target_column not in df.columns:
        print(f"Die Zielspalte '{target_column}' wurde nicht im DataFrame gefunden.")
        return

    df = df.dropna()
    
    # Feature- und Zielvariablen definieren
    X = df.drop(columns=['cardio'])
    y = df['cardio']

    # Train-Test-Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Daten normalisieren
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Zeitmesser starten
    start_time = time.time()
    
    # Modelltraining
    clf.fit(X_train, y_train)

    # Zeitmesser stoppen
    end_time = time.time()
    total_time = end_time - start_time
    
    # Vorhersagen
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else None
    
    # Classification Report
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba) if y_proba is not None else None
    }
    report = classification_report(y_test, y_pred)
    
    # Speichern der Ergebnisse
    with open(os.path.join(output_dir, f'classification_report_{classifier_name}.txt'), 'w') as f:
        f.write(report)
        f.write(f'\nAccuracy: {metrics["accuracy"]}\n')
        f.write(f'ROC AUC: {metrics["roc_auc"]}\n')
        f.write(f"Laufzeit: {total_time:.2f} Sekunden\n")

    # Plotten der Ergebnisse
    plot_confusion_matrix(y_test, y_pred, classifier_name, output_dir)
    plot_roc_auc(y_test, y_proba, metrics['roc_auc'], classifier_name, output_dir)
    plot_feature_importances(clf, classifier_name, X_train, X, output_dir)
    plot_correlation_heatmap(X, classifier_name, output_dir)

    return metrics



# EDA & Classifying
results = []
num_iterations = 100

output_base_dir = 'results'
os.makedirs(output_base_dir, exist_ok=True)

classifiers = ['lr', 'rf', 'gb']
metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']

# Überprüfen, welche k- und l-Werte bereits vorhanden sind
existing_files = [f for f in os.listdir(output_base_dir) if f.startswith('cvd_k')]
existing_k_l_pairs = []

for f in existing_files:
    parts = f.split('_')
    
    # Überprüfen, ob der Name aus 3 Teilen besteht und das richtige Format hat
    if len(parts) == 3 and parts[0] == 'cvd' and parts[1].startswith('k') and parts[2].startswith('l'):
        try:
            # Extrahiere die numerischen Werte von k und l
            k = int(parts[1][1:])  # Entfernt das 'k' und konvertiert den Rest in eine Ganzzahl
            l = int(parts[2][1:])  # Entfernt das 'l' und konvertiert den Rest in eine Ganzzahl
            existing_k_l_pairs.append((k, l))
        except ValueError:
            continue  # Falls es einen Fehler bei der Konvertierung gibt, überspringe diese Datei

# Bestimmen des höchsten k-Werts
if existing_k_l_pairs:
    max_k, max_l = max(existing_k_l_pairs)
else:
    max_k, max_l = -1, -1


# Lesen der bestehenden CSV-Datei
csv_file_path = f'{output_base_dir}/classification_results_multiple_iterations.csv'
if os.path.exists(csv_file_path):
    df_existing = pd.read_csv(csv_file_path)
else:
    df_existing = pd.DataFrame()


# Schleife über die Datensätze
for k, l, filename in sorted_datasets:
    # Wenn k kleiner ist als der höchste existierende k-Wert, überspringe diesen Durchlauf
    if k < max_k:
        continue

    # Wenn k gleich dem höchsten existierenden k-Wert ist, lösche die entsprechenden Einträge in der CSV
    if k == max_k:
        df_existing = df_existing[df_existing['k'] != k]
    
        
    try:
        #df = pd.read_csv(os.path.join(data_folder, filename), sep=';', index_col=False)
        df = pd.read_csv(os.path.join(data_folder, filename), sep=',', index_col=False)
        print(f'Successfully read {filename}')

        output_dir = f'results/cvd_k{k}_l{l}'
        os.makedirs(output_dir, exist_ok=True)
            
        # id entfernen
        df = df.drop('id', axis=1)

        # Zeilen mit "*" entfernen
        df = df[~df.isin(['*']).any(axis=1)]

        # EDA plotten
        plot_cardio_count(df, output_dir)
        for column in df.columns:
            plot_column(df, column, output_dir)
                
        # Alle Spalten, die mit '_num' enden, entfernen
        df = df[[col for col in df.columns if not col.endswith('_num')]]

        # Zeilen mit NaN Werten entfernen
        df = df.dropna()

        # Klassifizierung mit verschiedenen Modellen
        classifiers = {
            'lr': LogisticRegression(max_iter=10000),
            'rf': RandomForestClassifier(),
            'gb': GradientBoostingClassifier()
        }

        df = preprocessing(df)

        for clf_name, clf in classifiers.items():
            for i in range(num_iterations):
                print(f'Classifying with {clf_name} for {filename}, iteration {i+1}/{num_iterations}')
                metrics = classify_and_evaluate(df, clf_name, clf, output_dir)

                results.append({
                    'k':k, 
                    'l':l,
                    'filename': filename,
                    'classifier': clf_name,
                    **metrics
                })

                # Fortschritt nach jeder Iteration anhängen
                df_results = pd.DataFrame(results)
                
                # Löschen der bestehenden Einträge für den aktuellen k-Wert und Anhängen neuer Ergebnisse
                df_existing = pd.concat([df_existing, df_results], ignore_index=True)
                df_existing.to_csv(csv_file_path, index=False)
                
                results = []  # Clear results after writing to CSV to avoid duplications
    
    except pd.errors.ParserError as e:
        print(f'Error parsing {filename}: {e}')
        continue

winsound.Beep(1000, 1000)


# Endzeit messen
end_time_total = time.time()

# Berechne die verstrichene Zeit
elapsed_time_total = end_time_total - start_time_total

hours_total, rem_total = divmod(elapsed_time_total, 3600)
minutes_total, seconds_total = divmod(rem_total, 60)

print(f"Gesamte Ausführung hat {int(hours_total)} Stunden, {int(minutes_total)} Minuten, und {seconds_total:.2f} Sekunden gedauert.")

KeyError: 'k'