In [None]:
from dm_lib import read_sql_complete, column_str, attributes, create_dataframe_from_columns, undersample_data, oversample_with_smote
from dm_lib import load_dataframe_from_disk
from dm_lib import tranSet_A, tranSet_B, tranSet_C, tranSet_D


import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, precision_recall_curve, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import os
import re

def sanitize_filename(filename):
    """Return ``filename`` with characters unsuitable for file names replaced.

    Every occurrence of ``< > : " / \\ | ? * ,`` or a space is replaced by a
    single underscore; all other characters are kept as they are.
    """
    return re.sub(r'[<>:"/\\|?*, ]', '_', filename)

def preprocess_data(df, target_column, reduction_method='pca', n_components=2):
    """Standardize the feature columns and project them to fewer dimensions.

    Args:
        df: DataFrame containing the feature columns and the target column.
        target_column: Name of the column in ``df`` used as the label; every
            other column is treated as a feature.
        reduction_method: One of 'PCA', 't-SNE' or 'UMAP'. The name is matched
            ignoring case, hyphens and underscores, so 'pca', 'tsne' and
            'umap' are accepted as well.
        n_components: Number of dimensions of the reduced feature space.

    Returns:
        A tuple ``(X_reduced, y)``: the reduced feature matrix and the target
        column taken from ``df``.

    Raises:
        ValueError: If ``reduction_method`` names none of the supported
            methods.
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # The default value is lower-case while callers pass 'PCA' / 't-SNE' /
    # 'UMAP'; normalise the spelling so that all of these select a reducer.
    method_key = str(reduction_method).lower().replace('-', '').replace('_', '')

    # Resolve the reducer before scaling so an unknown name fails immediately.
    if method_key == 'pca':
        reducer = PCA(n_components=n_components)
    elif method_key == 'tsne':
        reducer = TSNE(n_components=n_components, perplexity=50)
    elif method_key == 'umap':
        reducer = umap.UMAP(n_components=n_components, n_neighbors=50)
    else:
        raise ValueError("Invalid reduction method.")

    X_scaled = StandardScaler().fit_transform(X)
    X_reduced = reducer.fit_transform(X_scaled)
    return X_reduced, y

def train_models(X, y):
    """Fit one instance of every classifier on the given training data.

    Args:
        X: Training feature matrix.
        y: Training labels.

    Returns:
        Dict mapping the (Latvian) display name of each classifier to its
        fitted estimator: random forest, Gaussian naive Bayes, SVM with
        probability estimates enabled, and logistic regression.
    """
    classifiers = {
        'Nejaušais mežs': RandomForestClassifier(),
        'Naivais Beijess': GaussianNB(),
        # probability=True is required for predict_proba on the SVM.
        'Atbalsta vektoru mašīnas': SVC(probability=True),
        'Loģistiskā regresija': LogisticRegression(),
    }

    for estimator in classifiers.values():
        estimator.fit(X, y)

    return classifiers

def evaluate_models(models, X_test, y_test):
    """Compute evaluation metrics for every fitted model on the test data.

    Args:
        models: Dict mapping model name to a fitted estimator that provides
            ``predict`` and ``predict_proba``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        Dict mapping model name to a dict with the keys 'fpr', 'tpr',
        'roc_auc', 'precision', 'recall', 'pr_auc', 'f1_score',
        'confusion_matrix', 'mcc' and 'kappa'.
    """
    def _score(model):
        # Hard predictions feed the threshold-based metrics; the second
        # column of predict_proba feeds the ROC and precision-recall curves.
        predicted = model.predict(X_test)
        scores = model.predict_proba(X_test)[:, 1]

        fpr, tpr, _ = roc_curve(y_test, scores)
        precision, recall, _ = precision_recall_curve(y_test, scores)

        return {
            'fpr': fpr,
            'tpr': tpr,
            'roc_auc': auc(fpr, tpr),
            'precision': precision,
            'recall': recall,
            'pr_auc': auc(recall, precision),
            'f1_score': f1_score(y_test, predicted),
            'confusion_matrix': confusion_matrix(y_test, predicted, labels=[0, 1]),
            'mcc': matthews_corrcoef(y_test, predicted),
            'kappa': cohen_kappa_score(y_test, predicted),
        }

    return {name: _score(model) for name, model in models.items()}

def plot_confusion_matrix(cm, class_names, title, filename):
    """Draw a confusion matrix, save it to ``filename`` and display it.

    Args:
        cm: Confusion matrix handed to ``ConfusionMatrixDisplay``.
        class_names: Class labels handed over as ``display_labels``.
        title: Title placed on the plot.
        filename: Path the figure is saved to.
    """
    ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=class_names,
    ).plot(cmap=plt.cm.Blues)

    # Latvian captions: x axis = predicted class, y axis = true class.
    plt.title(title)
    plt.xlabel("Prognozētā klase")
    plt.ylabel("Patiesā klase")

    plt.savefig(filename)
    plt.show()


def plot_metrics(metrics_dict, output_dir):
    """Plot, save and display the evaluation results of every run.

    For each entry of ``metrics_dict`` a figure with the ROC and
    precision-recall curves of all models is drawn, followed by one confusion
    matrix per model. Afterwards a summary table (CSV and PNG) and comparison
    plots for F1, MCC and Cohen's kappa are written.

    Args:
        metrics_dict: Dict mapping a run label to a dict that maps model name
            to its metrics. Each metrics dict is read for the keys 'fpr',
            'tpr', 'roc_auc', 'precision', 'recall', 'pr_auc', 'f1_score',
            'confusion_matrix', 'mcc' and 'kappa'.
        output_dir: Directory receiving all generated files. It is created
            when it does not exist yet.
    """
    # Neither savefig nor to_csv creates missing directories.
    os.makedirs(output_dir, exist_ok=True)

    class_names = ['Īsta transakcija', 'Krāpnieciska transakcija']
    rows = []

    for method, metrics in metrics_dict.items():
        plt.figure(figsize=(14, 7))

        # Left panel: ROC curves of all models plus the chance diagonal.
        plt.subplot(1, 2, 1)
        for name, metric in metrics.items():
            plt.plot(metric['fpr'], metric['tpr'], label=f'{name} (AUC = {metric["roc_auc"]:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('Kļūdainas atbilsmes koeficients')
        plt.ylabel('Pareizas atbilsmes koeficients')
        plt.title(f'ROC līkne ({method})')
        plt.legend(loc='best')
        # Saved while only the left panel has been drawn.
        roc_filename = os.path.join(output_dir, f"roc_curve_{sanitize_filename(method)}.png")
        plt.savefig(roc_filename)

        # Right panel: precision-recall curves of all models.
        plt.subplot(1, 2, 2)
        for name, metric in metrics.items():
            plt.plot(metric['recall'], metric['precision'], label=f'{name} (AUC = {metric["pr_auc"]:.2f})')
        plt.xlabel('Pārklājums')
        plt.ylabel('Precizitāte')
        plt.title(f'Precizitātes-Pārklājuma līkne ({method})')
        plt.legend(loc='best')
        # Saved with both panels drawn, before tight_layout is applied.
        pr_filename = os.path.join(output_dir, f"pr_curve_{sanitize_filename(method)}.png")
        plt.savefig(pr_filename)

        plt.tight_layout()
        plt.show()

        for name, metric in metrics.items():
            cm_filename = os.path.join(output_dir, f"confusion_matrix_{sanitize_filename(method)}_{name}.png")
            plot_confusion_matrix(metric['confusion_matrix'], class_names, f'Pārpratumu matrica ({method} - {name})', cm_filename)
            print(f"Confusion matrix for {name} ({method}) saved to: {cm_filename}")

            # One summary-table row per (run, model) pair.
            rows.append({
                'Method': method,
                'Model': name,
                'ROC AUC': metric['roc_auc'],
                'PR AUC': metric['pr_auc'],
                'F1 Score': metric['f1_score'],
                'MCC': metric['mcc'],
                'Cohen\'s Kappa': metric['kappa']
            })

    if not rows:
        # plt.table cannot render a table without cells, and the comparison
        # plots would be empty, so there is nothing left to produce.
        print("No metrics to plot.")
        return

    metrics_df = pd.DataFrame(rows)
    metrics_table_filename = os.path.join(output_dir, "metrics_table.csv")
    metrics_df.to_csv(metrics_table_filename, sep=',', decimal=',', index=False)

    # Render the same table as an image; the figure height grows with the
    # number of rows.
    plt.figure(figsize=(12, len(metrics_df) * 0.4 + 1))
    plt.table(cellText=metrics_df.values, colLabels=metrics_df.columns, cellLoc='center', loc='center')
    plt.axis('off')
    table_filename = os.path.join(output_dir, "metrics_table.png")
    plt.savefig(table_filename)
    plt.show()

    # F1 comparison: one line per run, models along the x axis.
    plt.figure(figsize=(10, 6))
    for method, metrics in metrics_dict.items():
        f1_scores = [metric['f1_score'] for metric in metrics.values()]
        plt.plot(list(metrics.keys()), f1_scores, marker='o', label=method.upper())
    plt.xlabel('Modeļi')
    plt.ylabel('F1 mērs')
    plt.title('F1 mēra salīdzinājums')
    plt.legend(loc='best')
    f1_filename = os.path.join(output_dir, "f1_score_comparison.png")
    plt.savefig(f1_filename)
    plt.show()

    # The same comparison for MCC and Cohen's kappa, one figure each.
    for metric_name, ylabel in zip(['mcc', 'kappa'], ['MKK', 'Cohen\'s Kappa']):
        plt.figure(figsize=(10, 6))
        for method, metrics in metrics_dict.items():
            scores = [metric[metric_name] for metric in metrics.values()]
            plt.plot(list(metrics.keys()), scores, marker='o', label=method)
        plt.xlabel('Modeļi')
        plt.ylabel(ylabel)
        plt.title(f'{ylabel} salīdzinājums')
        plt.legend(loc='best')
        filename = os.path.join(output_dir, f"{metric_name}_comparison.png")
        plt.savefig(filename)
        plt.show()
        print(f"{ylabel} comparison plot saved to: {filename}")

# Evaluation results of every run, keyed by a human-readable run label.
metrics_dict = {}

# Loop-invariant settings shared by all runs.
target_column = 'fraud'
reduction_methods = ['UMAP', 't-SNE', 'PCA']

# The trailing comma matters: ('B') is the plain string 'B', which would be
# iterated character by character. Use ('A', 'B') to process both data sets.
for setName in ('B',):
    df = load_dataframe_from_disk('dataSet_'+setName+'.pkl')
    for entry in attributes:
        # Attribute sets of these types are not evaluated here.
        if entry['type'] in ('aggregated', 'relieff', 'cart'):
            continue
        # 'o' = oversample with SMOTE, 'u' = undersample, '' = data as-is.
        for sampling in ('o', 'u', ''):
            # Fresh column list on every pass: the label column first,
            # followed by the attribute set's columns.
            column_list = ['fraud', *entry['value']]
            source_df = create_dataframe_from_columns(df, column_list)
            if sampling == 'o':
                classifier_df = oversample_with_smote(source_df, 'fraud')
            elif sampling == 'u':
                classifier_df = undersample_data(source_df, 'fraud', desired_ratio=1)
            else:
                classifier_df = source_df

            for method in reduction_methods:
                # NOTE(review): resampling and dimensionality reduction both
                # run before train_test_split, so the held-out rows take part
                # in those steps — confirm this is intended.
                X_reduced, y = preprocess_data(classifier_df, target_column, reduction_method=method)

                X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=42)

                models = train_models(X_train, y_train)

                run_id = f"Datu kopa: {setName}{sampling}, atribūtu kopa: {entry['name']}, {method}"

                metrics = evaluate_models(models, X_test, y_test)
                metrics_dict[run_id] = metrics

plot_metrics(metrics_dict, 'classifiers')