In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import umap
from sklearn.preprocessing import StandardScaler
from dm_lib import load_dataframe_from_disk
from dm_lib import column_str, attributes, create_dataframe_from_columns, undersample_data, oversample_with_smote

def plot_umap(df: "pd.DataFrame", fraud_label: str, n_neighbors: int = 15,
              plot_title: str = "UMAP", filename=None) -> None:
    """Draw a UMAP scatter plot of ``df`` coloured by the ``fraud_label`` column.

    Every column except ``fraud_label`` is standardised with
    ``StandardScaler`` and passed to ``umap.UMAP``; the first two columns of
    the resulting embedding are plotted against each other.

    Args:
        df: Data frame holding the feature columns and the label column.
        fraud_label: Name of the label column. Rows labelled ``0`` and ``1``
            are drawn in the first and second colour of matplotlib's default
            colour cycle respectively.
        n_neighbors: Forwarded to ``umap.UMAP(n_neighbors=...)``.
        plot_title: Title drawn above the plot.
        filename: If truthy, the figure is saved there and closed; otherwise
            the figure is shown with ``plt.show()``.
    """
    features = df.drop(columns=[fraud_label])
    scaled_features = StandardScaler().fit_transform(features)

    reducer = umap.UMAP(n_neighbors=n_neighbors)
    embedding = reducer.fit_transform(scaled_features)

    # Reuse matplotlib's default colour cycle so the plot matches other figures.
    default_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    color_map = {0: default_colors[0], 1: default_colors[1]}
    colors = df[fraud_label].map(color_map)

    fig = plt.figure(figsize=(12, 8))
    plt.scatter(embedding[:, 0], embedding[:, 1], c=colors, edgecolor='k', alpha=0.7)
    plt.title(plot_title, fontsize=20)
    plt.xlabel('UMAP dimensija 1')
    plt.ylabel('UMAP dimensija 2')
    plt.grid(True)

    # The scatter is coloured point-by-point, so matplotlib cannot derive a
    # legend on its own; build one proxy marker per class instead.
    legend_labels = {0: 'Īsta transakcija', 1: 'Krāpnieciska transakcija'}
    plt.legend(handles=[
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=color_map[label], markersize=10, label=text)
        for label, text in legend_labels.items()
    ])

    if filename:
        plt.savefig(filename)
        # Close the figure so repeated calls in a loop do not pile up open figures.
        plt.close(fig)
    else:
        plt.show()

# Render UMAP plots for each data set, for the 'binary' attribute group, in
# three resampling variants and for several n_neighbors values.
#
# The trailing commas below are required: ('B') and ('binary') are plain
# strings, so iterating would walk characters and `in` would do a substring
# match (e.g. 'bin' in 'binary' is True) instead of an exact-name match.
for setName in ('B',):  # 'A', 'B'
    df = load_dataframe_from_disk('dataSet_' + setName + '.pkl')
    for entry in attributes:
        if entry['name'] not in ('binary',):
            continue
        # Label column first, followed by the attribute group's columns
        # (built as a new list so entry['value'] is never mutated).
        column_list = ['fraud', *entry['value']]
        # Resampling variant suffix: 'o' = SMOTE oversampling,
        # 'u' = undersampling, '' = data left as loaded.
        for variant in ('o', 'u', ''):
            # Rebuilt per variant in case the resampling helpers modify
            # their input frame in place.
            source_df = create_dataframe_from_columns(df, column_list)
            if variant == 'o':
                umap_df = oversample_with_smote(source_df, 'fraud')
            elif variant == 'u':
                umap_df = undersample_data(source_df, 'fraud', desired_ratio=1)
            else:
                umap_df = source_df
            for n in (15, 50, 100):
                f = f"UMAP/UMAP_{entry['name']}_n{n}_set_{setName}{variant}.png"
                title = (f"Datu kopa: {setName}{variant}, "
                         f"atribūtu kopa:{entry['name']}, n_neighbors:{n}")
                plot_umap(umap_df, 'fraud', n_neighbors=n, plot_title=title, filename=f)




