In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from dm_lib import read_sql_complete, column_str, attributes, create_dataframe_from_columns, undersample_data, oversample_with_smote
from dm_lib import load_dataframe_from_disk
from dm_lib import tranSet_A, tranSet_B, tranSet_C, tranSet_D

def apply_pca(df, title, n_components=2, save_path=None):
    """Run PCA on the features of *df* and scatter-plot the first two components.

    Every column of ``df`` except ``'fraud'`` is treated as a feature. The
    features are standardized with ``StandardScaler`` before PCA is fitted.
    Rows whose ``'fraud'`` value is 0 are drawn in blue and rows whose value
    is 1 in orange; rows with any other label are not drawn.

    Args:
        df: DataFrame containing a ``'fraud'`` label column plus the feature
            columns.
        title: Title placed above the plot.
        n_components: Number of principal components to compute. Must be at
            least 2 because the plot always shows components 1 and 2.
        save_path: If truthy, the figure is written to this path and closed;
            otherwise it is displayed with ``plt.show()``.

    Raises:
        ValueError: If ``n_components`` is smaller than 2.
    """
    if n_components < 2:
        raise ValueError(
            f'n_components must be at least 2 to draw a 2-D plot, got {n_components}'
        )

    features = df.drop('fraud', axis=1)
    labels = df['fraud']

    scaled_features = StandardScaler().fit_transform(features)
    principal_components = PCA(n_components=n_components).fit_transform(scaled_features)

    component_names = [f'Galvenā komponente {i+1}' for i in range(n_components)]
    principal_df = pd.DataFrame(data=principal_components, columns=component_names)
    # Assign labels by position. principal_df has a fresh 0..n-1 index, so
    # assigning the Series directly would align on df's index and yield NaN or
    # mismatched labels whenever df's index is not the default RangeIndex.
    principal_df['fraud'] = labels.to_numpy()

    x_name, y_name = component_names[0], component_names[1]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_xlabel(x_name, fontsize=15)
    ax.set_ylabel(y_name, fontsize=15)
    ax.set_title(title, fontsize=20)

    # (label value, legend text, marker color) for each plotted class.
    classes = (
        (0, 'Īsta transakcija', 'blue'),
        (1, 'Krāpnieciska transakcija', 'orange'),
    )
    for value, legend_text, color in classes:
        mask = principal_df['fraud'] == value
        ax.scatter(principal_df.loc[mask, x_name],
                   principal_df.loc[mask, y_name],
                   color=color, label=legend_text, s=50)

    ax.legend()
    ax.grid()

    if save_path:
        fig.savefig(save_path)
        plt.close(fig)  # Close the figure to free up memory
    else:
        plt.show()

import os

# Plots are written below PCA/; create it up front so savefig does not fail
# on a missing directory.
os.makedirs('PCA', exist_ok=True)

# NOTE: the trailing comma matters. ('B') is just the string 'B', which only
# iterates correctly because the name is a single character.
for setName in ('B',):  # 'A', 'B'
    df = load_dataframe_from_disk('dataSet_'+setName+'.pkl')
    for entry in attributes:
        # Only attribute sets of these two types are plotted.
        if entry['type'] not in ('relieff', 'cart'):
            continue
        # The label column goes first, followed by the selected attributes.
        column_list = ['fraud', *entry['value']]
        # Suffix: 'o' = SMOTE oversampling, 'u' = undersampling, '' = data as is.
        # (Named `sampling` rather than `type` to avoid shadowing the builtin.)
        for sampling in ('o', 'u', ''):
            # Rebuilt for every variant in case a sampling helper modifies
            # its input frame.
            source_df = create_dataframe_from_columns(df, column_list)
            if sampling == 'o':
                pca_df = oversample_with_smote(source_df, 'fraud')
            elif sampling == 'u':
                pca_df = undersample_data(source_df, 'fraud', desired_ratio=1)
            else:
                pca_df = source_df
            fileName = 'PCA/PCA_'+entry['name']+'_set_'+setName+sampling+'.png'
            title = 'Datu kopa: '+setName+sampling+', atribūtu kopa:'+entry['name']
            apply_pca(pca_df, title, save_path=fileName)
