In [None]:
from dm_lib import load_dataframe_from_disk
from dm_lib import column_str, attributes, create_dataframe_from_columns, undersample_data, oversample_with_smote

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from sklearn.preprocessing import LabelEncoder
import pydotplus
from IPython.display import Image
import matplotlib.pyplot as plt
import os

def evaluate_decision_tree(df, target_column='fraud', train_depth=10, plot_depth=2, output_file='tree.png', roc_output_file='roc_auc.png', test_size=0.3, random_state=42):
    """Fit a decision tree on ``df``, render it, and report ROC-AUC and feature importances.

    Args:
        df: DataFrame holding the feature columns plus ``target_column``.
        target_column: Name of the label column that is split off as ``y``.
        train_depth: ``max_depth`` used when fitting the classifier.
        plot_depth: ``max_depth`` of the rendered tree image only.
        output_file: Path of the PNG the tree graph is written to.
        roc_output_file: Path of the PNG the ROC plot is written to.
        test_size: Fraction of rows held out for the ROC evaluation.
        random_state: Seed for both the train/test split and the classifier,
            so that repeated runs produce the same tree and the same AUC.

    Returns:
        Tuple ``(roc_auc, importance)``: the area under the ROC curve on the
        held-out rows, and a DataFrame with columns 'Atribūts' and
        'Svarīgums' sorted by descending importance.
    """
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Replace string-typed columns with integer codes before fitting.
    for col in X.columns:
        if X[col].dtype == 'object':
            X[col] = LabelEncoder().fit_transform(X[col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Seed the classifier as well: without it, tie-breaking between equally
    # good splits can differ from run to run.
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=train_depth, random_state=random_state)
    clf.fit(X_train, y_train)

    # Make sure the target folders exist before anything is written to them.
    for path in (output_file, roc_output_file):
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder, exist_ok=True)

    dot_data = export_graphviz(clf, out_file=None,
                               # A plain list: some scikit-learn releases reject a pandas Index here.
                               feature_names=list(X.columns),
                               class_names=['Īsta transakcija', 'Krāpnieciska transakcija'],
                               filled=True, rounded=True,
                               special_characters=True,
                               max_depth=plot_depth)

    graph = pydotplus.graph_from_dot_data(dot_data)
    image = Image(graph.create_png())
    display(image)
    graph.write_png(output_file)

    importance = pd.DataFrame({'Atribūts': X.columns, 'Svarīgums': clf.feature_importances_}).sort_values(by='Svarīgums', ascending=False)
    print("Atribūtu svarīgums:")
    print(importance)

    # Column 1 of predict_proba is taken as the positive-class score
    # (assumes a binary target whose positive label sorts second — TODO confirm).
    y_scores = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)

    fig = plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC līkne (laukums = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Kļūdainas atbilsmes koeficients')
    plt.ylabel('Patiesas atbilsmes koeficients')
    plt.title('ROC')
    plt.legend(loc="lower right")
    plt.savefig(roc_output_file)
    plt.show()
    # Release the figure; this function is called in a loop and open figures pile up.
    plt.close(fig)

    return roc_auc, importance

# Folder that receives the tree and ROC images written by the loop below.
os.makedirs('CART', exist_ok=True)

# attribute name -> {run name -> importance}; turned into a DataFrame after the loop.
feature_importances_dict = {}

# One row per attribute set; one column per data-set variant (e.g. 'Bo', 'Bu', 'B').
roc_auc_df = pd.DataFrame(index=[fs['name'] for fs in attributes])

# The trailing comma matters: ('B') is just the string 'B', which would be
# iterated character by character once a longer name is used.
for setName in ('B',):  # 'A', 'B', 'C'
    df = load_dataframe_from_disk('dataSet_' + setName + '.pkl')
    for entry in attributes:
        if entry['type'] == 'cart':
            continue
        # Label column first, followed by a copy of this entry's columns.
        column_list = ['fraud', *entry['value']]
        source_df = create_dataframe_from_columns(df, column_list)
        # 'o' = SMOTE oversampling, 'u' = undersampling, '' = data as loaded.
        for variant in ('o', 'u', ''):
            column = setName + variant
            name = entry['name'] + '_set_' + column
            fileName = 'CART/CART_' + name + '_TREE.png'
            rocFileName = 'CART/CART_' + name + '_ROC.png'
            if variant == 'o':
                cart_df = oversample_with_smote(source_df, 'fraud')
            elif variant == 'u':
                cart_df = undersample_data(source_df, 'fraud', desired_ratio=1)
            else:
                cart_df = source_df
            roc_auc, feature_importances = evaluate_decision_tree(cart_df, output_file=fileName, roc_output_file=rocFileName)

            # NaN placeholders keep the column float-typed, so it sorts and
            # formats as numbers instead of falling back to object dtype.
            if column not in roc_auc_df.columns:
                roc_auc_df[column] = np.nan
            roc_auc_df.loc[entry['name'], column] = roc_auc

            for attribute, score in zip(feature_importances['Atribūts'], feature_importances['Svarīgums']):
                feature_importances_dict.setdefault(attribute, {})[name] = score

# Attributes that were not part of a given run get importance 0.
feature_importances_df = pd.DataFrame.from_dict(feature_importances_dict, orient='index').fillna(0)

print("Atribūtu svarīgums:")
print(feature_importances_df)
print("ROC-AUC rezultāti:")
print(roc_auc_df)


In [None]:
from IPython.display import display, HTML

def _sort_desc(frame, preferred_column):
    """Sort ``frame`` descending by ``preferred_column``, or by its first column if that one is absent.

    A frame without any columns is returned unchanged.
    """
    if preferred_column in frame.columns:
        by = preferred_column
    elif len(frame.columns):
        by = frame.columns[0]
    else:
        return frame
    return frame.sort_values(by=by, ascending=False)


# Show the ROC-AUC table ordered by the oversampled set-B results; the sorted
# frame is what gets rendered, not the unsorted original.
sorted_df = _sort_desc(roc_auc_df, 'Bo')
display(HTML(sorted_df.to_html()))


# Importance columns of the 'all' attribute set, without attributes that
# scored 0 in every run.
all_df = feature_importances_df.filter(regex='^all_set_')
all_no_0_df = all_df.loc[(all_df != 0).any(axis=1)]
# 'all_set_A' only exists when set A was processed; fall back to the first
# available column instead of raising KeyError.
sorted_all_df = _sort_desc(all_no_0_df, 'all_set_A')
#sorted_all_df.head(10)
#all_no_0_df.to_csv('all_df_scores.csv', index=False)

In [None]:
from collections import Counter

# Ten attribute names per data-set variant (presumably the highest-ranked
# ones from the runs above; suffix u / o as in the earlier cells — TODO confirm).
A = [
    'tran_declined', 'amount', 'pin_present', 'country_day_avg_60d',
    'declined_count_60d', 'all_count_30d', 'merchant_day_avg_07d',
    'mcc_tran_avg_01d', 'amount_diff', 'able_to_enter_pin',
]

Au = [
    'pin_present', 'tran_declined', 'amount', 'all_sum_60d', 'amount_diff',
    'country_count_60d', 'all_count_60d', 'time_since_prev_tran',
    'all_day_avg_01d', 'declined_count_60d',
]

Ao = [
    'tran_declined', 'pan_from_chip', 'amount', 'nochip_sum_60d',
    'amount_diff', 'all_count_60d', 'nochip_count_60d',
    'nochip_day_avg_01d', 'declined_count_60d', 'all_day_avg_30d',
]

B = [
    'pan_from_chip', 'tran_declined', 'amount', 'mcc_sum_01d',
    'high_risk_mcc', 'mcc_count_01d', 'mcc_day_avg_30d',
    'declined_count_60d', 'country_count_60d', 'nochip_day_avg_30d',
]

Bu = [
    'pan_from_chip', 'tran_declined', 'nochip_count_60d', 'mcc_sum_01d',
    'amount', 'nochip_sum_07d', 'all_sum_60d', 'country_count_30d',
    'merchant_tran_avg_07d', 'merchant_sum_07d',
]

Bo = [
    'pan_from_chip', 'tran_declined', 'amount', 'mcc_sum_01d',
    'all_count_60d', 'mcc_count_01d', 'cnp_count_01d',
    'time_since_prev_tran', 'all_day_avg_07d', 'all_count_30d',
]

# All six lists joined end to end, in the order A, B, Au, Bu, Ao, Bo.
combined_list = [*A, *B, *Au, *Bu, *Ao, *Bo]

# Distinct attribute names across all lists.
unique_values = set(combined_list)
print(unique_values)
print(len(unique_values))

# How many of the lists each attribute appears in, most frequent first;
# ties stay in first-seen order.
value_counts = Counter(combined_list)
sorted_values = value_counts.most_common()

for value, count in sorted_values:
    print(f"{value}: {count}")