In [None]:
import pandas as pd
import numpy as np
from skrebate import ReliefF
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from dm_lib import load_dataframe_from_disk
from dm_lib import column_str, attributes, create_dataframe_from_columns, undersample_data, oversample_with_smote
from IPython.display import display, HTML

def apply_relief(df, target='fraud', n_neighbors=100, n_features_to_select=10):
    """Score features with ReliefF and keep the highest-scoring ones.

    All columns except ``target`` are standardised, split 70/30 with a fixed
    seed, and ReliefF is fitted on the training part only. The resulting
    importances are used to rank every feature.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Name of the label column.
        n_neighbors: Passed through to ``ReliefF``.
        n_features_to_select: How many of the best-ranked features to keep.
            ``0`` yields an empty selection; values larger than the number of
            features keep all of them.

    Returns:
        A tuple ``(transformed_df, features_selected, feature_scores_df)``:

        * ``transformed_df`` - scaled values of the selected columns. Rows are
          the training split followed by the test split, with a fresh default
          index, so they do not line up with ``df``'s rows.
        * ``features_selected`` - selected column names, best first.
        * ``feature_scores_df`` - every feature with its importance, sorted by
          importance in descending order.

    Raises:
        ValueError: If ``n_features_to_select`` is negative.
    """
    # A negative count would silently turn into an unrelated slice below.
    if n_features_to_select < 0:
        raise ValueError(
            f"n_features_to_select must be >= 0, got {n_features_to_select}"
        )

    X = df.drop(target, axis=1)
    y = df[target]

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # held-out rows contribute to the scaling statistics. Kept as-is so that
    # existing results do not shift.
    X_scaled = StandardScaler().fit_transform(X)

    X_train, X_test, y_train, _ = train_test_split(
        X_scaled, y, test_size=0.3, random_state=42
    )

    fs = ReliefF(n_neighbors=n_neighbors)
    fs.fit(X_train, y_train.values)

    # One stable, descending ordering drives both the score table and the
    # selection, so the first rows of the table are always exactly the
    # selected features (ties keep their original column order).
    feature_scores_df = pd.DataFrame(
        {'Feature': X.columns.tolist(), 'Importance': fs.feature_importances_}
    ).sort_values(by='Importance', ascending=False, kind='mergesort')

    top_rows = feature_scores_df.iloc[:n_features_to_select]
    # The table was built with a default integer index, so after sorting the
    # index labels are the positional column indices into the scaled arrays.
    top_features_indices = top_rows.index.to_numpy()
    features_selected = top_rows['Feature'].tolist()

    transformed_df = pd.DataFrame(
        np.vstack(
            (X_train[:, top_features_indices], X_test[:, top_features_indices])
        ),
        columns=features_selected,
    )

    return transformed_df, features_selected, feature_scores_df

# Run ReliefF on each data set for three sampling variants and show the
# resulting feature ranking.
# The trailing comma matters: ('B') is just the string 'B', which would be
# iterated character by character once a longer name is used.
for setName in ('B',):  # 'A', 'B'
    df = load_dataframe_from_disk('dataSet_' + setName + '.pkl')
    for entry in attributes:
        # Only the attribute group named 'all' is evaluated here.
        if entry['name'] != 'all':
            continue

        # Copy before appending so the shared `attributes` entry is not mutated.
        column_list = entry['value'][:]
        column_list.append('fraud')
        source_df = create_dataframe_from_columns(df, column_list)

        # 'u' = undersampled, 'o' = oversampled, '' = data as loaded.
        # (`sampling` replaces the former loop variable `type`, which
        # shadowed the builtin for the rest of the notebook session.)
        for sampling in ('u', 'o', ''):
            if sampling == 'o':
                relief_df = oversample_with_smote(source_df, 'fraud')
            elif sampling == 'u':
                relief_df = undersample_data(source_df, 'fraud', desired_ratio=1)
            else:
                relief_df = source_df

            selected_data, selected_features, feature_scores_df = apply_relief(relief_df)

            # Label the output with data set + sampling variant, e.g. 'Bu'.
            print(setName + sampling)
            print(selected_features)
            display(HTML(feature_scores_df.to_html()))



In [None]:
from collections import Counter

# Ten-feature lists per data set (A/B) and sampling variant.
# NOTE(review): presumably pasted from the output of the ReliefF cell - confirm
# they are still current before relying on the summary below.
A = ['pan_from_chip', 'pin_present', 'tran_declined', 'able_to_enter_pin', 'first_mcc_tran', 'time_since_prev_mcc_tran', 'sepa_country', 'nochip_count_01d', 'previous_tran_declined', 'mcc_count_60m']
B = ['pan_from_chip', 'able_to_enter_pin', 'pin_present', 'sepa_country', 'first_mcc_tran', 'time_since_prev_mcc_tran', 'tran_at_night', 'tran_declined', 'same_country', 'first_merchant_tran']

# undersampled
uA = ['pan_from_chip', 'pin_present', 'tran_declined', 'first_nochip_tran', 'first_tran', 'time_since_prev_tran', 'previous_tran_declined', 'nochip_count_01d', 'all_count_30d', 'able_to_enter_pin']
uB = ['pan_from_chip', 'pin_present', 'able_to_enter_pin', 'tran_declined', 'sepa_country', 'nochip_count_30d', 'tran_at_night', 'same_merchant', 'nochip_tran_avg_60d', 'nochip_count_60d']
# Features present in exactly one of the two lists. Sorted because set
# iteration order for strings differs between interpreter runs, which made
# the printed output irreproducible.
values_not_commonU = sorted(set(uA) ^ set(uB))
print(values_not_commonU)

# oversampled
oA = ['pan_from_chip', 'pin_present', 'tran_declined', 'all_count_60d', 'all_count_30d', 'able_to_enter_pin', 'previous_tran_declined', 'nochip_count_30d', 'nochip_count_60d', 'amount']
oB = ['pan_from_chip', 'sepa_country', 'pin_present', 'tran_declined', 'able_to_enter_pin', 'nochip_count_60d', 'nochip_count_30d', 'all_count_60d', 'previous_tran_declined', 'all_count_30d']
values_not_commonO = sorted(set(oA) ^ set(oB))
print(values_not_commonO)

# Every feature that appears in at least one of the six lists.
combined_list = A + B + uA + uB + oA + oB
unique_values = set(combined_list)
print(sorted(unique_values))  # sorted for a stable, readable listing
print(len(unique_values))

# How many of the six lists each feature appears in, most frequent first;
# features with equal counts keep their first-seen order.
value_counts = Counter(combined_list)
sorted_values = value_counts.most_common()

for value, count in sorted_values:
    print(f"{value}: {count}")