In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
# NOTE(review): this silences every warning raised after this point, so
# diagnostics such as pandas chained-assignment warnings will not be shown.
warnings.filterwarnings("ignore")

from category_encoders.woe import WOEEncoder
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split  
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold


import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the transaction and identity tables for the training split.
train_tr = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
train_id = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')

# Load the same two tables for the test split.
test_tr = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')
test_id = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')

# Left-join on TransactionID so every transaction row is kept even when it has
# no matching identity row.
train = pd.merge(train_tr, train_id, how='left', on='TransactionID')
test = pd.merge(test_tr, test_id, how='left', on='TransactionID')

# Data Cleaning

In [None]:
# Show the column index of each merged frame so naming differences are visible.
print("Train Data Columns:", train.columns, sep="\n")
print("\nTest Data Columns:", test.columns, sep="\n")

In [None]:
# The column listing above shows the two frames disagree on the "id-" / "id_"
# prefix; rewrite a leading "id-" in the test columns so both use "id_".
test.columns = test.columns.str.replace(pat=r'^id-', repl='id_', regex=True)

# Feature Engineering

In [None]:
def _add_engineered_features(frame):
    """Add the engineered columns to ``frame`` in place.

    Columns are added in this order: ``userId``, ``TransactionMinute``,
    ``TransactionHour``, ``TransactionDay``, ``TransactionWD``,
    ``same_email``, ``same_card``, ``Tr_log``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``card1``, ``P_emaildomain``, ``R_emaildomain``,
        ``addr1``, ``TransactionDT`` and ``TransactionAmt``.

    Returns
    -------
    None
        ``frame`` is modified in place.
    """
    # Concatenated string key; missing parts are stringified by astype(str).
    frame['userId'] = (
        frame['card1'].astype(str)
        + frame['P_emaildomain'].astype(str)
        + frame['addr1'].astype(str)
    )

    # NOTE(review): the 60 / 3600 / 24*3600 divisors treat TransactionDT as a
    # count of seconds -- confirm against the dataset description.
    dt = frame['TransactionDT']
    # Vectorised floor-division instead of a per-row apply(lambda ...).
    frame['TransactionMinute'] = (dt // 60) % 60
    frame['TransactionHour'] = (dt // 3600) % 24
    frame['TransactionDay'] = dt // (24 * 3600)
    frame['TransactionWD'] = (dt // (24 * 3600)) % 7

    # Element-wise equality flags stored as 0/1 (a missing value never compares equal).
    frame['same_email'] = (frame['P_emaildomain'] == frame['R_emaildomain']).astype(int)
    # NOTE(review): this compares card1 with addr1 -- confirm that is the intended pair.
    frame['same_card'] = (frame['card1'] == frame['addr1']).astype(int)

    frame['Tr_log'] = np.log1p(frame['TransactionAmt'])


# Apply exactly the same feature definitions to both frames.
for _frame in (train, test):
    _add_engineered_features(_frame)

In [None]:
# The label is kept on its own; the features are every other column except the row id.
y = train['isFraud']
X = train.drop(columns=['TransactionID', 'isFraud'])

# Drop only the id column from the test frame.
test = test.drop(columns='TransactionID')


def high_missing_list(data, threshold):
    """Return the columns of ``data`` whose null fraction is at least ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
    threshold : float
        Minimum fraction of null values (inclusive) for a column to be listed.

    Returns
    -------
    list
        Column labels, in the frame's column order.
    """
    null_fraction = data.isna().mean()
    return [name for name, fraction in null_fraction.items() if fraction >= threshold]


# Split the feature matrix X (not the raw `train` frame) into training and
# validation sets, so the label and the row id never end up inside X_tr / X_val.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Drop the columns in which at least 90% of the values are missing. The ratio is
# measured on the training fold only, so validation rows do not influence which
# features are kept; the same columns are then removed from every frame.
high_missing_cols = high_missing_list(X_tr, threshold=0.9)
X_tr = X_tr.drop(columns=high_missing_cols)
X_val = X_val.drop(columns=high_missing_cols)
test = test.drop(columns=high_missing_cols)

In [None]:
from collections import Counter
# The isFraud label is imbalanced; show the class shares as a pie chart.
c = Counter(y)
overall_mean = y.mean()
labels = ['Non-Fraud', 'Fraud']
sizes = [c[label] for label in (0, 1)]
colors = ['#66b3ff', '#ff69b4']

fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.2f%%', colors=colors, startangle=140)
ax.set_title('isFraud Imbalance')
ax.axis('equal')
plt.show()

In [None]:
# Columns treated as categorical: a hand-picked base list plus the
# card1-card6, M1-M9 and id_12-id_38 families.
cat_coluka = [
    'userId', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'DeviceInfo',
    'ProductCD', 'addr1', 'addr2',
]
cat_cards = [f'card{i}' for i in range(1, 7)]
cat_ms = [f'M{i}' for i in range(1, 10)]
cat_ids = [f'id_{i}' for i in range(12, 39)]
cat_cols = [*cat_coluka, *cat_cards, *cat_ms, *cat_ids]

print("Categorical Columns:", cat_cols, sep="\n")
print("\nCard Columns:", cat_cards, sep="\n")
print("\nM Columns:", cat_ms, sep="\n")
print("\nID Columns:", cat_ids, sep="\n")
print("\nAll Categorical Features:", cat_cols, len(cat_cols), sep="\n")


# Every feature column that is not in the categorical list counts as numerical.
num_cols = [col for col in X.columns if not (col in cat_cols or col == 'isFraud')]

print("\nNumerical Columns Amount:", len(num_cols), sep="\n")

In [None]:
# Report which columns were dropped for having too many missing values.
print("High Missing Values:", high_missing_cols, len(high_missing_cols), sep="\n")

def to_be_removed(cols ,num ,cat):
    """Split ``cols`` into the entries found in ``num`` and those found in ``cat``.

    Membership in ``num`` is checked first, so an entry present in both
    containers is reported only in the first list. Entries found in neither
    container are omitted.

    Returns
    -------
    tuple of (list, list)
        ``(entries in num, entries in cat)``, each in the order of ``cols``.
    """
    to_num, to_cat = [], []
    for name in cols:
        bucket = to_num if name in num else to_cat if name in cat else None
        if bucket is not None:
            bucket.append(name)
    return to_num, to_cat

# Work out which of the dropped columns were numerical and which categorical.
belongs_to_num, belongs_to_cat = to_be_removed(high_missing_cols, num_cols, cat_cols)

print("Numeric :", belongs_to_num, len(belongs_to_num), sep="\n")

print("Categorical :", belongs_to_cat, len(belongs_to_cat), sep="\n")

# Keep only the column names that survived the missing-value filter.
num_cols = [col for col in num_cols if col not in belongs_to_num]
cat_cols = [col for col in cat_cols if col not in belongs_to_cat]

print("Final Numerics: ", len(num_cols), sep="\n")
print("Final Categorical: ", len(cat_cols), sep="\n")

In [None]:
# Prints an empty line; this cell has no other effect.
print()

In [None]:
# Split each fold into its numerical and categorical column groups.
X_tr_num, X_tr_cat = X_tr[num_cols], X_tr[cat_cols]
X_val_num, X_val_cat = X_val[num_cols], X_val[cat_cols]

Getting Data Ready for Removal/Encoding

In [None]:
#Work on numerical features 
#Fill the missing values with medians
def fill_num(num_cols, X, medians=None):
    """Fill missing values in the listed columns of ``X`` with medians.

    Parameters
    ----------
    num_cols : iterable
        Labels of the columns to fill.
    X : pandas.DataFrame
        Frame to fill. It is modified in place: each listed column is
        reassigned with its filled version.
    medians : mapping or pandas.Series, optional
        Column label -> fill value. When given, these values are used instead
        of the medians of ``X`` itself, which lets a validation or test frame
        be filled with statistics computed on the training frame. When omitted,
        each column is filled with its own median.

    Returns
    -------
    pandas.DataFrame
        The same object as ``X``.

    Raises
    ------
    KeyError
        If a label in ``num_cols`` is missing from ``X`` or from ``medians``.
    """
    for c in num_cols:
        fill_value = X[c].median() if medians is None else medians[c]
        X[c] = X[c].fillna(fill_value)
    return X

# Take the medians from the training fold (before it is filled) and use them
# for both folds, so the validation set is imputed with training statistics
# rather than with its own.
train_medians = X_tr_num.median()
X_tr_num = fill_num(num_cols, X_tr_num)
# DataFrame.fillna with a Series fills each column with the value stored under
# that column's label.
X_val_num = X_val_num.fillna(train_medians)

In [None]:
# True if any missing value is left in either numerical frame.
has_null_01 = X_tr_num.isna().any(axis=None)
has_null_02 = X_val_num.isna().any(axis=None)
print(has_null_01 or has_null_02)

In [None]:
def remove_low_var(inp, threshold=0.01):
    """Drop the columns of ``inp`` that a fitted VarianceThreshold does not retain.

    Parameters
    ----------
    inp : pandas.DataFrame
        Frame the selector is fitted on.
    threshold : float, default 0.01
        Passed straight to ``VarianceThreshold``.

    Returns
    -------
    tuple
        ``(frame with the retained columns, Index of the removed column labels)``.
    """
    selector = VarianceThreshold(threshold=threshold).fit(inp)
    keep_mask = selector.get_support()
    all_columns = inp.columns
    return inp[all_columns[keep_mask]], all_columns[~keep_mask]

# Choose the columns on the training frame, then keep the same ones in validation.
X_tr_num, deleted_num_features = remove_low_var(X_tr_num)
X_val_num = X_val_num.loc[:, X_tr_num.columns]

In [None]:
# Display the column labels removed by the variance filter.
deleted_num_features

In [None]:
# Now handle categorical data,firstly transform numerical categorical in strings
def number_to_str(inp):
    """Convert every int64 / float64 column of ``inp`` to strings, in place.

    Columns of any other dtype are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same object as ``inp``.
    """
    numeric_kinds = ('int64', 'float64')
    for name, dtype in inp.dtypes.items():
        if dtype in numeric_kinds:
            inp[name] = inp[name].astype(str)
    return inp

# Stringify the numeric-coded categorical columns in both folds.
X_tr_cat, X_val_cat = number_to_str(X_tr_cat), number_to_str(X_val_cat)

In [None]:
def fill_cat(cat_cols, X):
    """Replace missing values in the listed columns of ``X`` with ``'NotAv'``.

    ``X`` is modified in place (each listed column is reassigned) and also
    returned, so callers may use either the return value or the original name.
    """
    for name in cat_cols:
        column = X[name]
        X[name] = column.fillna('NotAv')
    return X
    
# fill_cat returns the frame it was given, so rebinding keeps the same objects.
X_tr_cat = fill_cat(cat_cols, X_tr_cat)
X_val_cat = fill_cat(cat_cols, X_val_cat)

# True if any missing value is left in either categorical frame.
has_null_03 = X_tr_cat.isna().any(axis=None)
has_null_04 = X_val_cat.isna().any(axis=None)

print(has_null_03 or has_null_04)

Encoding Categorical Values

In [None]:
# Before encoding, split the object columns by how many distinct values they hold.
def categorize_cols(col, inp, threshold, list_bin, list_multi):
    """Append ``col`` to ``list_bin`` or ``list_multi`` based on its distinct-value count.

    Only columns whose dtype compares equal to ``'object'`` are considered;
    any other column is ignored. A column with at most ``threshold`` distinct
    values goes to ``list_bin``, otherwise to ``list_multi``. Both lists are
    mutated in place and nothing is returned.
    """
    series = inp[col]
    if not (series.dtype == 'object'):
        return
    target = list_bin if series.nunique() <= threshold else list_multi
    target.append(col)

def separate_input(inp, threshold=2):
    """Classify the columns of ``inp`` with ``categorize_cols``.

    Parameters
    ----------
    inp : pandas.DataFrame
    threshold : int, default 2
        Largest distinct-value count passed to ``categorize_cols`` as the
        cut-off for the first list.

    Returns
    -------
    tuple of (list, list)
        The two lists filled by ``categorize_cols``, in column order.
    """
    tr_binary, tr_multi = [], []
    for col in inp.columns:
        categorize_cols(col, inp, threshold, tr_binary, tr_multi)
    return tr_binary, tr_multi

# Create WOE encoder
def woe_trans(inp, dest, columns):
    """Fit a ``WOEEncoder`` on ``inp[columns]`` against the target ``dest``.

    Returns the fitted encoder; ``inp`` is only read.
    """
    woe_encoder = WOEEncoder(cols=columns)
    woe_encoder.fit(inp[columns], dest)
    return woe_encoder

def use_woe(ecdr, train, val, columns):
    """Transform ``columns`` of both frames with the already-fitted encoder.

    Parameters
    ----------
    ecdr : object with a ``transform(frame)`` method
    train, val : pandas.DataFrame
    columns : list
        Column labels selected from each frame before transforming.

    Returns
    -------
    tuple
        ``(transformed train columns, transformed val columns)``.
    """
    return ecdr.transform(train[columns]), ecdr.transform(val[columns])

def one_hot_enc(inp, columns):
    """One-hot encode ``columns`` of ``inp`` with ``pandas.get_dummies``.

    The first level of each encoded column is dropped and the dummy columns
    are stored as integers. A new frame is returned.
    """
    options = {'columns': columns, 'drop_first': True, 'dtype': int}
    return pd.get_dummies(inp, **options)

# --- Pipeline ---

# 1. Separate binary and multi-categorical features
binary_cols, multi_cols = separate_input(X_tr_cat)

# 2. Train WOE encoder on multi-categorical features (training fold only)
encoder = woe_trans(X_tr_cat, y_tr, multi_cols)

# 3. Apply WOE transformation
X_tr_cat = X_tr_cat.copy()
X_val_cat = X_val_cat.copy()

X_tr_woe, X_val_woe = use_woe(encoder, X_tr_cat, X_val_cat, multi_cols)

# 4. Replace multi-categorical columns with WOE-transformed values
X_tr_cat[multi_cols] = X_tr_woe
X_val_cat[multi_cols] = X_val_woe

# 5. Apply one-hot encoding to binary categorical columns.
#    get_dummies derives the dummy columns from the levels present in the frame
#    it is given, so encoding the two folds independently can produce different
#    columns (a level missing from, or only present in, the validation fold).
#    Pin both folds to the sorted training levels first: the dummy columns then
#    match, and a validation value unseen in training encodes as all zeros.
for _col in binary_cols:
    _levels = sorted(X_tr_cat[_col].unique())
    X_tr_cat[_col] = pd.Categorical(X_tr_cat[_col], categories=_levels)
    X_val_cat[_col] = pd.Categorical(X_val_cat[_col], categories=_levels)

X_tr_cat = one_hot_enc(X_tr_cat, binary_cols)
X_val_cat = one_hot_enc(X_val_cat, binary_cols)

print("MORHCA")

In [None]:
def return_high_corr(X, threshold = 0.85):
    """Return the columns to drop so that no highly correlated pair remains.

    For every pair of columns whose absolute correlation (``X.corr()``) is at
    least ``threshold``, only the column that appears later in ``X`` is
    reported; the earlier one is kept. Reporting both members of a pair would
    remove the shared signal from the data altogether.

    Parameters
    ----------
    X : pandas.DataFrame
    threshold : float, default 0.85
        Inclusive cut-off on the absolute correlation.

    Returns
    -------
    list
        Column labels, in the column order of the correlation matrix.
    """
    corr_checker = X.corr().abs()
    # Keep only the entries strictly above the diagonal: each pair is then
    # looked at once, and a column is only ever compared with earlier columns.
    above_diagonal = np.triu(np.ones(corr_checker.shape, dtype=bool), k=1)
    vs_earlier = corr_checker.where(above_diagonal)
    # Masked-out and undefined (NaN) correlations never satisfy the comparison.
    return [c for c in vs_earlier.columns if (vs_earlier[c] >= threshold).any()]


def remove_hc(X, l, h_c):
    """Drop the columns ``h_c`` from ``X`` and, when given, from ``l``.

    Parameters
    ----------
    X : pandas.DataFrame
    l : pandas.DataFrame or None
        Optional second frame to drop the same columns from.
    h_c : list
        Column labels to drop.

    Returns
    -------
    tuple
        ``(X without h_c, l without h_c or None)``; the inputs are not modified.
    """
    X_sup = X.drop(columns=h_c)
    l_removed = None if l is None else l.drop(columns=h_c)
    return X_sup, l_removed


def work_on_corr(X, l =None, threshold = 0.85):
    """Find the columns ``return_high_corr`` reports for ``X`` and drop them.

    The columns are chosen from ``X`` only and removed from ``X`` and, when
    given, from ``l`` via ``remove_hc``.

    Returns
    -------
    tuple
        ``(reduced X, reduced l or None)``.
    """
    X_removed, l_removed = remove_hc(X, l, return_high_corr(X, threshold))
    return X_removed, l_removed


# Columns to drop are chosen on the training frame; the same columns are then
# removed from the matching validation frame.
X_tr_num, X_val_num = work_on_corr(X=X_tr_num, l=X_val_num)
X_tr_cat, X_val_cat = work_on_corr(X=X_tr_cat, l=X_val_cat)

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE

# Put the numerical and the encoded categorical features back side by side.
X_tr_combined = pd.concat((X_tr_num, X_tr_cat), axis='columns')
X_val_combined = pd.concat((X_val_num, X_val_cat), axis='columns')

# Resample the training fold with SMOTE, then report the class counts before
# and after.
smote = SMOTE(random_state=42)
X_tr_smote, y_tr_smote = smote.fit_resample(X_tr_combined, y_tr)
print("Before SMOTE, balance:", dict(Counter(y_tr)))
print("After SMOTE balance:", dict(Counter(y_tr_smote)))