In [1]:
import numpy as np
import pandas as pd
import json
import gc
import re
import os

# -------------------------------------------------------------------
# Input location: the dataset folder mounted under /kaggle/input.
# Point DATA_PATH at another folder if the dataset is attached under
# a different name.
# -------------------------------------------------------------------
DATA_PATH = "/kaggle/input/intelligent-bank/"


def _data_file(filename):
    """Return the full path of `filename` inside DATA_PATH."""
    return os.path.join(DATA_PATH, filename)


# --- CSV tables: transactions, cards, users ---
transaction_df = pd.read_csv(_data_file("transactions_data_south_africa.csv"))
card_df = pd.read_csv(_data_file("cards_data_south_africa.csv"))
users_df = pd.read_csv(_data_file("user_data_south_africa.csv"))

# --- MCC lookup: JSON object read as a Series, then flattened into a
# two-column frame (former index -> 'mcc_code', values -> 'description') ---
mcc_series = pd.read_json(_data_file("mcc_codes.json"), typ='series')
mcc_df = mcc_series.rename_axis('mcc_code').reset_index(name='description')

# --- Fraud labels: the mapping stored under the top-level 'target' key ---
file_path = _data_file('train_fraud_labels.json')
with open(file_path, 'r') as f:
    raw_json_data = json.load(f)
transaction_labels_dict = raw_json_data['target']

# One row per label entry; the JSON keys arrive as strings, so the id
# column is converted to a numeric dtype after the frame is built.
train_fraud_labels = (
    pd.Series(transaction_labels_dict)
    .rename_axis('transaction_id')
    .reset_index(name='is_fraud')
)
train_fraud_labels['transaction_id'] = pd.to_numeric(
    train_fraud_labels['transaction_id']
)

print("\nAll data files loaded successfully in Kaggle environment.")



All data files loaded successfully in Kaggle environment.


In [2]:
# 1. BASIC DATA INSPECTION
# Print the shape tuple of every loaded table, one line per table.
print("\n--- Shape of DataFrames ---")
for label, frame in (
    ("Transactions", transaction_df),
    ("Cards", card_df),
    ("Users", users_df),
    ("MCC Codes", mcc_df),
    ("Fraud Labels", train_fraud_labels),
):
    print(f"{label}:", frame.shape)


--- Shape of DataFrames ---
Transactions: (12858754, 12)
Cards: (6146, 13)
Users: (2000, 12)
MCC Codes: (109, 2)
Fraud Labels: (8914963, 2)


In [3]:
# Per-column count of missing values for the transactions, cards and
# users tables, printed in that order.
print("\n--- Missing Values Summary ---")
for frame in (transaction_df, card_df, users_df):
    print(frame.isna().sum())


--- Missing Values Summary ---
id                       0
date                     0
client_id                0
card_id                  0
amount                   0
use_chip                 0
merchant_id              0
merchant_city            0
merchant_state     1509523
zip                1509523
mcc                      1
errors            12654526
dtype: int64
id                       0
client_id                0
card_brand               0
card_type                0
card_number              0
expires                  0
cvv                      0
has_chip                 0
num_cards_issued         0
credit_limit             0
acct_open_date           0
year_pin_last_changed    0
card_on_dark_web         0
dtype: int64
id                   0
current_age          0
retirement_age       0
birth_year           0
birth_month          0
gender               0
address              0
per_capita_income    0
yearly_income        0
total_debt           0
credit_score         0
num_credit_car

In [4]:
# Preview samples: the leading rows of each loaded table, in load order.
print("\n--- Sample Rows ---")
for frame in (transaction_df, card_df, users_df, mcc_df, train_fraud_labels):
    print(frame.head())


--- Sample Rows ---
        id                 date  client_id  card_id   amount  \
0  7475327  2022-02-19 05:51:55       1556     2972 -1386.00   
1  7475328  2023-01-13 02:58:58        561     4575   262.26   
2  7475329  2024-07-03 23:41:24       1129      102  1440.00   
3  7475331  2022-06-12 09:33:41        430     2860  3600.00   
4  7475332  2023-08-26 10:05:48        848     3915   835.38   

            use_chip  merchant_id     merchant_city merchant_state     zip  \
0  Swipe Transaction        59935         Kimberley  Northern Cape  8300.0   
1  Swipe Transaction        67570  Pietermaritzburg  KwaZulu-Natal  3200.0   
2  Swipe Transaction        27092    Port Elizabeth   Eastern Cape  6000.0   
3  Swipe Transaction        27092      Bloemfontein     Free State  9300.0   
4  Swipe Transaction        13051         Polokwane        Limpopo   700.0   

      mcc errors  
0  5499.0    NaN  
1  5311.0    NaN  
2  4829.0    NaN  
3  4829.0    NaN  
4  5813.0    NaN  
     id  cl