# Data Preprocessing

In [1]:
import pandas as pd

df = pd.read_csv("datasets/creditcard_cleaned.csv")

## Feature Selection

V10, V12, V14 and V17 have strong negative correlations with the "Class" label.

V4 and V11 have strong positive correlations with the "Class" label.

Therefore we will choose theses features for training our models.

In [2]:
features = ["V4", "V10", "V11", "V12", "V14", "V17"]

## Handling Outliers

In [3]:
print(f"DataFrame shape before dropping outliers: {df.shape}")
rows_before = df.shape[0]

for feature in features:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1

    # Define the lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out the outliers
    df = df[(df[feature] >= lower_bound) & (df[feature] <= upper_bound)]

print(f"DataFrame shape after dropping outliers: {df.shape}")
print(f"Rows Dropped: {rows_before - df.shape[0]}")

DataFrame shape before dropping outliers: (283726, 31)
DataFrame shape after dropping outliers: (237332, 31)
Rows Dropped: 46394


## Selecting Features and Target

In [4]:
x = df[features]
y = df["Class"]

## Handling Unbalanced Data

Use SMOTE Oversampling for the minority

In [5]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy="minority", random_state=42, k_neighbors=5)
x_res, y_res = smote.fit_resample(x, y)

### Original vs Resampled

In [6]:
print(f"Original class distribution: {y.value_counts()}")
print(f"Resampled class distribution: {y_res.value_counts()}")

Original class distribution: Class
0    237296
1        36
Name: count, dtype: int64
Resampled class distribution: Class
0    237296
1    237296
Name: count, dtype: int64


## Exporting Balanced Dataset

In [7]:
df_res = pd.concat([x_res, y_res], axis=1)

df_res.to_csv("datasets/creditcard_balanced.csv", index=False)