In [1]:
# 🛠️ Data Preprocessing & EDA - B5W8 Fraud Detection

# This notebook covers Task 1 of the B5W8 challenge, focusing on:
# - Data loading and cleaning
# - Feature engineering
# - Exploratory Data Analysis (EDA)
# - Class imbalance handling strategy


# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import ipaddress

# Set display options
# None removes the column limit, so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)
# set_theme() is seaborn's preferred interface; sns.set() is only an alias for it.
sns.set_theme(style='whitegrid')


In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import ipaddress

# Set display options
# None removes the column limit, so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)
# set_theme() is seaborn's preferred interface; sns.set() is only an alias for it.
sns.set_theme(style='whitegrid')


# 📁 Load Data
# Read the three raw CSV inputs, in order: the fraud transactions, the
# IP-range-to-country table, and the credit-card transactions.
fraud_df, ip_df, creditcard_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Fraud_Data.csv',
        '../data/IpAddress_to_Country.csv',
        '../data/creditcard.csv',
    )
)

# Preview the first rows of the fraud dataset as the cell output.
fraud_df.head()


NameError: name 'pd' is not defined

In [None]:
# 🔍 Data Cleaning & Preprocessing

# Print the per-column count of missing values for each dataset.
null_reports = (
    ("Fraud Dataset:\n", fraud_df),
    ("\nIP Dataset:\n", ip_df),
    ("\nCreditCard Dataset:\n", creditcard_df),
)
for heading, frame in null_reports:
    print(heading, frame.isnull().sum())

# Remove fully duplicated rows from the two transaction tables.
# The frames are modified in place, so the existing names keep pointing at them.
for frame in (fraud_df, creditcard_df):
    frame.drop_duplicates(inplace=True)


In [None]:
# 🌍 Convert IP to Country

# Convert IP address to integer
def _ip_to_int(value):
    """Return ``value`` as an integer IPv4 address.

    ``ipaddress.IPv4Address`` accepts dotted-quad strings, Python ints and
    packed bytes only; any other object is stringified and rejected. Floats
    and NumPy scalars are therefore converted to a plain ``int`` first.

    NOTE(review): a float's fractional part is truncated. This presumes the
    fraction is an export artefact and not meaningful -- confirm against the
    raw ``ip_address`` column.

    Raises
    ------
    ValueError
        If ``value`` is NaN or not a valid IPv4 address
        (``ipaddress.AddressValueError`` is a ``ValueError`` subclass).
    """
    if isinstance(value, (float, np.integer, np.floating)):
        value = int(value)
    # IPv4Address also validates that the integer lies in the 32-bit range.
    return int(ipaddress.IPv4Address(value))


fraud_df['ip_integer'] = fraud_df['ip_address'].apply(_ip_to_int)

# Merge by checking if ip_integer is between lower and upper bounds
def find_country(ip_int, ip_data):
    """Return the country whose IP range contains ``ip_int``.

    Parameters
    ----------
    ip_int
        IP address expressed as a number.
    ip_data : pandas.DataFrame
        Range table with ``lower_bound_ip_address``,
        ``upper_bound_ip_address`` and ``country`` columns.

    Returns
    -------
    The ``country`` value of the first row (in table order) whose inclusive
    bounds contain ``ip_int``, or ``'Unknown'`` when no row matches.
    """
    at_or_above_lower = ip_data['lower_bound_ip_address'] <= ip_int
    at_or_below_upper = ip_data['upper_bound_ip_address'] >= ip_int
    matches = ip_data[at_or_above_lower & at_or_below_upper]

    if matches.empty:
        return 'Unknown'
    return matches['country'].values[0]

# Resolve every transaction's country in one vectorised pass instead of
# scanning the whole range table once per row through find_country.
# NOTE(review): assumes the ranges in ip_df do not overlap -- confirm. Under
# that assumption the result matches find_country; with overlapping ranges
# find_country returns the first match in table order, whereas this picks the
# matching candidate with the largest lower bound.
ip_ranges = ip_df.sort_values('lower_bound_ip_address', kind='mergesort')
range_lower = ip_ranges['lower_bound_ip_address'].to_numpy()
range_upper = ip_ranges['upper_bound_ip_address'].to_numpy()
range_country = ip_ranges['country'].to_numpy(dtype=object)

ip_values = fraud_df['ip_integer'].to_numpy()
# Default for addresses that fall inside no range (and for an empty table).
country = np.full(len(ip_values), 'Unknown', dtype=object)
if len(range_lower):
    # Position of the last range whose lower bound is <= the address (-1 if none).
    candidate = np.searchsorted(range_lower, ip_values, side='right') - 1
    # Clip so the -1 entries can still be used as indices; they are masked out below.
    safe = np.clip(candidate, 0, None)
    inside = (candidate >= 0) & (ip_values <= range_upper[safe])
    country[inside] = range_country[safe[inside]]

fraud_df['country'] = country
fraud_df[['ip_address', 'country']].head()


In [None]:
# 🧠 Feature Engineering

# Parse both timestamp columns into pandas datetimes.
for time_col in ('signup_time', 'purchase_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col])

purchase_ts = fraud_df['purchase_time']

# Feature: seconds elapsed between signup and purchase.
signup_to_purchase = purchase_ts - fraud_df['signup_time']
fraud_df['time_since_signup'] = signup_to_purchase.dt.total_seconds()

# Features: hour of the purchase and its weekday.
fraud_df['hour_of_day'] = purchase_ts.dt.hour
fraud_df['day_of_week'] = purchase_ts.dt.dayofweek  # Monday=0 ... Sunday=6

# Preview the engineered columns as the cell output.
fraud_df[['time_since_signup', 'hour_of_day', 'day_of_week']].head()


In [None]:
# 📊 Exploratory Data Analysis (EDA)

# Bar chart: number of transactions in each class.
fig, ax = plt.subplots()
sns.countplot(data=fraud_df, x='class', ax=ax)
ax.set_title('Fraud Class Distribution')
plt.show()

# Step histogram of age with a KDE overlay, one series per class.
fig, ax = plt.subplots()
sns.histplot(data=fraud_df, x='age', hue='class', kde=True, element='step', ax=ax)
ax.set_title('Age Distribution by Fraud Class')
plt.show()


In [None]:
# ⚖️ Class Imbalance Strategy

# Share of each class as a fraction of all rows (normalize=True), shown as a
# percentage below.
fraud_distribution = fraud_df['class'].value_counts(normalize=True)
print('Fraud Class Distribution (%):')
print(fraud_distribution * 100)

# Commentary
# The message is a plain triple-quoted string: a backslash in front of the
# quotes would sit outside any string literal and make the cell a SyntaxError.
print("""We observe a strong class imbalance in the fraud dataset.
We'll explore SMOTE (oversampling) and random undersampling in the modeling phase.
Resampling will be applied to training data only to avoid data leakage.""")
