In [182]:
!pip install geopandas

Collecting geopandas
  Downloading geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.10.0-cp311-cp311-win_amd64.whl.metadata (5.6 kB)
Collecting pyproj>=3.3.0 (from geopandas)
  Downloading pyproj-3.7.0-cp311-cp311-win_amd64.whl.metadata (31 kB)
Collecting shapely>=2.0.0 (from geopandas)
  Downloading shapely-2.0.6-cp311-cp311-win_amd64.whl.metadata (7.2 kB)
Downloading geopandas-1.0.1-py3-none-any.whl (323 kB)
Downloading pyogrio-0.10.0-cp311-cp311-win_amd64.whl (16.2 MB)
   ---------------------------------------- 0.0/16.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/16.2 MB ? eta -:--:--
    --------------------------------------- 0.3/16.2 MB ? eta -:--:--
    --------------------------------------- 0.3/16.2 MB ? eta -:--:--
   - -------------------------------------- 0.5/16.2 MB 699.0 kB/s eta 0:00:23
   - -------------------------------------- 0.8/16.2 MB 781.4 kB/s eta 0:00:20
   - -----------

# Load & Prepare Dataset

In [None]:
import warnings

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Load the raw transaction records from the local data folder
data = pd.read_csv('./data/Fraud_Data.csv')

In [None]:
# (rows, columns) of the loaded frame
data.shape

In [None]:
# Preview the first rows of the raw data
data.head()

In [None]:
# List the column names available in the raw data
print(data.columns)

In [None]:
# Parse both timestamp columns into datetime values so the .dt accessor can be used later
for time_col in ('signup_time', 'purchase_time'):
    data[time_col] = pd.to_datetime(data[time_col])

In [None]:
# Remove the identifier columns; they are not used in the rest of this notebook
data = data.drop(['user_id', 'device_id'], axis='columns')

In [None]:
# Column dtypes and non-null counts of the current frame
data.info()

# Feature Engineering

In [None]:
# Calendar parts of the signup timestamp; weekday runs from 0 (Monday) to 6 (Sunday)
for part in ('month', 'day', 'hour', 'weekday'):
    data[f'signup_{part}'] = getattr(data['signup_time'].dt, part)

In [None]:
# Encode the signup hour as a point on a 24-hour circle so that 23:00 and 00:00 end up close together
signup_hour_angle = 2 * np.pi * data['signup_hour'] / 24
data['signup_hour_sin'] = np.sin(signup_hour_angle)
data['signup_hour_cos'] = np.cos(signup_hour_angle)

In [None]:
# Calendar parts of the purchase timestamp; weekday runs from 0 (Monday) to 6 (Sunday)
for part in ('month', 'day', 'hour', 'weekday'):
    data[f'purchase_{part}'] = getattr(data['purchase_time'].dt, part)

In [None]:
# Encode the purchase hour as a point on a 24-hour circle so that 23:00 and 00:00 end up close together
purchase_hour_angle = 2 * np.pi * data['purchase_hour'] / 24
data['purchase_hour_sin'] = np.sin(purchase_hour_angle)
data['purchase_hour_cos'] = np.cos(purchase_hour_angle)

In [None]:
# Seconds elapsed between signing up and making the purchase
signup_to_purchase = data['purchase_time'] - data['signup_time']
data['time_diff'] = signup_to_purchase.dt.total_seconds()

In [None]:
# The raw timestamps are no longer needed once the derived features exist
data = data.drop(['signup_time', 'purchase_time'], axis='columns')

# IP Addresses

In [None]:
# Lookup table of IP-address ranges (lower/upper bound) and the country each range belongs to
ip_map = pd.read_csv('./data/IpAddress_to_Country.csv')  

In [None]:
# Preview the first rows of the IP-range lookup table
ip_map.head()

In [None]:
# Column dtypes and non-null counts of the IP-range lookup table
ip_map.info()

In [None]:
def map_country(ip, ip_ranges=None):
    """Return the country whose IP range contains ``ip``.

    Parameters
    ----------
    ip : number
        Numeric IP address to look up.
    ip_ranges : pandas.DataFrame, optional
        Table with ``lower_bound_ip_address``, ``upper_bound_ip_address`` and
        ``country`` columns. Defaults to the notebook-level ``ip_map`` frame,
        so existing single-argument calls behave as before.

    Returns
    -------
    str
        Country of the first range (in table order) with
        ``lower_bound_ip_address <= ip <= upper_bound_ip_address``, or
        ``'Unknown'`` when no range contains ``ip``.
    """
    if ip_ranges is None:
        ip_ranges = ip_map

    # Both bounds are inclusive.
    in_range = (ip_ranges['lower_bound_ip_address'] <= ip) & (ip_ranges['upper_bound_ip_address'] >= ip)

    # Only the country column is needed, so avoid materialising whole matching rows.
    matches = ip_ranges.loc[in_range, 'country']
    if matches.empty:
        return 'Unknown'
    return matches.iloc[0]

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Look up the country for every transaction's IP address.
# map_country filters the whole ip_map table on each call, so this cell can be slow on large data.
data['country'] = data['ip_address'].apply(map_country)

In [None]:
# Transactions per mapped country ('Unknown' means no IP range matched)
data['country'].value_counts()

# EDA (Part 1)

In [None]:
# EDA: number of missing values in each column
missing_per_column = data.isna().sum()
print("Missing values:\n", missing_per_column)

In [None]:
# Summary statistics of the current frame
data.describe()

In [None]:
# Numeric features used for the distribution plots, outlier filtering and scaling
numerical_cols = [
    # transaction-level values
    'purchase_value',
    'age',
    'time_diff',
    # signup date parts
    'signup_month',
    'signup_day',
    'signup_weekday',
    # purchase date parts
    'purchase_month',
    'purchase_day',
    'purchase_weekday',
]

In [None]:
# One histogram (with a KDE overlay) per numeric feature
for feature in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(data[feature], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Labelled histogram of 'purchase_value' (same data as the loop above, with explicit axis labels)
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['purchase_value'], bins=30, kde=True, ax=ax)
ax.set_title("Purchase Value Distribution")
ax.set_xlabel("Purchase Value")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Categorical features, plus one count plot for each of them
categorical_cols = ['source', 'browser', 'sex']
for feature in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(x=data[feature], ax=ax)
    ax.set_title(f'Count of {feature}')
    plt.show()

In [None]:
# One boxplot per numeric feature to make outliers visible
for feature in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=data[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    plt.show()

In [None]:
# Count plots for the categorical features
# NOTE(review): this repeats the count plots drawn by an earlier cell — consider removing one of them.
categorical_cols = ['source', 'browser', 'sex']
for feature in categorical_cols:
    plt.figure(figsize=(8, 5))
    sns.countplot(data=data, x=feature)
    plt.title(f'Count of {feature}')
    plt.show()

In [None]:
# Correlation matrix over every numeric column.
# 'number' matches all numeric widths, so integer features are kept whether pandas
# stores them as int32 or int64 (listing only 'float64'/'int64' would drop int32 columns).
numeric_df = data.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

In [None]:
# Annotated heatmap of the numeric correlation matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Age against purchase value, coloured by the 'class' label
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=data, x='age', y='purchase_value', hue='class', ax=ax)
ax.set_title('Age vs Purchase Value by Class')
plt.show()

In [None]:
# Transactions per signup month, split by the 'class' label (only drawn when the column exists)
if 'signup_month' in data:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=data, x='signup_month', hue='class', ax=ax)
    ax.set_title("Fraud Cases by Signup Month")
    ax.set_xlabel("Signup Month")
    ax.set_ylabel("Count")
    plt.show()

In [None]:
# Number of transactions per country, computed on the un-encoded 'country' column
country_frequency = data['country'].value_counts()

In [None]:
# Minimum number of transactions a country needs to appear in the chart
threshold = 100

# Keep only the countries strictly above the threshold
frequent_countries = country_frequency.loc[country_frequency > threshold]

# Bar chart of the remaining countries
fig, ax = plt.subplots(figsize=(12, 6))
frequent_countries.plot(kind='bar', ax=ax)
ax.set_title(f'Country Distribution (Countries with > {threshold} Transactions)')
ax.set_xlabel('Country')
ax.set_ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of rows labelled class == 1 per country
fraud_by_country = data.loc[data['class'] == 1, 'country'].value_counts()

In [None]:
# Countries with more fraud rows than `threshold` (the value set in the country-distribution cell)
fraud_by_country_filtered = fraud_by_country.loc[fraud_by_country > threshold]

fig, ax = plt.subplots(figsize=(12, 6))
fraud_by_country_filtered.plot(kind='bar', color='red', ax=ax)
ax.set_title(f'Fraud Frequency by Country (Countries with > {threshold} Fraudulent Transactions)')
ax.set_xlabel('Country')
ax.set_ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Load the country polygons from the Natural Earth shapefile stored in the local data folder.
# The file must already be present at this path; the notebook does not download it.
world = gpd.read_file("./data/ne_110m_admin_0_countries.shp")

In [None]:
# Attach the per-country fraud counts to the world geometries.
fraud_map_data = fraud_by_country.reset_index()
fraud_map_data.columns = ['country', 'fraud_count']

# Drop the columns a previous run of this cell added, so re-running the cell does not
# produce suffixed duplicates (country_x / fraud_count_y) followed by a KeyError below.
world = world.drop(columns=['country', 'fraud_count'], errors='ignore')
world = world.merge(fraud_map_data, how='left', left_on='NAME', right_on='country')

# Countries with no matching name in the fraud data get a count of 0.
# NOTE(review): the join is on exact name equality; names spelled differently in the
# two sources silently end up as 0 — verify the large countries match.
world['fraud_count'] = world['fraud_count'].fillna(0)

In [None]:
# Compress the fraud counts with log(1 + x) so a few very large countries do not
# dominate the colour scale; log1p keeps a count of 0 at 0.
world['fraud_count_normalized'] = np.log1p(world['fraud_count'])  # Log transformation for large ranges

In [None]:
# Choropleth of the log-scaled fraud counts.
# The axes are created explicitly and passed to GeoDataFrame.plot: without `ax`, geopandas
# opens its own figure, which ignores the requested size and leaves an empty figure behind.
fig, ax = plt.subplots(figsize=(15, 10))
world.plot(
    column='fraud_count_normalized',
    cmap='Reds',
    legend=True,
    legend_kwds={'label': "Normalized Fraud Count by Country", 'orientation': "horizontal"},
    ax=ax,
)
ax.set_title('Geographic Distribution of Fraud (Log-Scaled)')
plt.show()

# Outlier Handling

In [None]:
# Work on a copy so the preprocessing steps below leave `data` untouched
pre_df = data.copy()

In [None]:
# Absolute z-score of every numeric feature, computed column by column
z_scores = np.abs(stats.zscore(pre_df[numerical_cols]))

# Keep a row only when all of its numeric features lie within 3 standard deviations
within_three_sigma = (z_scores < 3).all(axis=1)
pre_df = pre_df[within_three_sigma]

# Encoding

In [None]:
# One-hot encode the categorical features; the first level of each is dropped
pre_df = pd.get_dummies(
    pre_df,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Turn every boolean column (e.g. the dummy columns) into 1/0 integers
bool_columns = [name for name, dtype in pre_df.dtypes.items() if dtype == bool]
for name in bool_columns:
    pre_df[name] = pre_df[name].astype(int)

In [None]:
# Frequency encoding: replace each country by the number of rows it has in pre_df.
# NOTE(review): the counts are taken over all rows, i.e. before the train/test split — confirm this is acceptable.
country_frequency = pre_df['country'].value_counts()
country_count_per_row = pre_df['country'].map(country_frequency)
pre_df['country_encoded'] = country_count_per_row

# The text column is redundant once its encoded counterpart exists
pre_df = pre_df.drop('country', axis='columns')

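To avoid data leakage, the data is split into train and test sets *before* scaling: each scaler is fitted on the training set only and then applied unchanged to the test set. (Note that the outlier filter and the country frequency encoding above are still computed on the full dataset.)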

In [None]:
# Train/Test Split (Before Scaling)
X = pre_df.drop(columns=['class'])
y = pre_df['class']

In [None]:
# Hold out 20% of the rows as a test set; stratifying on y keeps the class ratio equal in both parts.
# train_test_split lives in sklearn.model_selection and has to be imported in the imports cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [None]:
# Standardise the frequency-encoded country: fit on the training rows only, reuse for the test rows
country_col = ['country_encoded']
scaler = StandardScaler()
X_train[country_col] = scaler.fit_transform(X_train[country_col])
X_test[country_col] = scaler.transform(X_test[country_col])

In [None]:
# Add the cyclical hour features to the numeric feature list.
# dict.fromkeys removes duplicates while keeping order, so re-running this cell does not
# append the same names again (duplicate labels would break the column selection below).
cyclical_cols = ['signup_hour_sin', 'signup_hour_cos', 'purchase_hour_sin', 'purchase_hour_cos']
numerical_cols = list(dict.fromkeys(numerical_cols + cyclical_cols))

# Standardise the numeric features: fit on the training rows only, reuse for the test rows.
# This rebinds `scaler`, replacing the scaler fitted on 'country_encoded'.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# EDA (Part 2)

In [None]:
# Pairwise correlations across every column of the encoded frame
correlation_matrix = pre_df.corr()

# Keep only correlations stronger than the threshold and different from exactly 1.0;
# every other cell becomes NaN and is left blank in the heatmap.
# Note: this rebinds `threshold`, which an earlier cell set to 100 for the country charts.
threshold = 0.3
is_strong = correlation_matrix.abs() > threshold
is_not_perfect = correlation_matrix != 1.0
high_correlation = correlation_matrix.where(is_strong & is_not_perfect)

# Annotated heatmap of the remaining correlations
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(high_correlation, annot=True, cmap='coolwarm', fmt=".2f", cbar=False, ax=ax)
ax.set_title(f'Correlations Above {threshold}')
plt.show()

In [None]:
# Re-attach the target column to the scaled features
train_scaled = pd.concat([X_train, y_train], axis='columns')
test_scaled = pd.concat([X_test, y_test], axis='columns')

# Write both sets to parquet without the row index
for frame, path in (
    (train_scaled, './data/train_scaled.parquet'),
    (test_scaled, './data/test_scaled.parquet'),
):
    frame.to_parquet(path, index=False)