In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import os
import sys

In [4]:
# Make the project root (the parent of this notebook's folder) importable so
# that the `src` package below resolves. os.path.join() with a single argument
# is a no-op, so it is dropped; the membership guard keeps re-running this cell
# from stacking duplicate entries onto sys.path.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)
# Import modules
from src import data_loading as dl

In [None]:
# Put the parent directory on sys.path so the `scripts` package can be imported.
# NOTE(review): abspath("../") normalizes to the same directory that the earlier
# `src` import cell appends, so this adds a second, duplicate sys.path entry.
sys.path.append(os.path.abspath("../"))

# Now import your scripts
from scripts import univariate as uni
from scripts import bivariate as bi
from scripts.FeatureEngineering import FeatureEngineering as fe  
from scripts.logger import logger

In [6]:
# Load the three raw datasets by bare file name.
# NOTE(review): presumably dl.load_data resolves these names against the
# project's data directory -- confirm in src/data_loading.
fraud_df = dl.load_data("Fraud_Data.csv")
ip_df = dl.load_data("ipAddress_to_Country.csv")
credit_df = dl.load_data("creditcard.csv")

In [None]:
# Preview the first five rows of the e-commerce fraud dataset.
print("head of the Fraud_Data.csv ")
fraud_df.head(n=5)

In [7]:
# Preview the first five rows of the IP-range-to-country lookup table.
print("head of the ipAddress_to_Country.csv ")
ip_df.head(n=5)

head of the ipAddress_to_Country.csv 


Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


In [None]:
# Preview the first five rows of the credit card transactions dataset.
print("head of the creditcard.csv ")
credit_df.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes for the fraud dataset.
fraud_df.info()

In [8]:
# Count rows of the fraud data that exactly repeat an earlier row
# (the first occurrence of each row is not counted as a duplicate).
duplicate_count = fraud_df.duplicated(keep="first").sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


In [9]:
# Count rows of the IP lookup table that exactly repeat an earlier row
# (the first occurrence of each row is not counted as a duplicate).
duplicate_count = ip_df.duplicated(keep="first").sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


In [None]:
# Count rows of the credit card data that exactly repeat an earlier row
# (the first occurrence of each row is not counted as a duplicate).
duplicate_count = credit_df.duplicated(keep="first").sum()
print(f"Number of duplicate rows: {duplicate_count}")

In [None]:
# De-duplicate the credit card data, keeping the first occurrence of each row,
# then re-count to confirm that no repeated rows remain.
credit_df = credit_df.drop_duplicates(keep="first")
duplicate_count = credit_df.duplicated(keep="first").sum()
print(f"Number of duplicate rows after drop duplicate : {duplicate_count}")

In [None]:
# Show the dtype of every fraud-data column as loaded.
print(fraud_df.dtypes)


In [None]:
# Correct data types
# - signup_time / purchase_time: parsed into pandas datetimes.
# - purchase_value: cast to float.
# - ip_address: cast to an explicit 64-bit integer. A bare `int` maps to the
#   platform's default integer, which is 32-bit on Windows with NumPy < 2 and
#   cannot represent integer-encoded IPv4 addresses (up to 2**32 - 1).
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])
fraud_df['purchase_value'] = fraud_df['purchase_value'].astype(float)
fraud_df['ip_address'] = fraud_df['ip_address'].astype('int64')

In [None]:
# Show the dtype of every IP-lookup column as loaded.
print(ip_df.dtypes)


In [None]:
# Correct data types
# lower_bound_ip_address is cast to an explicit 64-bit integer. A bare `int`
# maps to the platform's default integer, which is 32-bit on Windows with
# NumPy < 2 and cannot represent integer-encoded IPv4 addresses (up to 2**32 - 1).
ip_df['lower_bound_ip_address'] = ip_df['lower_bound_ip_address'].astype('int64')

# Verify corrected data types
print(ip_df.dtypes)

In [None]:
# Show the dtype of every credit-card column as loaded.
print(credit_df.dtypes)


In [None]:
# Univariate analysis on the fraud data, using the project's `scripts.univariate`
# helpers. The last argument of each call is the label "Fraud Data"; the boxplot
# call is restricted to the `purchase_value` column.
# NOTE(review): what each helper prints/plots is defined in scripts/univariate.py
# -- confirm there.
uni.summary_statistics(fraud_df, "Fraud Data")
uni.plot_histograms(fraud_df, "Fraud Data")
uni.plot_boxplots(fraud_df, ["purchase_value"], "Fraud Data")

In [None]:
# Univariate analysis on the credit card data, using the project's
# `scripts.univariate` helpers with the label "Credit Card Data".
# NOTE(review): what each helper prints/plots is defined in scripts/univariate.py
# -- confirm there.
uni.summary_statistics(credit_df, "Credit Card Data")
uni.plot_histograms(credit_df, "Credit Card Data")

In [None]:
# Bivariate analysis, using the project's `scripts.bivariate` helpers:
# a correlation heatmap over the credit card data, a boxplot call on the fraud
# data given the `class` and `purchase_value` columns, and a pairplot call over
# five credit card columns with "Class" passed as a separate argument.
# NOTE(review): argument roles (e.g. x/y/hue) are presumed from the helper names
# -- confirm in scripts/bivariate.py.
bi.correlation_heatmap(credit_df, "Credit Card Data")
bi.plot_boxplot(fraud_df, "class", "purchase_value", "Fraud Data")
bi.pairplot_features(credit_df, ["V1", "V2", "V3", "Amount", "Class"], "Class", "Credit Card Data")

In [10]:
# Merge fraud_df with ip_df based on the IP address range.
#
# An equality join on ip_address == lower_bound_ip_address is not a range join:
# only transactions whose IP equals the first address of a range would survive
# the bounds filter below. merge_asof(direction='backward') instead pairs each
# transaction with the range whose lower bound is the largest one <= its IP;
# the bounds filter then rejects IPs that fall outside that range.
# merge_asof requires both keys to be sorted and of the same dtype, hence the
# explicit int64 casts and sorts on local copies (fraud_df / ip_df are not mutated).
ip_ranges = (
    ip_df
    .astype({'lower_bound_ip_address': 'int64', 'upper_bound_ip_address': 'int64'})
    .sort_values('lower_bound_ip_address')
)
fraud_keyed = (
    fraud_df
    .assign(
        ip_address=fraud_df['ip_address'].astype('int64'),
        _row_order=np.arange(len(fraud_df)),  # remembers the original row order
    )
    .sort_values('ip_address')
)
merged_df = (
    pd.merge_asof(
        fraud_keyed,
        ip_ranges,
        left_on='ip_address',
        right_on='lower_bound_ip_address',
        direction='backward',
    )
    # merge_asof returns rows in key order; restore fraud_df's row order.
    .set_index('_row_order')
    .sort_index()
    .rename_axis(None)
)

# Filter to include rows where the ip_address is within the range of lower and upper bound
# (rows with no matching range have NaN bounds, compare False, and are dropped too).
in_range = (
    (merged_df['ip_address'] >= merged_df['lower_bound_ip_address'])
    & (merged_df['ip_address'] <= merged_df['upper_bound_ip_address'])
)
merged_df = merged_df[in_range]

# Drop the unnecessary columns
merged_df = merged_df.drop(columns=['lower_bound_ip_address', 'upper_bound_ip_address'])

# Display the first few rows of the merged DataFrame
print(merged_df.head())

# Create the output folder if it doesn't exist. The check must target the same
# directory the file is written to ('../data'), not 'data' next to the notebook.
output_path = '../data/merged_data.csv'
output_dir = os.path.dirname(output_path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    print("Created 'data' folder.")

# Save the merged DataFrame to the data folder
merged_df.to_csv(output_path, index=False)

# Confirm the file is saved
print(f"Merged data has been saved as '{output_path}'.")

In [None]:
# Initialize FeatureEngineering with fraud_df and the project logger
# NOTE(review): this is fed fraud_df, not the IP/country-enriched merged_df
# built earlier -- confirm whether the pipeline performs its own country join
# or whether merged_df was intended here.
feature_engineering = fe(fraud_df, logger)

# Run the feature engineering pipeline
feature_engineering.pipeline()

# Retrieve the processed data
processed_data = feature_engineering.get_processed_data()

# Display the first few rows of the processed DataFrame
processed_data.head()

In [None]:
# Save the processed dataset to CSV for modeling.
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (a fresh checkout may not have ../data/processed yet).
# NOTE(review): the DataFrame index is written as the first column (pandas
# default) -- confirm that downstream loaders expect it.
os.makedirs('../data/processed', exist_ok=True)
processed_data.to_csv('../data/processed/processed_fraud_data.csv')