# Overview of the data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset (compression is inferred by pandas from the file extension)
df = pd.read_csv('../data/fraud.csv.bz2')

# Parse the time column as datetime. errors='coerce' turns values that do not
# match the format into NaT, so count how many were lost here instead of
# letting them blend silently into the missing-value report below.
missing_time_before = df['time'].isna().sum()
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
unparsed_time = df['time'].isna().sum() - missing_time_before
if unparsed_time:
    print(f"Warning: {unparsed_time} 'time' values could not be parsed and were set to NaT")

# Display basic information
print("Dataset Shape:")
print(df.shape)
print("=" * 60)

print("First few rows:")
print(df.head())
print("=" * 60)

print("Dataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print("=" * 60)

print("Statistical Summary:")
print(df.describe())
print("=" * 60)

print("Missing Values:")
print(df.isnull().sum())
print("=" * 60)

print("Duplicate Rows:")
print(df.duplicated().sum())
print("=" * 60)

# Display data types
print("Data Types:")
print(df.dtypes)

## Product Category

In [None]:
# Relative frequency of each product category, ordered by category label
counts = (
    df['product_category']
    .value_counts(normalize=True, sort=False)
    .sort_index()
)

# Bar chart drawn through the object-oriented matplotlib interface
fig, ax = plt.subplots(figsize=(12, 6))
counts.plot(kind='bar', edgecolor='black', ax=ax)
ax.set_xlabel('Product Category')
ax.set_ylabel('Frequency')
ax.set_title('Product Category Frequencies')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

In [None]:
# Relative frequency of each distinct credit score, ordered by score
counts = (
    df['credit_score']
    .value_counts(normalize=True, sort=False)
    .sort_index()
)

# Bar chart drawn through the object-oriented matplotlib interface
fig, ax = plt.subplots(figsize=(12, 6))
counts.plot(kind='bar', edgecolor='black', ax=ax)
ax.set_xlabel('Credit Score')
ax.set_ylabel('Frequency')
ax.set_title('Credit Score Frequencies')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

In [None]:
# Histogram of transaction amounts; only amounts <= 400 are plotted
fig, ax = plt.subplots(figsize=(12, 6))
df.loc[df['amount'] <= 400, 'amount'].hist(bins=50, edgecolor='black', ax=ax)
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Transaction Amount')
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

In [None]:
# Relative frequency of each fraud label, ordered by label
counts = (
    df['fraud']
    .value_counts(normalize=True, sort=False)
    .sort_index()
)

# Bar chart drawn through the object-oriented matplotlib interface
fig, ax = plt.subplots(figsize=(12, 6))
counts.plot(kind='bar', edgecolor='black', ax=ax)
ax.set_xlabel('Fraud')
ax.set_ylabel('Frequency')
ax.set_title('Fraud Frequencies')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

# Absolute number of rows per fraud label
print(df['fraud'].value_counts())

## Relationship between amount and fraud

In [None]:
from scipy import stats

# Get the average amount of fraudulent and non-fraudulent cases
avg_amount_by_fraud = df.groupby('fraud')['amount'].mean()
print(avg_amount_by_fraud)

# Test whether the difference in average amounts is statistically significant.
# Welch's t-test (equal_var=False) is used because it does not assume the two
# groups share a variance; the default Student's test does, which is
# unreliable when group sizes and variances differ.
# Missing amounts are dropped first so a single NaN cannot turn the result into NaN.
fraud_amounts = df.loc[df['fraud'] == True, 'amount'].dropna()
legit_amounts = df.loc[df['fraud'] == False, 'amount'].dropna()
t_stat, p_value = stats.ttest_ind(fraud_amounts, legit_amounts, equal_var=False)
print(f"T-statistic: {t_stat:.4f}")
# Scientific notation keeps very small p-values from being shown as 0.000000
print(f"P-value: {p_value:.3e}")

plt.figure(figsize=(12, 6))
avg_amount_by_fraud.plot(kind='bar', edgecolor='black')
plt.xlabel('Fraud')
plt.ylabel('Average Amount')
plt.title('Average Transaction Amount by Fraud Status')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

## Relationship between credit score and fraud

In [None]:
# Average credit score of fraudulent and non-fraudulent cases
avg_credscore_by_fraud = df.groupby('fraud')['credit_score'].mean()
print(avg_credscore_by_fraud)

# Test whether the difference in average credit score is statistically significant.
# Welch's t-test (equal_var=False) is used because it does not assume the two
# groups share a variance; the default Student's test does, which is
# unreliable when group sizes and variances differ.
# Missing scores are dropped first so a single NaN cannot turn the result into NaN.
fraud_scores = df.loc[df['fraud'] == True, 'credit_score'].dropna()
legit_scores = df.loc[df['fraud'] == False, 'credit_score'].dropna()
t_stat, p_value = stats.ttest_ind(fraud_scores, legit_scores, equal_var=False)
print(f"T-statistic: {t_stat:.4f}")
# Scientific notation keeps very small p-values from being shown as 0.000000
print(f"P-value: {p_value:.3e}")

## Relationship of Gender and Fraud 

In [None]:
# Mean of the fraud column within each gender group (i.e. fraud rate per gender)
avg_fraud_by_gender = df['fraud'].groupby(df['gender']).mean()
print(avg_fraud_by_gender)

## Relationship of Location and Fraud

In [None]:
# Mean of the fraud column within each address_state group (fraud rate per state)
avg_fraud_by_location = df.groupby('address_state')['fraud'].mean()
print(avg_fraud_by_location)

plt.figure(figsize=(12, 6))
avg_fraud_by_location.plot(kind='bar', edgecolor='black')
# Label the chart the same way as the other fraud-rate plots in this notebook
plt.xlabel('State')
plt.ylabel('Average Fraud Rate')
plt.title('Fraud Rate by State')
plt.tight_layout()
plt.show()

## Relationship of Product Category and Fraud

In [None]:
# Mean of the fraud column within each product category (fraud rate per category)
avg_fraud_by_product_category = df.groupby('product_category')['fraud'].mean()
print(avg_fraud_by_product_category)

plt.figure(figsize=(12, 6))
avg_fraud_by_product_category.plot(kind='bar', edgecolor='black')
# Label the chart the same way as the other fraud-rate plots in this notebook
plt.xlabel('Product Category')
plt.ylabel('Average Fraud Rate')
plt.title('Fraud Rate by Product Category')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Relationship between time of day and fraud:
# derive the hour from the parsed timestamp, then average fraud per hour.
df['hour'] = df['time'].dt.hour

avg_fraud_by_hour = df.groupby('hour')['fraud'].mean()
print(avg_fraud_by_hour)

# Bar chart drawn through the object-oriented matplotlib interface
fig, ax = plt.subplots(figsize=(12, 6))
avg_fraud_by_hour.plot(kind='bar', edgecolor='black', ax=ax)
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Fraud Rate')
ax.set_title('Fraud Rate by Hour of Day')
plt.setp(ax.get_xticklabels(), rotation=0)
