# Day 11: Payment Fraud Risk Detection in Online Transactions

You are a data analyst in Stripe's risk management team investigating transaction patterns to identify potential fraud. The team needs to develop a systematic approach to screen transactions for financial risks. Your goal is to create an initial risk assessment methodology using transaction characteristics.

In [None]:
import pandas as pd
import numpy as np

# Risk-flag dimension fixture: one (risk_level, transaction_id) pair per flag.
# risk_flag_id is the 1-based position of the pair in this list.
_risk_flag_rows = [
    ("Low", 2),
    ("Medium", 7),
    ("High", 11),
    ("High", 12),
    ("High", 13),
    ("Medium", 14),
    ("High", 15),
    ("Low", 1),
    ("Medium", 6),
    ("Low", 3),
]
dim_risk_flags_data = [
    {"risk_level": level, "risk_flag_id": flag_id, "transaction_id": txn_id}
    for flag_id, (level, txn_id) in enumerate(_risk_flag_rows, start=1)
]
dim_risk_flags = pd.DataFrame(dim_risk_flags_data)

# Transaction fact fixture. Each tuple follows the column order below.
_transaction_columns = (
    "customer_email",
    "transaction_id",
    "transaction_date",
    "transaction_amount",
    "fraud_detection_score",
)
_transaction_rows = [
    ("alice@gmail.com", 1, "2024-10-05", 120, 10),
    ("bob@customdomain.com", 2, "2024-10-15", 250.5, 20),
    ("charlie@yahoo.com", 3, "2024-10-20", 75.25, 15),
    ("dana@hotmail.com", 4, "2024-10-25", 100, 30),
    ("eve@biz.org", 5, "2024-10-30", 300, 40),
    ("frank@gmail.com", 6, "2024-11-03", 150.75, 25),
    # Missing amount: must be Python's None. The JSON literal `null` is not a
    # Python name and raised NameError before the DataFrame could be built.
    ("grace@outlook.com", 7, "2024-11-10", None, 50),
    ("ivan@yahoo.com", 8, "2024-11-15", 200, 35),
    ("judy@hotmail.com", 9, "2024-11-21", 250, 45),
    ("ken@domain.net", 10, "2024-11-29", 300, 55),
    ("laura@riskmail.com", 11, "2024-12-02", 100, 80),
    ("mike@securepay.com", 12, "2024-12-03", 180, 85),
    ("nina@trusthub.com", 13, "2024-12-09", 220, 90),
    ("oscar@fintech.com", 14, "2024-12-16", 140, 70),
    ("paula@alertsys.com", 15, "2024-12-23", 260, 95),
]
# Keep the public fixture as a list of dicts (one dict per transaction).
fct_transactions_data = [dict(zip(_transaction_columns, row)) for row in _transaction_rows]
fct_transactions = pd.DataFrame(fct_transactions_data)


## Question 1

How many transactions in October 2024 have a customer email ending with a domain other than 'gmail.com', 'yahoo.com', or 'hotmail.com'? This metric will help us identify transactions associated with less common email providers that may indicate emerging risk patterns.

In [None]:
# Parse the date column so the `.dt` accessor can be used for filtering.
fct_transactions['transaction_date'] = pd.to_datetime(fct_transactions['transaction_date'])

# Row selector for October 2024, followed by the matching transactions.
txn_dates = fct_transactions['transaction_date']
october_2024_mask = (txn_dates.dt.month == 10) & (txn_dates.dt.year == 2024)
october_transactions = fct_transactions.loc[october_2024_mask]

# Mainstream providers that are excluded from the "uncommon domain" count.
common_domains = ['gmail.com', 'yahoo.com', 'hotmail.com']


def has_uncommon_domain(email):
    """Return True when `email` does not end with '@<domain>' for any common domain.

    Missing values (as judged by `pd.isna`) return False. Matching is
    case-insensitive because the address is lower-cased first.
    """
    if pd.isna(email):
        return False
    # str.endswith accepts a tuple, so a single call tests every suffix.
    common_suffixes = tuple(f'@{domain}' for domain in common_domains)
    return not email.lower().endswith(common_suffixes)

# Keep only the October rows whose e-mail is outside the common providers.
uncommon_domain_mask = october_transactions['customer_email'].apply(has_uncommon_domain)
uncommon_domain_transactions = october_transactions[uncommon_domain_mask]

# Headline answer for Question 1.
transaction_count = uncommon_domain_transactions.shape[0]
print(f"Transactions in October 2024 with uncommon email domains: {transaction_count}")

# Supporting figures for the same subset.
unique_customers = uncommon_domain_transactions['customer_email'].nunique()
avg_transaction_amount = uncommon_domain_transactions['transaction_amount'].mean()
avg_fraud_score = uncommon_domain_transactions['fraud_detection_score'].mean()

summary_lines = [
    "\nDetailed Analysis:",
    f"- Total transactions: {transaction_count}",
    f"- Unique customers: {unique_customers}",
    f"- Average transaction amount: ${avg_transaction_amount:.2f}",
    f"- Average fraud detection score: {avg_fraud_score:.2f}",
]
for summary_line in summary_lines:
    print(summary_line)


def extract_domain(email):
    """Return the lower-cased text after the last '@' of `email`.

    Missing values (as judged by `pd.isna`) yield None. An address without
    any '@' is returned whole, lower-cased.
    """
    if pd.isna(email):
        return None
    # rpartition leaves the full string in the last slot when '@' is absent.
    _, _, domain = email.rpartition('@')
    return domain.lower()


# Add the domain column on a new frame instead of assigning in place.
# `uncommon_domain_transactions` comes from boolean-indexing an already
# filtered frame, and in-place column assignment on such a slice triggers
# pandas' SettingWithCopyWarning. `assign` returns a fresh DataFrame.
uncommon_domain_transactions = uncommon_domain_transactions.assign(
    email_domain=uncommon_domain_transactions['customer_email'].apply(extract_domain)
)

# Per-domain roll-up: transaction volume, distinct customers, mean amount, mean score.
domain_analysis = uncommon_domain_transactions.groupby('email_domain').agg({
    'transaction_id': 'count',
    'customer_email': 'nunique',
    'transaction_amount': 'mean',
    'fraud_detection_score': 'mean'
}).rename(columns={
    'transaction_id': 'transaction_count',
    'customer_email': 'unique_customers',
    'transaction_amount': 'avg_transaction_amount',
    'fraud_detection_score': 'avg_fraud_score'
}).round(2)

top_domains = domain_analysis.sort_values('transaction_count', ascending=False).head(10)

print("\nTop 10 Email Domains (Non-Common):")
print(top_domains)

# Domains whose mean fraud score is above the mean across all listed domains.
high_risk_domains = domain_analysis[domain_analysis['avg_fraud_score'] > domain_analysis['avg_fraud_score'].mean()]
high_risk_domains_sorted = high_risk_domains.sort_values('avg_fraud_score', ascending=False)

print("\nHigh-Risk Domains (Above Average Fraud Score):")
print(high_risk_domains_sorted)

# The fraud_detection_score values in this notebook's data run from 10 to 95,
# i.e. a 0-100 scale. The previous cutoff of 0.7 was therefore satisfied by
# every domain; 70 is the equivalent cutoff on the scale actually used.
HIGH_FRAUD_SCORE_THRESHOLD = 70

# Plain substring test (case-insensitive) for corporate-looking domains.
# Joining the indicators into a regex left the dots unescaped, so '.org'
# matched any character followed by 'org' (e.g. 'cyborg.com').
corporate_indicators = ['.org', '.edu', '.gov', '.co.', 'corp', 'company']
is_corporate = np.array(
    [
        any(indicator in str(domain).lower() for indicator in corporate_indicators)
        for domain in domain_analysis.index
    ],
    dtype=bool,
)
corporate_domains = domain_analysis.loc[is_corporate]

high_score_domain_count = len(
    domain_analysis[domain_analysis['avg_fraud_score'] > HIGH_FRAUD_SCORE_THRESHOLD]
)
single_transaction_domain_count = len(domain_analysis[domain_analysis['transaction_count'] == 1])

print("\nRisk Insights:")
print(f"- Domains with avg fraud score > {HIGH_FRAUD_SCORE_THRESHOLD}: {high_score_domain_count}")
print(f"- Domains with only 1 transaction: {single_transaction_domain_count}")
print(f"- Corporate domains (containing company indicators): {len(corporate_domains)}")

print(f"- Transactions with uncommon domains: {transaction_count}")

## Question 2

For transactions occurring in November 2024, what is the average transaction amount, using 0 as a default for any missing values? This calculation will help us detect abnormal transaction amounts that could be related to fraudulent activity.

In [None]:
# Parse the date column so the `.dt` accessor can be used for filtering.
fct_transactions['transaction_date'] = pd.to_datetime(fct_transactions['transaction_date'])

# Row selector for November 2024.
txn_dates = fct_transactions['transaction_date']
november_2024_mask = (txn_dates.dt.month == 11) & (txn_dates.dt.year == 2024)

# Independent copy of the November rows with missing amounts replaced by 0,
# as the question requires; the source frame is left untouched.
november_transactions = fct_transactions.loc[november_2024_mask].assign(
    transaction_amount=lambda frame: frame['transaction_amount'].fillna(0)
)

avg_transaction_amount = november_transactions['transaction_amount'].mean()

print(f"Average transaction amount for November 2024: ${avg_transaction_amount:.2f}")

# Row counts: total, amounts that were missing in the source, and $0 amounts after filling.
missing_amount_count = fct_transactions.loc[november_2024_mask, 'transaction_amount'].isna().sum()
zero_amount_count = november_transactions['transaction_amount'].eq(0).sum()

print("\nDetailed Analysis for November 2024:")
print(f"- Total transactions: {len(november_transactions)}")
print(f"- Missing values filled with 0: {missing_amount_count}")
print(f"- Transactions with $0 amount: {zero_amount_count}")

# describe() supplies mean/std/min/quartiles/max in one pass; reused further down.
transaction_stats = november_transactions['transaction_amount'].describe()
print("\nTransaction Amount Statistics:")
stat_labels = [
    ("Mean", "mean"),
    ("Median", "50%"),
    ("Standard deviation", "std"),
    ("Minimum", "min"),
    ("Maximum", "max"),
]
for stat_label, stat_key in stat_labels:
    print(f"- {stat_label}: ${transaction_stats[stat_key]:.2f}")


print("\nFraud Detection Insights:")

# Outlier fences at 1.5 x IQR below the first and above the third quartile.
Q1, Q3 = transaction_stats['25%'], transaction_stats['75%']
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

november_amounts = november_transactions['transaction_amount']
outliers_low = november_transactions.loc[november_amounts.lt(lower_bound)]
outliers_high = november_transactions.loc[november_amounts.gt(upper_bound)]

print(f"- Outliers (unusually low amounts): {len(outliers_low)} transactions")
print(f"- Outliers (unusually high amounts): {len(outliers_high)} transactions")

# Amounts more than two standard deviations above the mean.
high_value_threshold = transaction_stats['mean'] + 2 * transaction_stats['std']
high_value_transactions = november_transactions.loc[november_amounts.gt(high_value_threshold)]
print(f"- High-value transactions (>2 std dev above mean): {len(high_value_transactions)}")

# $0 rows include the amounts that were missing and filled with 0 earlier.
zero_amount_transactions = november_transactions.loc[november_amounts.eq(0)]
if not zero_amount_transactions.empty:
    avg_fraud_score_zero = zero_amount_transactions['fraud_detection_score'].mean()
    print(f"- Average fraud score for $0 transactions: {avg_fraud_score_zero:.3f}")

correlation = november_amounts.corr(november_transactions['fraud_detection_score'])
print(f"- Correlation between transaction amount and fraud score: {correlation:.3f}")


print("\nRisk Segmentation:")

# Bucket each November amount into a labelled band.
# pd.cut uses right-closed intervals by default, so without include_lowest the
# first band is (0, 50] and an amount of exactly 0 gets no category. Missing
# amounts were filled with 0 above, so those rows silently vanished from the
# table even though the first label reads "$0-50". include_lowest=True closes
# the first band on the left so $0 rows are counted as 'Low'.
november_transactions['amount_category'] = pd.cut(
    november_transactions['transaction_amount'],
    bins=[0, 50, 200, 1000, float('inf')],
    labels=['Low ($0-50)', 'Medium ($50-200)', 'High ($200-1000)', 'Very High (>$1000)'],
    include_lowest=True,
)

# Transaction volume and mean fraud score per band; observed=True keeps only
# bands that actually occur in the data.
risk_by_amount = november_transactions.groupby('amount_category', observed=True).agg({
    'transaction_id': 'count',
    'fraud_detection_score': 'mean'
}).rename(columns={
    'transaction_id': 'transaction_count',
    'fraud_detection_score': 'avg_fraud_score'
}).round(3)

print(risk_by_amount)

print(f"\n🎯 ANSWER: ${avg_transaction_amount:.2f}")

## Question 3

Among transactions flagged as 'High' risk in December 2024, which day of the week recorded the highest number of such transactions? This analysis is intended to pinpoint specific days with concentrated high-risk activity and support the development of our preliminary fraud detection score.

In [None]:
# Parse the date column so the `.dt` accessor can be used for filtering.
fct_transactions['transaction_date'] = pd.to_datetime(fct_transactions['transaction_date'])

# Attach risk flags to transactions; the inner join drops transactions without a flag.
transactions_with_risk = fct_transactions.merge(
    dim_risk_flags,
    on='transaction_id',
    how='inner'
)

december_2024_mask = (
    (transactions_with_risk['transaction_date'].dt.year == 2024)
    & (transactions_with_risk['transaction_date'].dt.month == 12)
)
high_risk_mask = transactions_with_risk['risk_level'] == 'High'

december_high_risk = transactions_with_risk[december_2024_mask & high_risk_mask].copy()

december_high_risk['day_of_week'] = december_high_risk['transaction_date'].dt.day_name()
december_high_risk['day_of_week_num'] = december_high_risk['transaction_date'].dt.dayofweek

# Count distinct transactions per weekday. nunique (rather than a row count)
# keeps a transaction that carries several 'High' flags from being counted
# more than once after the join. day_of_week_num is carried along so it can
# serve as a tie-breaker below.
day_counts = (
    december_high_risk
    .groupby(['day_of_week_num', 'day_of_week'])['transaction_id']
    .nunique()
    .reset_index(name='transaction_count')
)

# Highest count first; equal counts are ordered by weekday number (Monday=0)
# so the reported day does not depend on an arbitrary sort order.
day_counts_sorted = day_counts.sort_values(
    ['transaction_count', 'day_of_week_num'],
    ascending=[False, True]
)

# Guard the empty case: iloc[0] on an empty frame raises IndexError.
if day_counts_sorted.empty:
    highest_day = None
    print("No 'High' risk transactions found in December 2024")
else:
    highest_day = day_counts_sorted.iloc[0]['day_of_week']
    print(highest_day)

Made with ❤️ by [Interview Master](https://www.interviewmaster.ai)