In [1]:
import pandas as pd
from pathlib import Path
import joblib
import sys

# Locate the project root: the nearest directory, starting at the notebook's
# working directory and moving upward, that contains a 'src/' folder.
start_dir = Path().resolve()
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / 'src').exists()
    ),
    None,
)

if project_root is None:
    # No 'src/' anywhere above us. Stay in the starting directory instead of
    # drifting up to the filesystem root, where the later
    # mkdir(parents=True) on data/processed could create stray folders.
    project_root = start_dir
    print(
        f"Warning: no 'src' directory found at or above {start_dir}; "
        "using the working directory as project root.",
        file=sys.stderr,
    )

# Add to Python path. Guarded so that re-running this cell does not stack
# duplicate entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print("Project root set to:", project_root)

from src.data_loader import load_raw_data


Project root set to: C:\Users\jonat\fraud-detection-project


In [2]:

# === Paths ===
# raw_dir is handed to the data loader; processed_dir receives the saved maps.
raw_dir = project_root.joinpath("data", "raw")
processed_dir = project_root.joinpath("data", "processed")

# Create the output folder (and any missing parents); a no-op if it exists.
processed_dir.mkdir(exist_ok=True, parents=True)


In [3]:

# === Load raw data ===
# load_raw_data returns two objects; only the first (the transaction table)
# is used here. The second is presumably the identity table, judging by the
# loader's printed output -- confirm against src/data_loader.
# It is bound to `_identity` rather than `_`: assigning to `_` in IPython
# shadows the "last output" variable and stops it from updating for the rest
# of the session.
transaction, _identity = load_raw_data(raw_dir)


Loaded transaction shape: (590540, 394)
Loaded identity shape: (144233, 41)
Transaction columns: ['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V6

In [4]:

# === Columns to generate maps for ===
# Each column named here gets its own value -> relative-frequency map.
target_columns = [
    "ProductCD",
    "P_emaildomain",
]


In [5]:

# === Generate frequency maps ===
# For every requested column, map each distinct value to its share of rows.
# dropna=False keeps missing values as their own category, so the shares of
# a column sum to 1 over all rows.
frequency_maps = {}
for col in target_columns:
    if col not in transaction.columns:
        # Report and skip; the remaining columns are still processed.
        print(f"Column {col} not found in transaction data.")
        continue

    value_shares = transaction[col].value_counts(normalize=True, dropna=False)
    frequency_maps[col] = value_shares.to_dict()


In [6]:

# === Save maps ===
# Build the destination once so the dump and the log line cannot disagree.
maps_path = processed_dir / "frequency_maps.pkl"
joblib.dump(frequency_maps, maps_path)
print("Saved frequency maps to:", maps_path)


Saved frequency maps to: C:\Users\jonat\fraud-detection-project\data\processed\frequency_maps.pkl


In [7]:
import pandas as pd
from pathlib import Path

# Path to the raw transaction CSV, built from the raw_dir defined in the
# paths cell so the notebook is not tied to one user's absolute path.
file_path = raw_dir / "train_transaction.csv"

# The two email-domain columns inspected in this cell.
email_columns = ["P_emaildomain", "R_emaildomain"]

# Load only the email columns for speed
df = pd.read_csv(file_path, usecols=email_columns)

# Normalize to lowercase so domains differing only in case are counted
# together (missing values stay missing).
for column in email_columns:
    df[column] = df[column].str.lower()

# Display unique domain counts, with missing values shown as their own row
for column in email_columns:
    print(f"\n=== {column} Value Counts ===")
    print(df[column].value_counts(dropna=False))



=== P_emaildomain Value Counts ===
P_emaildomain
gmail.com           228355
yahoo.com           100934
NaN                  94456
hotmail.com          45250
anonymous.com        36998
aol.com              28289
comcast.net           7888
icloud.com            6267
outlook.com           5096
msn.com               4092
att.net               4033
live.com              3041
sbcglobal.net         2970
verizon.net           2705
ymail.com             2396
bellsouth.net         1909
yahoo.com.mx          1543
me.com                1522
cox.net               1393
optonline.net         1011
charter.net            816
live.com.mx            749
rocketmail.com         664
mail.com               559
earthlink.net          514
gmail                  496
outlook.es             438
mac.com                436
juno.com               322
aim.com                315
hotmail.es             305
roadrunner.com         305
windstream.net         305
hotmail.fr             295
frontier.com           280
embar

In [1]:
import pandas as pd
import joblib
from pathlib import Path


In [2]:

# === Paths ===
# Find the project root by walking up from the working directory to the
# nearest folder that contains data/raw, instead of hard-coding one user's
# absolute path. If nothing matches, stay in the working directory; the CSV
# load will then fail with an error naming the missing path.
start_dir = Path().resolve()
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "data" / "raw").is_dir()
    ),
    start_dir,
)

# Input: raw transaction table. Output: pickled frequency maps.
csv_path = project_root / "data" / "raw" / "train_transaction.csv"
output_path = project_root / "data" / "processed" / "frequency_maps.pkl"


In [3]:

# === Load CSV ===
# The "utf-8-sig" codec decodes UTF-8 and drops a leading byte-order mark
# if the file starts with one.
df = pd.read_csv(csv_path, encoding="utf-8-sig")

row_count, column_count = df.shape
print("Data loaded:", (row_count, column_count))


Data loaded: (590540, 394)


In [4]:

# === Filter Legitimate Transactions Only ===
# Keep only the rows whose isFraud flag equals 0. The .copy() makes df_legit
# an independent frame rather than a selection tied to df.
is_legit = df["isFraud"].eq(0)
df_legit = df.loc[is_legit].copy()
print("Legitimate transactions only:", df_legit.shape)


Legitimate transactions only: (569877, 394)


In [5]:

# === Compute Frequency Maps ===
# value_counts(normalize=True) drops missing values by default, so each map
# holds shares over the non-missing rows of df_legit only.
product_shares = df_legit["ProductCD"].value_counts(normalize=True)
product_freq = product_shares.to_dict()

# Email domains are lower-cased first so case variants share one key.
email_lower = df_legit["P_emaildomain"].str.lower()
email_freq = email_lower.value_counts(normalize=True).to_dict()


In [8]:
# Show both maps for a quick visual sanity check.
for caption, mapping in (
    ("Product frequency map:", product_freq),
    ("Email frequency map:", email_freq),
):
    print(caption, mapping)

Product frequency map: {'W': 0.7557788786001189, 'C': 0.10618256220201903, 'R': 0.06365057722982327, 'H': 0.05518734744515044, 'S': 0.019200634522888272}
Email frequency map: {'gmail.com': 0.45672726056071483, 'yahoo.com': 0.20626250755419678, 'hotmail.com': 0.0896131623906602, 'anonymous.com': 0.07557124365604305, 'aol.com': 0.057865670174880965, 'comcast.net': 0.01598039359195, 'icloud.com': 0.012693141730324061, 'outlook.com': 0.009648460616757039, 'att.net': 0.008370781935170877, 'msn.com': 0.008368690808032438, 'sbcglobal.net': 0.006185554075502237, 'live.com': 0.006183462948363798, 'verizon.net': 0.005610494112431542, 'ymail.com': 0.004905784266777635, 'bellsouth.net': 0.0038811319689425797, 'yahoo.com.mx': 0.003193151140396185, 'me.com': 0.0031262350719661403, 'cox.net': 0.0028522974168306457, 'optonline.net': 0.0020785803756082567, 'charter.net': 0.001654081566505162, 'live.com.mx': 0.0014805180140147341, 'rocketmail.com': 0.001384326165646545, 'earthlink.net': 0.00105183695063

In [6]:

# === Save Frequency Maps ===
# Bundle the per-column maps under their source column names.
freq_maps = {}
freq_maps["ProductCD"] = product_freq
freq_maps["P_emaildomain"] = email_freq


In [7]:

# Make sure the destination folder exists: nothing else in this notebook
# creates it, and joblib.dump cannot write into a missing directory.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the other notebook section in this file also writes
# data/processed/frequency_maps.pkl, built from all transactions with missing
# values kept. Whichever runs last overwrites the other -- confirm which
# version downstream code expects.
joblib.dump(freq_maps, output_path)
print(f"Frequency maps saved to: {output_path}")


Frequency maps saved to: C:\Users\jonat\fraud-detection-project\data\processed\frequency_maps.pkl
