In [None]:
import pandas as pd

# Load the dataset (pandas infers the bz2 compression from the file suffix)
df = pd.read_csv('../data/fraud.csv.bz2')

# Parse the time column as datetime.
# errors='coerce' turns values that do not match the format into NaT
# instead of raising, so malformed timestamps end up as missing values.
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Display basic information
print("Dataset Shape:")
print(df.shape)
print("=" * 60)

print("Dataset Info:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
df.info()
print("=" * 60)


In [None]:
# setup train / test split
from sklearn.model_selection import train_test_split

# Drop every row that has a missing value in any column
# (this includes rows whose timestamp was coerced to NaT while loading).
df_cleaned = df.dropna().copy()

# Features are all columns except the target; the target is the 'fraud' column.
X = df_cleaned.drop('fraud', axis=1)
y = df_cleaned['fraud']

# Stratify on the target so that the train and test parts keep the same
# class ratio as the cleaned dataset.
# hard code random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=566571358,
    stratify=y,
)

In [None]:
# Sanity check of the training split: features first, then the target.

train_feature_shape = X_train.shape
print(train_feature_shape)
X_train.info()  # prints column dtypes and non-null counts

train_target_counts = y_train.value_counts()  # rows per class
print(y_train.shape)
print(train_target_counts)

## Preprocessing

In [None]:
# encode categorical variables
from sklearn.preprocessing import OneHotEncoder

# Work on copies so the training split itself is left untouched.
X = X_train.copy()
y = y_train.copy()

# add hour of day as a feature
X['hour'] = X['time'].dt.hour

# One encoder per categorical column, fitted on training data only.
# handle_unknown='ignore' encodes categories that were not seen during
# fitting as an all-zero row instead of raising at transform time.
enc_product = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc_gender = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc_state = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc_hour = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform - the double brackets select a 2D frame, which is the
# input shape OneHotEncoder expects.
product_encoded = enc_product.fit_transform(X[['product_category']])
gender_encoded = enc_gender.fit_transform(X[['gender']])
state_encoded = enc_state.fit_transform(X[['address_state']])
hour_encoded = enc_hour.fit_transform(X[['hour']])

# Get feature names for the encoded columns
product_cols = enc_product.get_feature_names_out(['product_category'])
gender_cols = enc_gender.get_feature_names_out(['gender'])
state_cols = enc_state.get_feature_names_out(['address_state'])
hour_cols = enc_hour.get_feature_names_out(['hour'])

# Create DataFrames from encoded arrays, aligned on the original row index
product_df = pd.DataFrame(product_encoded, columns=product_cols, index=X.index)
gender_df = pd.DataFrame(gender_encoded, columns=gender_cols, index=X.index)
state_df = pd.DataFrame(state_encoded, columns=state_cols, index=X.index)
hour_df = pd.DataFrame(hour_encoded, columns=hour_cols, index=X.index)

# Assemble the model inputs: one-hot hour, product_category and gender plus
# the raw amount. The address_state encoding (state_df) is deliberately left
# out of the feature matrix; its encoder is still fitted so it stays available.
X = pd.concat([hour_df, product_df, gender_df, X[["amount"]]], axis=1)

# info() prints its report itself and returns None, so it is not wrapped in print().
X.info()


In [None]:
# balance X/y: keep all True, sample smaller number of False
# NOTE(review): the boolean masks below assume y has a boolean dtype - confirm.
pos_idx = y[y].index
neg_pool = y[~y]

# Aim for four negatives per positive, but never ask for more negatives than
# exist (sampling without replacement would raise otherwise).
n_neg = min(len(pos_idx) * 4, len(neg_pool))

# Fixed seed so the down-sampled set is the same on every run.
neg_idx = neg_pool.sample(n=n_neg, random_state=566571358).index

balanced_idx = pos_idx.union(neg_idx)
X = X.loc[balanced_idx]
y = y.loc[balanced_idx]

print(y.value_counts())

In [None]:
# Setup validation set
# 10% of the balanced rows are held out for validation. The split is seeded
# for reproducibility and stratified so both parts keep the same class ratio.
# Note: X and y are overwritten with the remaining 90%, so re-running this
# cell on its own shrinks the training data again.
X, X_valid, y, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=566571358,
    stratify=y,
)

print(X.shape)
print(X_valid.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic Regression
# Hyper-parameters are collected in one place so they are easy to tweak.
logreg_params = {
    "max_iter": 1000,
    "class_weight": "balanced",
    "verbose": 1,
}
model = LogisticRegression(**logreg_params)
model.fit(X, y)

In [None]:
# SVM
from sklearn.svm import SVC
# Degree-2 polynomial kernel with class re-weighting.
# NOTE(review): the evaluation cells call model.predict_proba(); SVC only
# offers that when probability estimates are enabled - confirm before
# evaluating this model with a custom threshold.
svm_params = {
    "kernel": 'poly',
    "degree": 2,
    "C": .01,
    "gamma": .01,
    "class_weight": 'balanced',
    "max_iter": 100000000,
    "verbose": True,
}
model = SVC(**svm_params)
model.fit(X, y)

In [None]:
# neural network
from sklearn.neural_network import MLPClassifier
# Two hidden layers of 10 units each.
# random_state pins weight initialisation and batch shuffling so repeated
# runs train the same network.
# NOTE(review): learning_rate="adaptive" is documented as only used with
# solver='sgd'; no solver is set here - confirm which one is intended.
model = MLPClassifier(
    hidden_layer_sizes=[10,10],
    learning_rate = "adaptive",
    tol=1e-6,
    max_iter=800,
    verbose=True,
    random_state=566571358,
)
model.fit(X, y)

In [None]:
# random forest
from sklearn.ensemble import RandomForestClassifier
# When the notebook is run top to bottom this is the last assignment to
# `model`, i.e. the estimator that the evaluation and save cells operate on.
# random_state pins the bootstrap samples and feature sub-sampling so the
# fitted forest is reproducible.
model = RandomForestClassifier(
    n_estimators=20,
    class_weight='balanced',
    verbose=1,
    random_state=566571358,
)
model.fit(X, y)

### Training Performance

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the current model on the rows it was fitted on.
# Flip the toggle to fall back to the estimator's own predict().
use_custom_threshold = True
if use_custom_threshold:
    threshold = 0.3  # Set your custom threshold here
    y_pred_proba = model.predict_proba(X)[:, 1]  # positive-class probability
    y_pred = y_pred_proba >= threshold
else:
    y_pred = model.predict(X)

# Confusion table: actual labels as rows, predicted labels as columns.
confusion = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(confusion)

# Calculate metrics
recall, precision, f1 = (
    recall_score(y, y_pred),
    precision_score(y, y_pred),
    f1_score(y, y_pred),
)

print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")

### Validation Performance

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the current model on the held-out validation rows.
# NOTE(review): the cut-off here is 0.1 while the training and test
# evaluation cells use 0.3 - confirm which value is intended.
use_custom_threshold = True
if use_custom_threshold:
    threshold = 0.1  # Set your custom threshold here
    y_pred_proba = model.predict_proba(X_valid)[:, 1]  # positive-class probability
    y_pred = y_pred_proba >= threshold
else:
    y_pred = model.predict(X_valid)

# Confusion table: actual labels as rows, predicted labels as columns.
confusion = pd.crosstab(y_valid, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(confusion)

# Calculate metrics
recall, precision, f1 = (
    recall_score(y_valid, y_pred),
    precision_score(y_valid, y_pred),
    f1_score(y_valid, y_pred),
)

print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")

## Store the model

In [None]:
import pickle
from pathlib import Path

# Save the model and encoders
model_path = Path('../model/model.pkl')

# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Everything needed to reproduce the feature encoding at prediction time is
# stored next to the estimator, together with a version tag and a timestamp.
with model_path.open('wb') as f:
    pickle.dump({
        'model': model,
        'enc_product': enc_product,
        'enc_hour': enc_hour,
        'enc_gender': enc_gender,
        'enc_state': enc_state,
        'product_cols': product_cols,
        'hour_cols': hour_cols,
        'gender_cols': gender_cols,
        'state_cols': state_cols,
        'model_version': '0.1',
        'train_date': pd.Timestamp.now().isoformat()
    }, f)

print("Model and encoders saved successfully!")

# Final Test on Withheld Data

In [None]:
# Apply encoding on Test Data
X_final_test = X_test.copy()
y_final_test = y_test.copy()

# add hour of day as a feature
X_final_test['hour'] = X_final_test['time'].dt.hour

# Use already trained encoders! Only transform() is called here, so nothing
# is re-fitted on the withheld data.
product_encoded = enc_product.transform(X_final_test[['product_category']])
gender_encoded = enc_gender.transform(X_final_test[['gender']])
state_encoded = enc_state.transform(X_final_test[['address_state']])
hour_encoded = enc_hour.transform(X_final_test[['hour']])

# Get feature names for the encoded columns
product_cols = enc_product.get_feature_names_out(['product_category'])
gender_cols = enc_gender.get_feature_names_out(['gender'])
state_cols = enc_state.get_feature_names_out(['address_state'])
hour_cols = enc_hour.get_feature_names_out(['hour'])

# Create DataFrames from encoded arrays, aligned on the test row index
product_df = pd.DataFrame(product_encoded, columns=product_cols, index=X_final_test.index)
gender_df = pd.DataFrame(gender_encoded, columns=gender_cols, index=X_final_test.index)
state_df = pd.DataFrame(state_encoded, columns=state_cols, index=X_final_test.index)
hour_df = pd.DataFrame(hour_encoded, columns=hour_cols, index=X_final_test.index)

# Assemble the model inputs in the same column order as the training
# features: one-hot hour, product_category and gender, then the raw amount.
# (The address_state encoding is not part of the feature matrix.)
X_final_test = pd.concat(
    [hour_df, product_df, gender_df, X_final_test[["amount"]]],
    axis=1,
)

# info() prints its report itself and returns None, so it is not wrapped in print().
X_final_test.info()



In [None]:
# Get Predictions on Test Data

from sklearn.metrics import f1_score, precision_score, recall_score

# Score the current model on the withheld test rows.
# Flip the toggle to fall back to the estimator's own predict().
use_custom_threshold = True
if use_custom_threshold:
    threshold = 0.3  # Set your custom threshold here
    y_pred_proba = model.predict_proba(X_final_test)[:, 1]  # positive-class probability
    y_pred = y_pred_proba >= threshold
else:
    y_pred = model.predict(X_final_test)

# Confusion table: actual labels as rows, predicted labels as columns.
confusion = pd.crosstab(y_final_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(confusion)

# Calculate metrics
recall, precision, f1 = (
    recall_score(y_final_test, y_pred),
    precision_score(y_final_test, y_pred),
    f1_score(y_final_test, y_pred),
)

print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")