# MLOps Project: Model Training

This notebook covers the data preparation, preprocessing, PCA dimensionality reduction, and model training phases of the project, and saves the fitted artifacts and reference data to disk.

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load the raw transactions
print("Loading data...")
# Read only the first 100,000 rows (not a random sample) so that a very
# large CSV cannot exhaust memory.
df = pd.read_csv(
    '../synthetic_fraud_data.csv',
    nrows=100_000,
)

In [None]:
# Preprocessing setup: prune columns, split off the label, and group the
# remaining features by dtype.
print("Preprocessing...")

# Columns excluded from the feature set. errors='ignore' below lets the drop
# succeed even when some of these are absent from the loaded CSV.
drop_cols = [
    'transaction_id',
    'customer_id',
    'card_number',
    'timestamp',
    'merchant',
    'city',
    'device_fingerprint',
    'ip_address',
    'velocity_last_hour',
]
df = df.drop(columns=drop_cols, errors='ignore')

# Features are everything except the label column; the label is 'is_fraud'.
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

# Group feature columns by dtype: numeric columns on one side, object/bool
# columns on the other. Columns of any other dtype land in neither list.
numerical_cols = list(X.select_dtypes(include=['number']).columns)
categorical_cols = list(X.select_dtypes(include=['object', 'bool']).columns)

print(f"Numerical cols: {numerical_cols}")
print(f"Categorical cols: {categorical_cols}")

In [None]:
# Define the preprocessor
# Numeric features are standardised; categorical features are one-hot encoded
# into a dense array, with categories not seen during fit ignored at
# transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

column_steps = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit the preprocessor on the full feature frame and transform it in one call.
X_processed = preprocessor.fit_transform(X)

In [None]:
# PCA (Embedding)
# Project the preprocessed feature matrix onto its first `n_components`
# principal components; the fitted `pca` object is reused when saving artifacts.
n_components = 10
# Seed the decomposition (same seed as the classifier, 42). With the default
# svd_solver='auto', scikit-learn may choose a randomized SVD for large inputs;
# without a fixed random_state the components, and therefore the saved
# reference data and trained model, could differ from run to run.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_processed)

In [None]:
# Save reference data
# One column per principal component (PCA_1 .. PCA_<n_components>) plus the
# label stored under 'target'. y.values is used so the labels are attached
# positionally rather than aligned on the index.
pca_cols = [f'PCA_{i+1}' for i in range(n_components)]
ref_df = pd.DataFrame(data=X_pca, columns=pca_cols).assign(target=y.values)

# Create the output directory if needed, then write the CSV without the index.
os.makedirs('../data', exist_ok=True)
ref_df.to_csv('../data/ref_data.csv', index=False)
print("Saved data/ref_data.csv")

In [None]:
# Train the classifier
print("Training model...")
# Random forest of 50 trees with depth capped at 10, seeded so repeated fits
# on the same input produce the same model. It is fit on the PCA features
# using every loaded row (no hold-out split is made here).
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    random_state=42,
).fit(X_pca, y)
print("Model trained.")

In [None]:
# Save artifacts
os.makedirs('../artifacts', exist_ok=True)

# Each fitted object is pickled to its own file, in the order it is applied
# at inference time: preprocessor, then PCA, then the classifier.
artifacts = [
    ('../artifacts/preprocessor.pickle', preprocessor),
    ('../artifacts/pca.pickle', pca),
    ('../artifacts/model.pickle', clf),
]
for artifact_path, fitted_obj in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(fitted_obj, f)

print("Artifacts saved in artifacts/")