In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import psutil

In [2]:
# 📌 Load Data
train_data = pd.read_csv("fraudTrain.csv" , nrows=10000)  # Replace with actual file
test_data = pd.read_csv("fraudTest.csv", nrows=10000)    # Replace with actual file

In [3]:
# 📌 Drop Unnecessary Columns
cols_to_drop = [
    "Unnamed: 0", "first", "last", "street", "dob", "trans_num",
    "unix_time", "lat", "long", "merch_lat", "merch_long"
]
train_data.drop(columns=cols_to_drop, errors="ignore", inplace=True)
test_data.drop(columns=cols_to_drop, errors="ignore", inplace=True)

In [4]:
# 📌 Define Features (X) and Target (y)
target_col = "is_fraud"
categorical_cols = ["category", "gender", "merchant", "city", "state", "job"]

In [5]:
X = train_data.drop(columns=[target_col])
y = train_data[target_col]

X_test = test_data.drop(columns=[target_col], errors='ignore')

In [6]:
if 'trans_date_trans_time' in X.columns:
    X['trans_date_trans_time'] = pd.to_datetime(X['trans_date_trans_time'])
    X['year'] = X['trans_date_trans_time'].dt.year
    X['month'] = X['trans_date_trans_time'].dt.month
    X['day'] = X['trans_date_trans_time'].dt.day
    X['hour'] = X['trans_date_trans_time'].dt.hour
    X['minute'] = X['trans_date_trans_time'].dt.minute
    X['second'] = X['trans_date_trans_time'].dt.second
    X.drop(columns=['trans_date_trans_time'], inplace=True)

In [7]:
if 'trans_date_trans_time' in X_test.columns:
    X_test['trans_date_trans_time'] = pd.to_datetime(X_test['trans_date_trans_time'])
    X_test['year'] = X_test['trans_date_trans_time'].dt.year
    X_test['month'] = X_test['trans_date_trans_time'].dt.month
    X_test['day'] = X_test['trans_date_trans_time'].dt.day
    X_test['hour'] = X_test['trans_date_trans_time'].dt.hour
    X_test['minute'] = X_test['trans_date_trans_time'].dt.minute
    X_test['second'] = X_test['trans_date_trans_time'].dt.second
    X_test.drop(columns=['trans_date_trans_time'], inplace=True)

In [8]:
if all(col in X.columns for col in categorical_cols):
    # 📌 One-Hot Encode Categorical Variables
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

In [9]:
# Fit on Train Data and Transform Both Train & Test
X_cat = encoder.fit_transform(X[categorical_cols])
X_test_cat = encoder.transform(X_test[categorical_cols])

In [10]:
# Convert Encoded Data to DataFrame
X_cat_df = pd.DataFrame(X_cat, columns=encoder.get_feature_names_out(categorical_cols))
X_test_cat_df = pd.DataFrame(X_test_cat, columns=encoder.get_feature_names_out(categorical_cols))

In [11]:
# Merge Encoded Columns
X = pd.concat([X, X_cat_df], axis=1)
X_test = pd.concat([X_test, X_test_cat_df], axis=1)

In [12]:
# Drop Original Categorical Columns
X.drop(columns=categorical_cols, inplace=True)
X_test.drop(columns=categorical_cols, inplace=True)

In [13]:
# 📌 Split Train Data into Training and Validation Sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42,class_weight='balanced'),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(random_state=42,class_weight='balanced')
}


In [18]:
for name, model in models.items():
    print(f"\n=== Training {name} ===")
    model.fit(X_train, y_train)  # Train Model
    y_pred = model.predict(X_valid)  # Predict on Validation Set

    accuracy = accuracy_score(y_valid, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_valid, y_pred))


=== Training Random Forest ===
Accuracy: 0.9995
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1994
           1       1.00      0.83      0.91         6

    accuracy                           1.00      2000
   macro avg       1.00      0.92      0.95      2000
weighted avg       1.00      1.00      1.00      2000


=== Training Logistic Regression ===
Accuracy: 0.9970
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1994
           1       0.00      0.00      0.00         6

    accuracy                           1.00      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.99      1.00      1.00      2000


=== Training Decision Tree ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9950
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1994
           1       0.36      0.83      0.50         6

    accuracy                           0.99      2000
   macro avg       0.68      0.91      0.75      2000
weighted avg       1.00      0.99      1.00      2000

