In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# ---------------------------------------------------
# ----------------Load data--------------------------
# ---------------------------------------------------
# Location of the input CSV, resolved relative to the notebook's working
# directory. Change this one constant if the file lives somewhere else.
DATA_PATH = "fraudTest.csv"

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Same exception type as before, but with a hint on how to fix it, since
    # every later cell depends on `df` being loaded.
    raise FileNotFoundError(
        f"Could not find {DATA_PATH!r}. Place the CSV in the notebook's working "
        "directory or point DATA_PATH at its location."
    ) from exc

df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'fraudTest.csv'

In [None]:


# Percentage pie chart
# value_counts() orders classes by frequency, while the labels below are
# assigned positionally. Sorting by class value pins the wedge order so the
# labels cannot silently swap if the class balance ever changes.
# NOTE(review): assumes is_fraud is coded 0 = not fraud, 1 = fraud - confirm.
fraud_counts = df['is_fraud'].value_counts().sort_index()
fraud_counts.plot.pie(autopct='%1.1f%%', labels=['Not Fraud', 'Fraud'])
plt.title("Fraud Distribution")
plt.show()


In [None]:
# Parse the raw timestamp column once, store the parsed values back on the
# frame, and derive an hour-of-day column from them.
parsed_times = pd.to_datetime(df['trans_date_trans_time'])
df['trans_date_trans_time'] = parsed_times
df['hour'] = parsed_times.dt.hour

# Transaction counts per hour, split by the is_fraud flag.
hour_ax = sns.countplot(data=df, x='hour', hue='is_fraud')
hour_ax.set_title("Fraud by Hour of Day")
plt.show()


In [None]:
# Scatter of `long` against `lat` for every row, coloured by the is_fraud flag.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='long', y='lat', hue='is_fraud', ax=ax)
ax.set_title("Transaction Locations (Fraud vs Non-Fraud)")
plt.show()


In [None]:
# Preprocessing----------
# Drop unwanted columns--
# Columns removed before modelling; only those actually present are dropped,
# so re-running this cell does not fail on already-removed columns.
to_drop = ["trans_date_trans_time", "cc_num", "first", "last", "street", "city", "job", "dob", "trans_num"]
df = df.drop(columns=[col for col in to_drop if col in df.columns])
df_encoded = df.copy()

#--------Encode all categorical columns----------
# One LabelEncoder per object-dtype column. The fitted encoders are kept in
# `label_encoders` (keyed by column name) so the same mapping can be reused.
label_encoders = {}
for col in df_encoded.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = le

# Features & target
X = df_encoded.drop("is_fraud", axis=1)
y = df_encoded["is_fraud"]

# ------------------------
#  Train-test split
# ------------------------
# Split BEFORE scaling. Same random_state / stratify as before, so the rows
# assigned to each partition are unchanged; only the scaling statistics differ.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------
#  Scale numeric features
# ------------------------
# Fit the scaler on the training partition only, then apply that same fitted
# transform to the test partition. Fitting on the full dataset would leak the
# test set's mean/variance into the features the models are trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset under the train-fitted scaler. Not needed by the split above;
# retained so any other cell that references `X_scaled` keeps working.
X_scaled = scaler.transform(X)


In [None]:
# ------------------------
# Train models
# ------------------------

# -------------------------
# Logistic Regression
# -------------------------
# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
cm_lr = confusion_matrix(y_test, y_pred_lr)

# NOTE(review): if fraud is rare in this data, accuracy alone can look high
# even for a model that misses fraud - read it together with the matrix below.
print("\n\n")
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\n\n")

# Heatmap of the confusion matrix: rows are actual labels, columns predicted.
fig_lr, ax_lr = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_lr,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=[0, 1],
    yticklabels=[0, 1],
    ax=ax_lr,
)
ax_lr.set_title("Logistic Regression Confusion Matrix")
ax_lr.set_xlabel("Predicted")
ax_lr.set_ylabel("Actual")
plt.show()


In [None]:
# -------------------------
# Decision Tree
# -------------------------
# Depth-limited tree with a fixed seed; fit() returns the estimator itself.
dt = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
cm_dt = confusion_matrix(y_test, y_pred_dt)

print("\n\n")
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\n\n")

# Heatmap of the confusion matrix: rows are actual labels, columns predicted.
fig_dt, ax_dt = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_dt,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=[0, 1],
    yticklabels=[0, 1],
    ax=ax_dt,
)
ax_dt.set_title("Decision Tree Confusion Matrix")
ax_dt.set_xlabel("Predicted")
ax_dt.set_ylabel("Actual")
plt.show()



In [None]:
# Random Forest
# -------------------------
# 100-tree forest with a fixed seed; fit() returns the estimator itself.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
cm_rf = confusion_matrix(y_test, y_pred_rf)

print("\n\n")
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\n\n")

# Heatmap of the confusion matrix: rows are actual labels, columns predicted.
fig_rf, ax_rf = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=[0, 1],
    yticklabels=[0, 1],
    ax=ax_rf,
)
ax_rf.set_title("Random Forest Confusion Matrix")
ax_rf.set_xlabel("Predicted")
ax_rf.set_ylabel("Actual")
plt.show()


In [None]:
# ------------------------
# 5. Test on a new entry
# ------------------------
# A single hand-written transaction used to exercise the fitted pipeline.
new_data = {
    "amt": [500.75],
    "merchant": ["fraud_NewMerchant"],
    "category": ["electronics"],
    "gender": ["M"],
    "city_pop": [500000],
    "state": ["NewState"],
    "lat": [40.7128],
    "long": [-74.0060],
    "unix_time": [1371816892],
    "merch_lat": [40.7130],
    "merch_long": [-74.0070]
}

new_entry_df = pd.DataFrame(new_data)

# Align with the training feature layout: same columns, same order.
# Training columns that `new_data` does not supply are filled with a 0
# placeholder (this includes any derived column such as `hour`, if it is among
# the training columns). Columns not used in training are discarded.
missing_cols = [col for col in X.columns if col not in new_entry_df.columns]
new_entry_df = new_entry_df.reindex(columns=X.columns, fill_value=0)

# Encode categorical columns safely
# Driven by the fitted encoders rather than by dtype sniffing on the new frame,
# and WITHOUT mutating them: appending to `le.classes_` would break the
# encoder's sorted-classes invariant and leave hidden state behind for any
# later transform / inverse_transform call.
for col, le in label_encoders.items():
    # Skip encoders for non-feature columns and leave 0-placeholders untouched.
    if col not in new_entry_df.columns or col in missing_cols:
        continue
    code_of = {label: code for code, label in enumerate(le.classes_)}
    # Unseen labels share one code: the existing "Unknown" class if training
    # data had one, otherwise one past the last trained class. Note the models
    # never saw that extra code during training.
    unknown_code = code_of.get("Unknown", len(code_of))
    new_entry_df[col] = (
        new_entry_df[col]
        .astype(str)  # encoders were fitted on str-cast values
        .map(code_of)
        .fillna(unknown_code)
        .astype("int64")
    )

# Scale using the SAME fitted scaler
new_entry_scaled = scaler.transform(new_entry_df)
