In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [4]:
try:
    df = pd.read_csv('C:/Users/Akash/OneDrive - Erin.N.Nagarvala Day School/Desktop/jupyter notebook/creditcard.csv') 
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'creditcard.csv' not found. Please ensure your credit card fraud dataset file is in the correct directory.")

Dataset loaded successfully!


In [6]:
np.random.seed(42)
num_samples = 10000
df = pd.DataFrame({'Time': np.arange(num_samples),
        'V1': np.random.randn(num_samples),
        'V2': np.random.randn(num_samples),
        'V3': np.random.randn(num_samples),
        'Amount': np.random.rand(num_samples) * 1000,
        'Class': np.zeros(num_samples, dtype=int) 
})

In [10]:
num_fraud = int(num_samples * 0.01)
fraud_indices = np.random.choice(num_samples, num_fraud, replace=False)
df.loc[fraud_indices, 'Class'] = 1
df.loc[fraud_indices, 'V1'] += np.random.randn(num_fraud) * 2
df.loc[fraud_indices, 'Amount'] += np.random.rand(num_fraud) * 500

In [12]:
print("\n--- Original Dataset Head ---")
print(df.head())
print("\n--- Dataset Info ---")
df.info()
print("\n--- Class Distribution (Before handling imbalance) ---")
print(df['Class'].value_counts())
print(df['Class'].value_counts(normalize=True) * 100)
print("\n--- Missing Values Before Preprocessing ---")
print(df.isnull().sum().sum()) 


--- Original Dataset Head ---
   Time        V1        V2        V3      Amount  Class
0     0  0.496714 -0.678495  0.348286  831.868615      0
1     1 -0.138264 -0.305499  0.283324  881.643755      0
2     2  0.647689 -0.597381 -0.936520  433.250424      0
3     3  1.523030  0.110418  0.579584  374.688021      0
4     4 -0.234153  1.197179 -1.490083  612.698947      0

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    10000 non-null  int64  
 1   V1      10000 non-null  float64
 2   V2      10000 non-null  float64
 3   V3      10000 non-null  float64
 4   Amount  10000 non-null  float64
 5   Class   10000 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 468.9 KB

--- Class Distribution (Before handling imbalance) ---
Class
0    9800
1     200
Name: count, dtype: int64
Class
0    98.0
1     2.0
Name: proportion

In [13]:
if 'Time' in df.columns:
    df = df.drop('Time', axis=1)

X = df.drop('Class', axis=1)
y = df['Class']

In [14]:
scaler = StandardScaler()
X['Amount'] = scaler.fit_transform(X[['Amount']])

In [15]:
print("\n--- Preprocessed Dataset Head (Amount Scaled) ---")
print(X.head())



--- Preprocessed Dataset Head (Amount Scaled) ---
         V1        V2        V3    Amount
0  0.496714 -0.678495  0.348286  1.117517
1 -0.138264 -0.305499  0.283324  1.287253
2  0.647689 -0.597381 -0.936520 -0.241790
3  1.523030  0.110418  0.579584 -0.441490
4 -0.234153  1.197179 -1.490083  0.370138


In [16]:
X_final = X 
y_final = y 

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.3, random_state=42, stratify=y_final)

print(f"\nTraining features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")
print(f"Training set class distribution: {y_train.value_counts(normalize=True)}")
print(f"Testing set class distribution: {y_test.value_counts(normalize=True)}")




Training features shape: (7000, 4)
Testing features shape: (3000, 4)
Training target shape: (7000,)
Testing target shape: (3000,)
Training set class distribution: Class
0    0.98
1    0.02
Name: proportion, dtype: float64
Testing set class distribution: Class
0    0.98
1    0.02
Name: proportion, dtype: float64


In [22]:
model = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

print("\n--- Model Training Complete (Logistic Regression) ---")
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] 

print("\n--- Model Evaluation ---")

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


--- Model Training Complete (Logistic Regression) ---

--- Model Evaluation ---
Accuracy: 0.6197


In [23]:
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)
print(f"  True Negatives (TN): {cm[0,0]}")
print(f"  False Positives (FP): {cm[0,1]} (Type I error - predicting fraud when it's not)")
print(f"  False Negatives (FN): {cm[1,0]} (Type II error - failing to detect fraud)")
print(f"  True Positives (TP): {cm[1,1]}")



Confusion Matrix:
[[1827 1113]
 [  28   32]]
  True Negatives (TN): 1827
  False Positives (FP): 1113 (Type I error - predicting fraud when it's not)
  False Negatives (FN): 28 (Type II error - failing to detect fraud)
  True Positives (TP): 32


In [24]:
precision = precision_score(y_test, y_pred)
print(f"\nPrecision (for Fraudulent Class): {precision:.4f}")

recall = recall_score(y_test, y_pred)
print(f"Recall (for Fraudulent Class): {recall:.4f}")

f1 = f1_score(y_test, y_pred)
print(f"F1-Score (for Fraudulent Class): {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Precision (for Fraudulent Class): 0.0279
Recall (for Fraudulent Class): 0.5333
F1-Score (for Fraudulent Class): 0.0531

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.62      0.76      2940
           1       0.03      0.53      0.05        60

    accuracy                           0.62      3000
   macro avg       0.51      0.58      0.41      3000
weighted avg       0.97      0.62      0.75      3000

