In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import RandomOverSampler

In [2]:
#Load dataset
data = pd.read_csv('creditcard.csv')

In [14]:
#Separating features and target variable

#Checking for missing values in the target variable  #Error: NaN came in first attempt
print(f"Missing values in y: {y.isnull().sum()}")

#Dropping rows with NaN values in the target variable
data = data.dropna(subset=['Class'])

#Reassigning X and y after dropping NaNs
X = data.drop(columns=['Class'])
y = data['Class']

#Filling missing values in features if any
X.fillna(X.mean(), inplace=True)

#Normalizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#Splitting dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

Missing values in y: 0


In [15]:
#Handle class imbalance using SMOTE
'''smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)'''

#SMOTE didn't work. Needed at least 6 samples. Swtiching to ros
#Handling class imbalance using Random Oversampling
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

#Train Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train_resampled, y_train_resampled)

#Train Random Forest model
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_resampled, y_train_resampled)

In [16]:
#Making predictions
log_reg_preds = log_reg.predict(X_test)
rf_preds = rf_clf.predict(X_test)

In [17]:
#Evaluate models
def evaluate_model(model_name, y_true, y_pred):
    print(f'Performance Metrics for {model_name}:')
    print(f'Precision: {precision_score(y_true, y_pred):.4f}')
    print(f'Recall: {recall_score(y_true, y_pred):.4f}')
    print(f'F1 Score: {f1_score(y_true, y_pred):.4f}')
    print(classification_report(y_true, y_pred))
    print('-' * 50)

evaluate_model("Logistic Regression", y_test, log_reg_preds)
evaluate_model("Random Forest", y_test, rf_preds)

Performance Metrics for Logistic Regression:
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1194
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1195
   macro avg       0.50      0.50      0.50      1195
weighted avg       1.00      1.00      1.00      1195

--------------------------------------------------
Performance Metrics for Random Forest:
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1194
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1195
   macro avg       0.50      0.50      0.50      1195
weighted avg       1.00      1.00      1.00      1195

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
#Function to predict a new transaction
def predict_transaction(transaction, model):
    """Parameters:
    - transaction (list or np.array): New transaction data (same number of features as training data)
    - model: Trained model (log_reg or rf_clf)
    Returns:
    - 'Fraudulent' or 'Genuine'
    """
    try:
        transaction = np.array(transaction).reshape(1, -1)

        # Ensure correct feature count
        if transaction.shape[1] != X.shape[1]:
            raise ValueError(f"Feature mismatch: Expected {X.shape[1]} features, got {transaction.shape[1]}")

        transaction_scaled = scaler.transform(transaction)
        prediction = model.predict(transaction_scaled)
        return "Fraudulent" if prediction[0] == 1 else "Genuine"
    except Exception as e:
        return f"Error: {str(e)}"

#Example new transaction
new_transaction = np.array([-1.359807, -0.072781, 2.536347, 1.378155, -0.338321, 0.462388, 0.239599, 0.098698, 0.363787, 0.090794, -0.551600, -0.617801, -0.991390, -0.311169, 1.468177, -0.470401, 0.207971, 0.025790, 0.403993, 0.251412, -0.018307, 0.277838, -0.110474, 0.066928, 0.128539, -0.189115, 0.133558, -0.021053, 149.62, 0.0])  # Adjusted to match expected feature count

#Predicting using Random Forest model
prediction = predict_transaction(new_transaction, rf_clf)
print(f"Prediction for the new transaction: {prediction}")

#Predicting using the Logistic Regression model (optional)
prediction_log_reg = predict_transaction(new_transaction, log_reg)
print(f"Prediction (Logistic Regression): {prediction_log_reg}")


Prediction for the new transaction: Genuine
Prediction (Logistic Regression): Genuine


