In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define a function to process data chunk by chunk
def process_data(file_path, chunk_size=10000):
    chunks = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Drop rows with missing values
        chunk.dropna(inplace=True)

        # Encode categorical variables
        for col in chunk.select_dtypes(include=['object']).columns:
            chunk[col] = LabelEncoder().fit_transform(chunk[col])

        chunks.append(chunk)

    return pd.concat(chunks, ignore_index=True)

In [2]:
train_data = process_data('fraudTrain.csv')
test_data = process_data('fraudTest.csv')

In [3]:
train_data, test_data = train_data.align(test_data, join='inner', axis=1)
     

In [4]:
y_train = train_data['is_fraud']
X_train = train_data.drop('is_fraud', axis=1)

y_test = test_data['is_fraud']
X_test = test_data.drop('is_fraud', axis=1)

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()

In [9]:
log_reg.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

In [10]:
log_reg_pred = log_reg.predict(X_test)
decision_tree_pred = decision_tree.predict(X_test)
random_forest_pred = random_forest.predict(X_test)
     

In [11]:
print("Logistic Regression:")
print(classification_report(y_test, log_reg_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, log_reg_pred))
print("Accuracy:", accuracy_score(y_test, log_reg_pred))
     

Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

Confusion Matrix:
 [[553222    352]
 [  2145      0]]
Accuracy: 0.9955067219224104


In [12]:
print("\nDecision Tree:")
print(classification_report(y_test, decision_tree_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, decision_tree_pred))
print("Accuracy:", accuracy_score(y_test, decision_tree_pred))
     


Decision Tree:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.30      0.60      0.40      2145

    accuracy                           0.99    555719
   macro avg       0.65      0.80      0.70    555719
weighted avg       1.00      0.99      0.99    555719

Confusion Matrix:
 [[550645   2929]
 [   865   1280]]
Accuracy: 0.9931728085597217


In [13]:
print("\nRandom Forest:")
print(classification_report(y_test, random_forest_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, random_forest_pred))
print("Accuracy:", accuracy_score(y_test, random_forest_pred))
     


Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.82      0.45      0.58      2145

    accuracy                           1.00    555719
   macro avg       0.91      0.72      0.79    555719
weighted avg       1.00      1.00      1.00    555719

Confusion Matrix:
 [[553357    217]
 [  1186    959]]
Accuracy: 0.9974753427541617


In [21]:
def predict_fraud():
    print("Enter the row number of the transaction you want to check (0 to {}):".format(len(X_test) - 1))
    row_number = int(input())  # User inputs the row number

    if row_number < 0 or row_number >= len(X_test):
        print("Invalid row number. Please try again.")
        return

    # Extract the transaction data as a NumPy array
    transaction = X_test[row_number].reshape(1, -1)  # Reshape for prediction

    # Predictions from each model
    log_reg_pred = log_reg.predict(transaction)
    decision_tree_pred = decision_tree.predict(transaction)
    random_forest_pred = random_forest.predict(transaction)

    # Print results
    print("\nTransaction Details (Feature Values):")
    print(X_test[row_number])  # Accessing the row directly as a NumPy array

    print("\nModel Predictions:")
    print("Logistic Regression Prediction: ", "Fraud" if log_reg_pred[0] == 1 else "Not Fraud")
    print("Decision Tree Prediction: ", "Fraud" if decision_tree_pred[0] == 1 else "Not Fraud")
    print("Random Forest Prediction: ", "Fraud" if random_forest_pred[0] == 1 else "Not Fraud")

# Call the function to get input and predict
predict_fraud()

   

Enter the row number of the transaction you want to check (0 to 555718):


 1000



Transaction Details (Feature Values):
[-1.72937795 -1.38488964 -0.31859487  1.34997658 -0.05820637  0.04945836
  1.2206322  -0.13657713  1.09986093 -0.60063153 -1.58448906  0.72206604
  1.82247954  1.43647683 -2.05470001 -0.2898514  -0.94843719  0.15392157
  0.92512357  1.75946018  1.52921793 -2.05430194]

Model Predictions:
Logistic Regression Prediction:  Not Fraud
Decision Tree Prediction:  Not Fraud
Random Forest Prediction:  Not Fraud
