In [3]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [4]:
# Location of the CSV; fail fast with a clear message when it is not there.
file_path = '/content/creditcard.csv'
file_found = os.path.exists(file_path)
if file_found is False:
    raise FileNotFoundError(f"The file at {file_path} does not exist. Please check the file path.")

In [5]:
# Load the dataset from the path defined in the previous cell.
# Each handler prints a readable hint and then re-raises: swallowing the error
# would leave `dataset` undefined and make later cells fail with a confusing
# NameError instead of pointing at the real cause.
try:
    dataset = pd.read_csv(file_path, encoding='ISO-8859-1')  # Use appropriate encoding if needed
except FileNotFoundError:
    print("File not found. Please check the file path.")
    raise
except pd.errors.EmptyDataError:
    print("File is empty. Please check the file content.")
    raise
except pd.errors.ParserError:
    print("Error parsing file. Please check the file format.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

In [6]:
# (rows, columns) of the loaded frame; shown as the cell's output.
dataset.shape

(284807, 31)

In [7]:
# Sanity-check the loaded frame against the known size of this dataset:
# 284,807 rows (284,315 legitimate + 492 fraudulent) and 31 columns.
# The previous expectation of 284,808 rows was off by one, so the warning
# fired even when the file had loaded correctly.
expected_shape = (284807, 31)
if dataset.shape != expected_shape:
    print(f"Warning: Expected dataset shape is {expected_shape}, but got {dataset.shape}.")
else:
    print("Dataset loaded correctly.")



In [8]:
# Count the NaN entries in every column and report them.
missing_per_column = dataset.isnull().sum()
print("Missing values in each column:\n", missing_per_column)

Missing values in each column:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [9]:
# Preview the top of the frame (pandas' default of five rows).
preview = dataset.head()
print("First few rows of the dataset:\n", preview)

First few rows of the dataset:
    Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   



In [10]:
# Display class distribution.
# Uses the Series method: the top-level pd.value_counts() helper is deprecated
# in recent pandas releases and emits a FutureWarning.
print("Class distribution:\n", dataset['Class'].value_counts())

Class distribution:
 Class
0    284315
1       492
Name: count, dtype: int64


In [None]:
# Plot class distribution.
# The column is passed as `x=`: current seaborn treats the first positional
# argument as `data`, so a bare Series is not read as the variable to count.
sns.countplot(x=dataset['Class'])
plt.title('Class Distribution')
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(10, 10))
corrmat = dataset.corr()
sns.heatmap(data=corrmat, vmax=0.8, square=True)
plt.show()

In [None]:
# Rows labelled 0 in the Class column.
print("Number of legitimate transactions:", int((dataset['Class'] == 0).sum()))

In [None]:
# Rows labelled 1 in the Class column.
print("Number of fraudulent transactions:", int((dataset['Class'] == 1).sum()))

In [None]:
# Features are every column except the last; the target is the last column.
# Both are taken as NumPy arrays.
X, y = dataset.iloc[:, :-1].to_numpy(), dataset.iloc[:, -1].to_numpy()

In [None]:
# Drop every row whose target label is NaN, keeping X and y aligned.
nan_mask = np.isnan(y)
if nan_mask.any():
    print("NaN values found in target variable y. Removing rows with NaN values.")
    X, y = X[~nan_mask], y[~nan_mask]

In [None]:
# Hold out the test set BEFORE balancing the classes.
# RandomOverSampler balances by duplicating minority-class rows; resampling the
# full data first and splitting afterwards puts copies of the same fraud record
# in both the training and the test split, which leaks test data into training
# and inflates every metric reported below. `stratify=y` keeps the rare fraud
# class represented in both splits.
x_train_raw, x_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Balance only the training data; the test set keeps the original class ratio.
ros = RandomOverSampler(random_state=0)
x_train, y_train = ros.fit_resample(x_train_raw, y_train_raw)

# Previous names for the resampled data, kept so cells that refer to them still run.
x_res, y_res = x_train, y_train

# Display the shape of the original and resampled (training) feature sets
print("Original X shape:", X.shape)
print("Resampled X shape:", x_res.shape)

# Display the training class distribution before and after resampling
# (Counter comes from the standard-library `collections` module).
print("Class distribution before resampling:", Counter(y_train_raw))
print("Class distribution after resampling:", Counter(y_res))

In [None]:
# (rows, features) of the training matrix.
print("Training set shape:", np.shape(x_train))

In [None]:
# (rows, features) of the test matrix.
print("Testing set shape:", np.shape(x_test))

In [None]:
# Training the Random Forest Classifier.
# n_jobs=-1 builds the trees on all available cores; with a fixed random_state
# the fitted forest is the same as a single-threaded fit, only faster.
# NOTE(review): 641 trees is an unexplained magic number. Confirm it was tuned.
classifier = RandomForestClassifier(n_estimators=641, random_state=0, n_jobs=-1)
classifier.fit(x_train, y_train)

In [None]:
# Predict a class label for every row of the test matrix with the fitted forest.
y_pred = classifier.predict(x_test)

In [None]:
# Count the test rows whose predicted label differs from the true label.
mismatches = y_pred != y_test
n_errors = np.count_nonzero(mismatches)
print("Number of errors:", n_errors)

In [None]:
# Size of the test label vector, for comparison with the error count.
print("Test set shape:", np.shape(y_test))

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Report the four headline metrics for the test-set predictions, in a fixed order.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, metric(y_test, y_pred))

In [None]:
# Print the detailed per-class report for the test-set predictions.
# classification_report is already imported in the notebook's import cell, so
# the redundant mid-notebook import was removed to keep imports in one place.
print("Classification Report:\n", classification_report(y_test, y_pred))