<a href="https://colab.research.google.com/github/khangsheng1/Learning-PyTorch/blob/main/CC_Fraud_Kaggle/CC_Fraud_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch
import torch.nn as nn
import kaggle
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Report the directory the notebook kernel is currently running from
current_path = os.getcwd()
print(current_path)

c:\Users\khang\Desktop\PyTorch Tutorial\Kaggle Credit Card Fraud


In [None]:
# Fetch the mlg-ulb/creditcardfraud dataset archive into ./datasets/ via the Kaggle CLI.
# Per the saved output, an up-to-date local copy is skipped unless --force is passed.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d mlg-ulb/creditcardfraud -p ./datasets/

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
License(s): DbCL-1.0
creditcardfraud.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Locations of the downloaded archive and of the folder it is unpacked into
zip_file_path = './datasets/creditcardfraud.zip'
extracted_path = './datasets/extracted/'

# Unpack only when the archive is actually present; otherwise just report it
if not os.path.exists(zip_file_path):
    print("Zip file does not exist.")
else:
    # The context manager closes the archive once extraction has finished
    with zipfile.ZipFile(zip_file_path, mode='r') as zip_ref:
        # Write every member of the archive into the target directory
        zip_ref.extractall(path=extracted_path)
        print(f'Files extracted to: {extracted_path}')

Files extracted to: ./datasets/extracted/


In [None]:
# The extracted folder is expected to hold a CSV named creditcard.csv
csv_file_path = os.path.join(extracted_path, 'creditcard.csv')

# Read the CSV into a pandas DataFrame when present; otherwise report it as missing
if not os.path.exists(csv_file_path):
    print("CSV file not found.")
else:
    df = pd.read_csv(csv_file_path)
    print(df.head())  # Preview the first 5 rows of the DataFrame

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(284807, 31)

In [None]:
# Features: every column except 'Time' and the 'Class' label; target: the 'Class' column
X = df.drop(labels=['Time', 'Class'], axis='columns')
y = df['Class']

# Sanity-check the resulting dimensions of the feature matrix and the label vector
print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")

Features shape: (284807, 29)
Labels shape: (284807,)


In [None]:
# Check the distribution of the target variable y:
# value_counts() tallies how many rows carry each distinct label, which exposes
# how imbalanced the classes are (the saved output shows 284315 vs 492 rows).
class_distribution = y.value_counts()

print(class_distribution)

Class
0    284315
1       492
Name: count, dtype: int64


In [None]:
# Perform an 80/20 train-test split.
# stratify=y keeps the rare positive class at the same ratio in both splits; with
# only a few hundred positives among ~285k rows, a purely random split leaves the
# positive rate of the train and test sets to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Every row must land in exactly one of the two splits
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

# Convert the DataFrames to PyTorch tensors.
# Labels are stored as float because nn.BCEWithLogitsLoss expects float targets.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float)

# Check the shapes of the tensors
print(f"X_train_tensor shape: {X_train_tensor.shape}")
print(f"X_test_tensor shape: {X_test_tensor.shape}")
print(f"y_train_tensor shape: {y_train_tensor.shape}")
print(f"y_test_tensor shape: {y_test_tensor.shape}")

X_train_tensor shape: torch.Size([227845, 29])
X_test_tensor shape: torch.Size([56962, 29])
y_train_tensor shape: torch.Size([227845])
y_test_tensor shape: torch.Size([56962])


In [None]:
# Prefer the GPU when PyTorch can see a CUDA device; fall back to the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'You are using {device}.')

You are using cuda.


In [None]:
# Relocate all four tensors onto the selected device in one pass
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = (
    tensor.to(device)
    for tensor in (X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor)
)

# Confirm where each tensor now lives
print(f"X_train_tensor is on: {X_train_tensor.device}")
print(f"X_test_tensor is on: {X_test_tensor.device}")
print(f"y_train_tensor is on: {y_train_tensor.device}")
print(f"y_test_tensor is on: {y_test_tensor.device}")

X_train_tensor is on: cuda:0
X_test_tensor is on: cuda:0
y_train_tensor is on: cuda:0
y_test_tensor is on: cuda:0


In [None]:
# Build model

class CC_Fraud_Model(nn.Module):
    """Fully connected network: two ReLU-activated hidden layers plus a linear output layer.

    Args:
        input_features: Size of each input sample's feature vector.
        output_features: Number of values the final layer produces per sample.
        hidden_units: Width of both hidden layers (default 10).
    """

    def __init__(self, input_features, output_features, hidden_units=10):
        super().__init__()
        # Layers are listed in execution order and then wrapped in a Sequential
        layers = [
            nn.Linear(input_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_features),
        ]
        self.linear_layer_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the final layer's raw output."""
        stack_output = self.linear_layer_stack(x)
        return stack_output

# Number of input columns the first layer must accept
input_features = X_train_tensor.shape[1]
print(input_features)

# Distinct label values present in the test labels
unique_classes = torch.unique(y_test_tensor)
print(unique_classes.numel())

# A single-logit output only makes sense for a two-class problem
assert unique_classes.numel() <= 2, "single-logit BCE setup requires binary labels"

# Seed BEFORE constructing the model: weight initialisation draws from the RNG,
# so seeding later does not make the initial weights reproducible.
torch.manual_seed(42)

# Create instance of model.
# This notebook trains with nn.BCEWithLogitsLoss against 1-D 0/1 targets, which
# needs ONE logit per sample (the score for class 1), not one output per class.
# Two outputs give logits of shape (N, 2) that cannot match targets of shape (N,).
model = CC_Fraud_Model(input_features=input_features, output_features=1, hidden_units=20)
model.to(device)
print(model)
print(f"Model is on: {next(model.parameters()).device}")

29
2
CC_Fraud_Model(
  (linear_layer_stack): Sequential(
    (0): Linear(in_features=29, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
)
Model is on: cuda:0


In [None]:
# Loss: binary cross-entropy computed directly on raw logits
loss_fn = nn.BCEWithLogitsLoss()

# Optimizer: plain stochastic gradient descent over every model parameter
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=0.01,
)

In [None]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    """Return the percentage (0-100) of predictions that exactly match the labels.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels with the same shape as ``y_true``.

    Returns:
        float: ``100 * matching_elements / total_elements``.

    Raises:
        ValueError: If the two shapes differ. ``torch.eq`` broadcasts, so a
            ``(N, 1)`` prediction against an ``(N,)`` label would silently
            compare every pair and yield a meaningless (possibly >100%) score.
    """
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true shape {tuple(y_true.shape)} must match y_pred shape {tuple(y_pred.shape)}"
        )
    correct = torch.eq(y_true, y_pred).sum().item()  # torch.eq() marks element-wise matches
    # numel() equals len() for 1-D input and stays correct for other ranks
    acc = (correct / y_pred.numel()) * 100
    return acc

In [None]:
torch.manual_seed(42)
epochs = 1000

for epoch in range(epochs):
    # Training
    # Switch back to train mode each epoch: model.eval() in the testing section
    # below would otherwise stay in effect for every subsequent training step.
    model.train()

    # 1. Forward pass
    # nn.BCEWithLogitsLoss requires logits and targets of identical shape, so the
    # trailing singleton dimension of a one-logit-per-sample output is dropped to
    # line up with the 1-D targets (the evaluation pass below does the same).
    # squeeze(dim=1) only removes that dimension and never collapses the batch axis.
    y_logits = model(X_train_tensor).squeeze(dim=1)
    y_pred = torch.round(torch.sigmoid(y_logits))  # logits -> probabilities -> 0/1 labels

    # 2. Calculate loss and accuracy
    loss = loss_fn(y_logits, y_train_tensor)
    acc = accuracy_fn(y_true=y_train_tensor,
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backward
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    # Testing
    model.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model(X_test_tensor).squeeze(dim=1)
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Calculate loss and accuracy
        test_loss = loss_fn(test_logits, y_test_tensor)
        test_acc = accuracy_fn(y_true=y_test_tensor,
                                y_pred=test_pred)

    # Print out what's happening
    if epoch % 50 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test Loss: {test_loss:.5f}, Test Accuracy: {test_acc:.2f}%")

ValueError: Target size (torch.Size([227845])) must be the same as input size (torch.Size([227845, 2]))