# Neural network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [None]:
# Build the processed train/test frames from the raw transaction and identity
# tables. preprocess_and_detect_uid is defined outside this section.
train_processed, test_processed = preprocess_and_detect_uid(train_df, test_df, train_identity, test_identity)
# Only the test frame's column names have '-' swapped for '_' here.
# NOTE(review): presumably the raw test columns use hyphens where the train
# columns use underscores — confirm against the input files.
test_processed.columns = test_processed.columns.str.replace('-', '_')

Starting preprocessing...
Mem. usage decreased to 542.35 Mb (0.0% reduction)
Mem. usage decreased to 472.59 Mb (0.0% reduction)
Preprocessing and UID detection complete.


In [None]:
# Swap +/-inf for NaN in place on both frames so that the imputation done in
# preprocess_for_NN can treat them as ordinary missing values.
for frame in (train_processed, test_processed):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np


def preprocess_for_NN(input_train_df, input_test_df, target_column='isFraud'):
    """Convert the train/test frames into standardized, fully numeric arrays.

    Steps, in order: drop the target, keep only the columns present in both
    frames, turn +/-inf into NaN, median-impute numeric columns, fill and
    frequency-encode object columns, check that everything is numeric, then
    standardize every column. The imputers and the scaler are fitted on the
    training frame and only applied to the test frame; category frequencies
    are computed over train and test combined.

    Args:
        input_train_df: Training frame; may contain ``target_column``.
        input_test_df: Test frame. Columns it lacks are dropped from the
            training frame as well.
        target_column: Label column removed from the training frame when
            present.

    Returns:
        ``(train_array, test_array)``: the outputs of ``StandardScaler``,
        with columns in the same order for both.

    Raises:
        ValueError: If a column is still non-numeric after encoding
            (for example a ``category`` or datetime column).
    """
    # Work on local copies so the caller's frames are never modified.
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()

    # The label must not end up in the feature matrix.
    if target_column in train_df.columns:
        train_df = train_df.drop(columns=[target_column])

    # Keep only columns both frames share, in training order. Indexing the
    # test frame with a train-only column would raise KeyError.
    shared_columns = [col for col in train_df.columns if col in test_df.columns]

    # Infinities become NaN up front so that one imputation pass covers them
    # and the medians are not computed with +/-inf in the data.
    train_df = train_df[shared_columns].replace([np.inf, -np.inf], np.nan)
    test_df = test_df[shared_columns].replace([np.inf, -np.inf], np.nan)

    # np.number also matches down-cast dtypes (int8, float32, ...), not just
    # the 64-bit ones, so no numeric column is left un-imputed.
    numeric_columns = train_df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = train_df.select_dtypes(include=['object']).columns
    print(categorical_columns)
    categorical_columns = categorical_columns.tolist()

    # SimpleImputer rejects a frame with zero columns, so skip empty groups.
    if numeric_columns:
        numeric_imputer = SimpleImputer(strategy='median')
        train_df[numeric_columns] = numeric_imputer.fit_transform(train_df[numeric_columns])
        test_df[numeric_columns] = numeric_imputer.transform(test_df[numeric_columns])

    if categorical_columns:
        categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
        train_df[categorical_columns] = categorical_imputer.fit_transform(train_df[categorical_columns])
        test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

        # Frequency encoding: each category is replaced by its share of the
        # combined train+test rows; anything unmapped becomes 0.
        for col in categorical_columns:
            combined = pd.concat([train_df[col], test_df[col]])
            freq_encoding = combined.value_counts(normalize=True)
            train_df[col] = train_df[col].map(freq_encoding).fillna(0)
            test_df[col] = test_df[col].map(freq_encoding).fillna(0)

    # Fail loudly on dtypes this function does not know how to encode.
    for col in train_df.columns:
        if not pd.api.types.is_numeric_dtype(train_df[col]):
            raise ValueError(f"Column {col} is not numeric after preprocessing! Values: {train_df[col].unique()}")

    # Standardize every column using training statistics only.
    scaler = StandardScaler()
    train_df = scaler.fit_transform(train_df)
    test_df = scaler.transform(test_df)

    return train_df, test_df


In [None]:
def preprocess_for_NN(input_train_df, input_test_df, target_column='isFraud'):
    """Build standardized feature arrays for the neural network (Method B).

    Steps, in order: drop the target, keep only the columns present in both
    frames, turn +/-inf into NaN, fill and frequency-encode object columns,
    median-impute every remaining NaN, then standardize. The imputers and
    the scaler are fitted on the training frame and only applied to the test
    frame; category frequencies are computed over train and test combined.

    Args:
        input_train_df: Training frame; may contain ``target_column``.
        input_test_df: Test frame. Columns it lacks are dropped from the
            training frame as well.
        target_column: Label column removed from the training frame when
            present.

    Returns:
        ``(train_array, test_array)``: the outputs of ``StandardScaler``,
        with columns in the same order for both.
    """
    print("Starting preprocessing (Method B)...")
    # Copies keep the caller's frames untouched.
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()

    # Drop target column
    if target_column in train_df.columns:
        train_df = train_df.drop(columns=[target_column])

    # Keep only columns both frames share, in training order. Indexing the
    # test frame with a train-only column would raise KeyError.
    shared_columns = [col for col in train_df.columns if col in test_df.columns]

    # Infinities become NaN before any statistics are computed, so the
    # medians below are not computed with +/-inf in the data.
    train_df = train_df[shared_columns].replace([np.inf, -np.inf], np.nan)
    test_df = test_df[shared_columns].replace([np.inf, -np.inf], np.nan)

    categorical_columns = train_df.select_dtypes(include=['object']).columns.tolist()

    # SimpleImputer rejects a frame with zero columns, so only run the
    # categorical branch when there is something to encode.
    if categorical_columns:
        categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
        train_df[categorical_columns] = categorical_imputer.fit_transform(train_df[categorical_columns])
        test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

        # Frequency encoding: each category is replaced by its share of the
        # combined train+test rows; anything unmapped becomes 0.
        for col in categorical_columns:
            combined = pd.concat([train_df[col], test_df[col]])
            freq_encoding = combined.value_counts(normalize=True)
            train_df[col] = train_df[col].map(freq_encoding).fillna(0)
            test_df[col] = test_df[col].map(freq_encoding).fillna(0)

    # One median imputation over the whole frame covers every numeric dtype
    # (including down-cast ones) as well as the former infinities.
    median_imputer = SimpleImputer(strategy='median')
    train_values = median_imputer.fit_transform(train_df)
    test_values = median_imputer.transform(test_df)

    # Standardize every column using training statistics only.
    scaler = StandardScaler()
    train_values = scaler.fit_transform(train_values)
    test_values = scaler.transform(test_values)

    print("Preprocessing (Method B) complete.")
    return train_values, test_values

In [None]:
# Run whichever preprocess_for_NN definition is currently bound, then
# sanity-check the result. verify_preprocessed_data is defined outside this
# section.
train_df_NN, test_df_NN = preprocess_for_NN(train_processed, test_processed, target_column='isFraud')
verify_preprocessed_data(train_df_NN, test_df_NN)
# Convert the preprocessed arrays to float32 PyTorch tensors for the network.
train_tensor = torch.tensor(train_df_NN, dtype=torch.float32)
test_tensor = torch.tensor(test_df_NN, dtype=torch.float32)

Starting preprocessing (Method B)...
Preprocessing (Method B) complete.
=== Verification of Preprocessed Data ===
NaN values in train data: False
NaN values in test data: False
Infinity values in train data: False
Infinity values in test data: False
Max value in train data: 604.9377178842773
Min value in train data: -186.37770000411408
Max value in test data: 2431.967463663471
Min value in test data: -186.37770000411408
Shape of train data: (590540, 445)
Shape of test data: (506691, 445)
Zero-variance columns in train data: 0
Zero-variance columns in test data: 0
=== Verification Completed ===


In [None]:
# Re-run the checks on the current arrays; verify_preprocessed_data is
# defined outside this section.
# NOTE(review): the recorded output of this cell reports NaNs, unlike the
# run in the previous cell — presumably it was captured with a different
# preprocess_for_NN definition; confirm by re-running the notebook in order.
verify_preprocessed_data(train_df_NN, test_df_NN)

=== Verification of Preprocessed Data ===
NaN values in train data: True
NaN values in test data: True
Infinity values in train data: False
Infinity values in test data: False
Max value in train data: nan
Min value in train data: nan
Max value in test data: nan
Min value in test data: nan
Shape of train data: (590540, 445)
Shape of test data: (506691, 445)
Zero-variance columns in train data: 0
Zero-variance columns in test data: 0
=== Verification Completed ===


In [None]:
import pandas as pd

# The preprocessed arrays carry no column labels, so synthesize positional
# ones (swap in the real names if they are available).
n_features = train_df_NN.shape[1]
column_names = ["feature_" + str(i) for i in range(n_features)]

# Wrap both arrays in DataFrames that share those labels.
train_df_NN_df = pd.DataFrame(train_df_NN, columns=column_names)
test_df_NN_df = pd.DataFrame(test_df_NN, columns=column_names)

# Per-column NaN counts, keeping only the columns that have at least one.
train_nan_counts = train_df_NN_df.isna().sum()
nan_train = train_nan_counts.loc[train_nan_counts.gt(0)]
print("Checking for NaN values in training dataset:")
print(nan_train)

test_nan_counts = test_df_NN_df.isna().sum()
nan_test = test_nan_counts.loc[test_nan_counts.gt(0)]
print("\nChecking for NaN values in testing dataset:")
print(nan_test)

Checking for NaN values in training dataset:
Series([], dtype: int64)

Checking for NaN values in testing dataset:
Series([], dtype: int64)


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Assume train_tensor and y_train are ready from preprocessing

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible. The tensor is viewed as a NumPy array for scikit-learn.
features = train_tensor.numpy()
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    features,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train_split)
X_train_split = scaler.transform(X_train_split)
X_val_split = scaler.transform(X_val_split)

# Back to float32 tensors for PyTorch.
train_tensor_split = torch.tensor(X_train_split, dtype=torch.float32)
val_tensor = torch.tensor(X_val_split, dtype=torch.float32)

# Define the Autoencoder Neural Network
class Autoencoder(nn.Module):
    """Fully connected autoencoder: input -> 128 -> bottleneck -> 128 -> input.

    The encoder applies ReLU after both of its linear layers; the decoder
    applies ReLU only after its first linear layer, so the reconstruction is
    unbounded.
    """

    def __init__(self, input_dim, encoding_dim):
        """Create the layers.

        Args:
            input_dim: Number of input (and reconstructed) features.
            encoding_dim: Width of the bottleneck representation.
        """
        super().__init__()
        hidden_width = 128

        # Layers are created encoder-first so parameter initialization order
        # matches the attribute order.
        encoder_layers = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, encoding_dim),
            nn.ReLU(),
        ]
        decoder_layers = [
            nn.Linear(encoding_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` after the encode/decode pass."""
        return self.decoder(self.encoder(x))

# Initialize the Autoencoder
input_dim = train_tensor_split.shape[1]  # Number of features
encoding_dim = 32  # Bottleneck size
model = Autoencoder(input_dim, encoding_dim)

# Reconstruction objective: mean squared error between input and output.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
epochs = 50
batch_size = 128
num_samples = train_tensor_split.size(0)

for epoch in range(epochs):
    model.train()
    # Fresh shuffle every epoch; batches are consecutive slices of it.
    permutation = torch.randperm(num_samples)
    epoch_loss = 0.0

    for i in range(0, num_samples, batch_size):
        indices = permutation[i:i + batch_size]
        batch_data = train_tensor_split[indices]

        # Forward pass: the target of the reconstruction is the input itself.
        outputs = model(batch_data)
        loss = criterion(outputs, batch_data)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the *mean* over the batch, so weight it by the
        # batch size; dividing the sum by num_samples below then gives the
        # true per-sample average (the last batch may be smaller).
        epoch_loss += loss.item() * batch_data.size(0)

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / num_samples:.4f}")



Epoch 1/50, Loss: 0.0026
Epoch 2/50, Loss: 0.0020
Epoch 3/50, Loss: 0.0019
Epoch 4/50, Loss: 0.0017
Epoch 5/50, Loss: 0.0016
Epoch 6/50, Loss: 0.0016
Epoch 7/50, Loss: 0.0015
Epoch 8/50, Loss: 0.0015
Epoch 9/50, Loss: 0.0014
Epoch 10/50, Loss: 0.0014
Epoch 11/50, Loss: 0.0015
Epoch 12/50, Loss: 0.0014
Epoch 13/50, Loss: 0.0014
Epoch 14/50, Loss: 0.0013
Epoch 15/50, Loss: 0.0013
Epoch 16/50, Loss: 0.0013
Epoch 17/50, Loss: 0.0012
Epoch 18/50, Loss: 0.0012
Epoch 19/50, Loss: 0.0013
Epoch 20/50, Loss: 0.0013
Epoch 21/50, Loss: 0.0012
Epoch 22/50, Loss: 0.0012
Epoch 23/50, Loss: 0.0012
Epoch 24/50, Loss: 0.0012
Epoch 25/50, Loss: 0.0011
Epoch 26/50, Loss: 0.0011
Epoch 27/50, Loss: 0.0011
Epoch 28/50, Loss: 0.0013
Epoch 29/50, Loss: 0.0013
Epoch 30/50, Loss: 0.0012
Epoch 31/50, Loss: 0.0012
Epoch 32/50, Loss: 0.0011
Epoch 33/50, Loss: 0.0013
Epoch 34/50, Loss: 0.0012
Epoch 35/50, Loss: 0.0013
Epoch 36/50, Loss: 0.0013
Epoch 37/50, Loss: 0.0013
Epoch 38/50, Loss: 0.0012
Epoch 39/50, Loss: 0.

AttributeError: 'Series' object has no attribute 'numpy'

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Define the Autoencoder Neural Network
class Autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    Layout: ``input_dim -> hidden_dim -> encoding_dim -> hidden_dim ->
    input_dim``. The encoder applies ReLU after both of its linear layers;
    the decoder applies ReLU only after its first linear layer, so the
    reconstruction is unbounded.
    """

    def __init__(self, input_dim, encoding_dim, hidden_dim=128):
        """Create the layers.

        Args:
            input_dim: Number of input (and reconstructed) features.
            encoding_dim: Width of the bottleneck representation.
            hidden_dim: Width of the hidden layer on each side of the
                bottleneck. Defaults to 128, the previously hard-coded
                value, so existing two-argument calls are unaffected.
        """
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoding_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` after the encode/decode pass."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Assume train_tensor and y_train are ready from preprocessing

# Scale the entire dataset first.
# NOTE(review): the scaler is fitted on all rows before the folds are made,
# so every validation fold contributes to the scaling statistics. It is kept
# at module level because `scaler` is the transform the final `model` was
# trained behind.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_tensor.numpy())
y = y_train.values  # Assuming y_train is a pandas Series

# Cross-validation setup: stratified so each fold keeps the label ratio.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

encoding_dim = 32  # Bottleneck size
epochs = 50
batch_size = 128

auc_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled, y)):
    print(f"Starting Fold {fold + 1}")
    X_train_split, X_val_split = X_scaled[train_idx], X_scaled[val_idx]
    y_train_split, y_val_split = y[train_idx], y[val_idx]

    # Convert to tensors
    train_tensor_split = torch.tensor(X_train_split, dtype=torch.float32)
    val_tensor = torch.tensor(X_val_split, dtype=torch.float32)
    num_samples = train_tensor_split.size(0)

    # A fresh model and optimizer per fold, so folds do not share weights.
    input_dim = train_tensor_split.shape[1]
    model = Autoencoder(input_dim, encoding_dim)

    # Reconstruction objective: mean squared error between input and output.
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training Loop
    for epoch in range(epochs):
        model.train()
        # Fresh shuffle every epoch; batches are consecutive slices of it.
        permutation = torch.randperm(num_samples)
        epoch_loss = 0.0

        for i in range(0, num_samples, batch_size):
            indices = permutation[i:i + batch_size]
            batch_data = train_tensor_split[indices]

            # Forward pass: the reconstruction target is the input itself.
            outputs = model(batch_data)
            loss = criterion(outputs, batch_data)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the *mean* over the batch, so weight it by
            # the batch size; dividing the sum by num_samples below then
            # gives the true per-sample average.
            epoch_loss += loss.item() * batch_data.size(0)

        print(f"Fold {fold + 1}, Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / num_samples:.4f}")

    # Evaluate on validation data: per-row mean squared reconstruction error.
    model.eval()
    with torch.no_grad():
        val_reconstructed = model(val_tensor)
        reconstruction_loss = torch.mean((val_reconstructed - val_tensor) ** 2, dim=1).numpy()

    # Anomaly Detection Threshold: rows above the 95th percentile of the
    # error are flagged. y_pred is not used by the AUC below, which ranks the
    # raw errors directly.
    threshold = np.percentile(reconstruction_loss, 95)  # Adjust the percentile as needed
    y_pred = (reconstruction_loss > threshold).astype(int)

    # Calculate AUC using the reconstruction error as the anomaly score.
    auc = roc_auc_score(y_val_split, reconstruction_loss)
    auc_scores.append(auc)
    print(f"Fold {fold + 1}, AUC Score: {auc:.4f}")

# Final Cross-Validation AUC
print(f"Final Cross-Validation AUC: {np.mean(auc_scores):.4f}")


Starting Fold 1
Fold 1, Epoch 1/50, Loss: 0.0027
Fold 1, Epoch 2/50, Loss: 0.0020
Fold 1, Epoch 3/50, Loss: 0.0018
Fold 1, Epoch 4/50, Loss: 0.0016
Fold 1, Epoch 5/50, Loss: 0.0017
Fold 1, Epoch 6/50, Loss: 0.0015
Fold 1, Epoch 7/50, Loss: 0.0014
Fold 1, Epoch 8/50, Loss: 0.0014
Fold 1, Epoch 9/50, Loss: 0.0014
Fold 1, Epoch 10/50, Loss: 0.0013
Fold 1, Epoch 11/50, Loss: 0.0012
Fold 1, Epoch 12/50, Loss: 0.0014
Fold 1, Epoch 13/50, Loss: 0.0013
Fold 1, Epoch 14/50, Loss: 0.0014
Fold 1, Epoch 15/50, Loss: 0.0013
Fold 1, Epoch 16/50, Loss: 0.0012
Fold 1, Epoch 17/50, Loss: 0.0013
Fold 1, Epoch 18/50, Loss: 0.0012
Fold 1, Epoch 19/50, Loss: 0.0012
Fold 1, Epoch 20/50, Loss: 0.0012
Fold 1, Epoch 21/50, Loss: 0.0012
Fold 1, Epoch 22/50, Loss: 0.0011
Fold 1, Epoch 23/50, Loss: 0.0011
Fold 1, Epoch 24/50, Loss: 0.0011
Fold 1, Epoch 25/50, Loss: 0.0012
Fold 1, Epoch 26/50, Loss: 0.0011
Fold 1, Epoch 27/50, Loss: 0.0011
Fold 1, Epoch 28/50, Loss: 0.0015
Fold 1, Epoch 29/50, Loss: 0.0012
Fold 1,

In [None]:
# Sanity check on the most recent training split: stop if any entry is
# +/-inf. The matching NaN check on the next line is currently disabled.
#assert not torch.isnan(train_tensor_split).any(), "NaN values found in train_tensor_split"
assert not torch.isinf(train_tensor_split).any(), "Inf values found in train_tensor_split"


In [None]:

# Prediction for Test Set
# Both training cells pass the float32 training features through `scaler`
# before fitting `model`, so the test features must go through the same
# transform; otherwise the model scores inputs on a different scale than it
# was trained on.
test_features = scaler.transform(np.asarray(test_df_NN, dtype=np.float32))
test_tensor = torch.tensor(test_features, dtype=torch.float32)

# Inference only: evaluation mode and no gradient tracking.
model.eval()
with torch.no_grad():
    test_reconstructed = model(test_tensor)
    # Per-row mean squared reconstruction error, used as the anomaly score.
    test_reconstruction_loss = torch.mean((test_reconstructed - test_tensor) ** 2, dim=1).numpy()

# Submission
submission = pd.DataFrame({
    "TransactionID": test_processed["TransactionID"],
    # The raw reconstruction error is written as the score; it is not a
    # probability and is not bounded to [0, 1].
    "isFraud": test_reconstruction_loss
})
submission.to_csv("autoencoder_submission.csv", index=False)
print("Submission file created: autoencoder_submission.csv")

Submission file created: autoencoder_submission.csv


In [None]:
# Persist the learned parameters (state_dict) of whichever `model` is
# currently bound; the module object itself is not written to the file.
torch.save(model.state_dict(), "autoencoder_model.pth")
print("Model saved as 'autoencoder_model.pth'")

Model saved as 'autoencoder_model.pth'
