In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'm-2-bdia-dl-project-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F84729%2F9513708%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240920%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240920T225108Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6872048ea196418e2fd07c1370c603e05edf8ccd9a823b4704d335bc6b4bc12a4908add7dea91e70021ea336e8bb9d0a3b535a90fca4e39eb0ee7e78ebedd48f9ba102c843f21883f206b8bc701fe70ba73cf2a671bafb212149ad7200299ee77e43101c8e2195c563c30112dded67adcc8aceea036cd24fd8fec63859fec7ee673600eb1005a44611b6a0788a08d6d243f3710a876d21fd4129ef2dd7a8d21e5ebf2ae4816eee4838f7f59f860e4bcdae1f028febc8446be6aa2c07e1e91f829bd60c46f6ebac4bb3beb71fb22f89fb90f83b3698c02b7bdeba9100693002a6e5bb335d697fbd355effb9836297446225759ea07dd7397f847226ec0f856cbe'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading m-2-bdia-dl-project-2024, 350412 bytes compressed
Downloaded and uncompressed: m-2-bdia-dl-project-2024
Data source import complete.


In [2]:
filename = "data.csv"
fileid = "1Dk6lxonmTQirLcq1XXaKmdun1X2_8lGh"
!wget -O $filename 'https://docs.google.com/uc?export=download&id='$fileid -q

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv(filepath_or_buffer="/kaggle/input/m-2-bdia-dl-project-2024/data.csv")
df

Unnamed: 0,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,mean_jerk_in_air1,...,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25,class
0,4860,0.000013,236.876312,308.575985,3171,7348,0.277021,0.214148,272.726149,0.040834,...,0.043534,0.021311,3.926618,3.026629,102,64740,1253.154541,193816.27650,108925,1
1,2005,0.000010,174.435893,158.661271,1798,5962,0.209676,0.125318,166.548582,0.031859,...,0.137580,0.019412,4.593336,4.676877,79,38670,1357.563486,121864.44040,88000,1
2,12960,0.000019,154.094876,66.137920,1335,9809,0.303533,0.168974,110.116398,0.036623,...,0.207932,0.019402,5.171081,2.851422,81,33010,1583.533172,165641.91900,96545,1
3,7870,0.000012,102.981366,79.647541,1420,6142,0.320674,0.156004,91.314453,0.054545,...,0.080464,0.017964,3.053848,2.758524,74,41340,878.310595,202783.16910,122115,1
4,3590,0.000009,241.464110,143.991636,1557,6218,0.220933,0.163247,192.727873,0.034884,...,0.169032,0.025185,5.383940,3.779826,52,15930,1523.603892,231407.77220,29980,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,1270,0.000008,324.458938,180.769514,1382,6677,0.322650,0.159450,252.614226,0.042374,...,0.212218,0.021903,5.322031,3.239461,81,31500,1876.255079,154462.68020,49935,0
100,4115,0.000008,177.294869,103.118202,1257,4903,0.570892,0.160674,140.206535,0.100500,...,0.133040,0.017024,3.351319,2.769629,82,33765,626.638087,73347.34392,81715,1
101,2630,0.000013,276.714723,181.317565,1639,11729,0.218019,0.164965,229.016144,0.030151,...,0.126723,0.018062,4.296292,2.584401,84,46880,1007.974936,162033.71240,128215,1
102,4830,0.000012,205.765881,98.679157,1643,8121,0.267541,0.150560,152.222519,0.046280,...,0.040198,0.025202,1.323565,1.272008,32,139575,558.595486,218550.93970,213105,1


## Split of the data : train and validation

In [4]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook

seed = 42

X = df.drop(columns=['class'])
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=seed, shuffle=True)

In [5]:
X_train

Unnamed: 0,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,mean_jerk_in_air1,...,mean_gmrt25,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25
66,8385,0.000010,227.890389,180.387039,1682,7775,0.495571,0.168285,204.138714,0.081854,...,204.885640,0.131505,0.023654,3.512483,3.139853,95,31040,1829.097294,123997.03850,62450
34,670,0.000007,536.433411,405.992762,3009,8371,0.483585,0.273639,471.213087,0.072642,...,178.369346,0.124234,0.018807,4.286445,1.910254,80,40800,1394.883578,212422.33870,77170
7,4055,0.000014,173.485920,97.468975,1163,6280,0.208911,0.148446,135.477448,0.026987,...,118.261416,0.090334,0.017186,2.729389,1.372999,95,68460,849.462387,156913.89140,249090
43,5160,0.000013,120.804174,86.853334,957,6601,0.361800,0.217459,103.828754,0.051836,...,249.729085,0.141434,0.024471,5.596487,3.184589,71,40120,1749.278166,296102.76760,144605
68,1555,0.000009,419.263442,412.504259,3258,8739,0.863873,0.339969,415.883851,0.153785,...,288.532516,0.211196,0.020693,5.692429,3.337157,60,24230,1843.348329,137208.31000,39380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2310,0.000007,257.997131,111.275889,987,4732,0.266077,0.145104,184.636510,0.037528,...,147.094679,0.121782,0.020872,3.319036,1.680629,92,37285,1841.702561,158290.02550,72575
14,2900,0.000011,305.092663,130.501402,1214,8202,0.979940,0.127559,217.797032,0.185118,...,105.215372,0.071264,0.017608,1.844563,1.505821,84,55645,874.566718,153968.14580,117830
92,9880,0.000015,137.524031,56.248264,901,7768,0.368335,0.181916,96.886147,0.057226,...,273.713114,0.212052,0.020816,5.662866,3.432212,71,38285,1705.952331,157649.09800,90485
51,2930,0.000016,333.839450,156.727564,2428,9491,0.191115,0.106993,245.283507,0.025509,...,201.788379,0.132301,0.017726,3.738211,3.079017,53,34515,1961.164856,93186.32455,85895


In [6]:
X_test

Unnamed: 0,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,mean_gmrt1,mean_jerk_in_air1,...,mean_gmrt25,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25
30,5100,1.1e-05,177.963556,237.609298,1648,6681,0.189135,0.186363,207.786427,0.024391,...,116.341654,0.076134,0.017364,2.59469,1.431078,171,131610,1663.374212,302207.896,314885
65,51980,1.6e-05,115.318238,83.448681,1694,6998,0.272513,0.14488,99.383459,0.039827,...,77.258394,0.049663,0.018368,1.665973,0.950249,129,126700,1504.768272,278744.285,298640
64,32300,1.3e-05,139.410773,94.248239,2091,5908,0.415122,0.104892,116.829506,0.072672,...,175.72705,0.102401,0.018792,3.257622,2.273153,106,44470,1615.038565,234812.3917,108950
53,2740,9e-06,354.518156,289.414055,1831,9729,0.311624,0.216704,321.966106,0.041679,...,188.204705,0.113037,0.017907,3.281408,2.635369,110,55115,1648.218997,219329.4701,124760
45,2039,1.3e-05,323.101817,206.158451,2703,10615,0.232597,0.161851,264.630134,0.028522,...,259.406572,0.16179,0.025171,6.093337,3.149393,108,35380,1706.291549,202363.7529,66470
93,2600,1e-05,229.933997,172.761858,2333,5802,0.38702,0.181342,201.347928,0.06422,...,193.667018,0.178194,0.017174,4.000781,2.392521,74,45480,1431.443492,144411.7055,79025
91,1970,1.1e-05,231.499777,90.64948,1434,5643,0.20956,0.144054,161.074629,0.031652,...,107.304461,0.063367,0.017974,2.022657,1.528793,104,71795,912.940525,145022.1189,182355
47,465,6e-06,935.805821,497.851811,1992,6070,0.860722,0.316033,716.828816,0.139155,...,189.771703,0.133241,0.016294,3.585643,2.500616,134,71075,1530.794724,184497.7541,181970
10,845,9e-06,319.136372,148.979549,1328,5488,0.250414,0.140594,234.05796,0.044115,...,198.486673,0.107724,0.020837,3.556917,3.078715,69,28295,1455.754197,238345.1474,46455
0,4860,1.3e-05,236.876312,308.575985,3171,7348,0.277021,0.214148,272.726149,0.040834,...,202.201858,0.043534,0.021311,3.926618,3.026629,102,64740,1253.154541,193816.2765,108925


In [7]:
y_train

Unnamed: 0,class
66,0
34,1
7,1
43,1
68,0
...,...
71,1
14,1
92,1
51,1


In [8]:
y_test

Unnamed: 0,class
30,1
65,1
64,1
53,0
45,0
93,1
91,1
47,1
10,0
0,1


## Data Normalization

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


## Model: Multi-Layer Perceptron (MLP) instead of Logistic Regression

In [11]:

class MLPModel(nn.Module):
    def __init__(self, input_dim):
        super(MLPModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First hidden layer
        self.bn1 = nn.BatchNorm1d(128)        # Batch Normalization for layer 1
        self.fc2 = nn.Linear(128, 64)         # Second hidden layer
        self.bn2 = nn.BatchNorm1d(64)         # Batch Normalization for layer 2
        self.fc3 = nn.Linear(64, 1)           # Output layer
        self.relu = nn.ReLU()                 # Activation function
        self.dropout = nn.Dropout(0.5)        # Dropout for regularization
        self.sigmoid = nn.Sigmoid()           # Sigmoid for binary classification

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))  # Apply BatchNorm to layer 1 output before ReLU
        x = self.dropout(x)                   # Apply dropout after BatchNorm + ReLU
        x = self.relu(self.bn2(self.fc2(x)))  # Apply BatchNorm to layer 2 output before ReLU
        return self.sigmoid(self.fc3(x))      # Output layer with Sigmoid for classification

input_dim = X_train.shape[1]
model = MLPModel(input_dim)

# Optimizer and loss function (Adam with L2 regularization)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.BCELoss()

## Training and validation : criteria AUC score

In [12]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import torch

# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available, otherwise use CPU
print(f"Using device: {device}")

# Move the model to the GPU
model = model.to(device)


# Training loop with early stopping
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # Reduce LR every 10 epochs
# Weight initialization for better convergence
def weights_init(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)  # Xavier initialization for fully connected layers
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)

# Apply weight initialization
model.apply(weights_init)

# Early stopping variables
best_val_loss = float('inf')  # To track the best validation loss
patience = 10                 # Number of epochs to wait for improvement
no_improvement = 0            # Counter to track epochs with no improvement

# Mixup function
def mixup_data(x, y, alpha=1.0):
    '''Compute the mixup data. Return mixed inputs, pairs of targets, and the lambda value'''
    if alpha > 0:
        lam = torch.distributions.Beta(alpha, alpha).sample().item()  # Generate lambda from Beta distribution
    else:
        lam = 1
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(x.device)  # Shuffle the batch indices

    # Create the new mixed input and mixed target labels
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


num_epochs = 120
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move input and labels to GPU

        inputs, y_a, y_b, lam = mixup_data(X_batch, y_batch, alpha=1.0)  # Apply Mixup data augmentation with alpha=1.0

        optimizer.zero_grad()  # Zero gradients before backpropagation
        y_pred = model(X_batch).squeeze()  # Forward pass (on GPU)
        #Compute loss with Mixup (weighted sum of two losses)
        loss = lam * criterion(y_pred, y_a) + (1 - lam) * criterion(y_pred, y_b)

        loss = criterion(y_pred, y_batch)  # Compute loss (on GPU)
        loss.backward()  # Backpropagation (on GPU)
        optimizer.step()  # Update weights (on GPU)
        # Update learning rate
        scheduler.step()

        running_loss += loss.item()

    # Validation step
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # No need to calculate gradients during validation
        X_test_tensor, y_test_tensor = X_test_tensor.to(device), y_test_tensor.to(device)  # Move test data to GPU
        y_test_pred = model(X_test_tensor).squeeze()  # Forward pass on test data (on GPU)
        val_loss = criterion(y_test_pred, y_test_tensor)  # Compute validation loss (on GPU)
        y_test_pred_class = torch.round(y_test_pred)  # Round predictions for classification

    # Convert tensors to CPU before using sklearn metrics (sklearn doesn't work with GPU tensors)
    y_test_numpy = y_test_tensor.cpu().numpy()  # Move tensor back to CPU for evaluation
    y_test_pred_numpy = y_test_pred_class.cpu().numpy()  # Move prediction tensor back to CPU

    # Compute evaluation metrics
    f1 = f1_score(y_test_numpy, y_test_pred_numpy)
    auc = roc_auc_score(y_test_numpy, y_test_pred_numpy)
    cm = confusion_matrix(y_test_numpy, y_test_pred_numpy)

    # Print evaluation results
    print(f"Epoch {epoch+1}, Training Loss: {running_loss:.4f}, F1 Score: {f1:.4f}, AUC: {auc:.4f}, Validation Loss: {val_loss:.4f}")
    print("Confusion Matrix:")
    print(cm)

    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss  # Save best validation loss
        no_improvement = 0  # Reset counter if validation loss improves
        best_model = model.state_dict()  # Save the best model's state
        print(f">>> Saving best model with validation loss {best_val_loss:.4f}")
    else:
        no_improvement += 1  # Increment counter if no improvement
        print(f"No improvement in validation loss for {no_improvement} epochs")

    # Stop training early if no improvement for 'patience' epochs
    if no_improvement >= patience:
        print("Early stopping triggered")
        break

# Load the best model after early stopping
model.load_state_dict(best_model)


Using device: cpu
Epoch 1, Training Loss: 1.8999, F1 Score: 0.2143, AUC: 0.5600, Validation Loss: 1.1967
Confusion Matrix:
[[17  0]
 [22  3]]
>>> Saving best model with validation loss 1.1967
Epoch 2, Training Loss: 1.5021, F1 Score: 0.5946, AUC: 0.6906, Validation Loss: 0.9402
Confusion Matrix:
[[16  1]
 [14 11]]
>>> Saving best model with validation loss 0.9402
Epoch 3, Training Loss: 1.2238, F1 Score: 0.6486, AUC: 0.7400, Validation Loss: 0.7847
Confusion Matrix:
[[17  0]
 [13 12]]
>>> Saving best model with validation loss 0.7847
Epoch 4, Training Loss: 1.0896, F1 Score: 0.6667, AUC: 0.7306, Validation Loss: 0.6809
Confusion Matrix:
[[16  1]
 [12 13]]
>>> Saving best model with validation loss 0.6809
Epoch 5, Training Loss: 1.0056, F1 Score: 0.8182, AUC: 0.8306, Validation Loss: 0.5982
Confusion Matrix:
[[16  1]
 [ 7 18]]
>>> Saving best model with validation loss 0.5982
Epoch 6, Training Loss: 0.9212, F1 Score: 0.8182, AUC: 0.8306, Validation Loss: 0.5863
Confusion Matrix:
[[16  1

<All keys matched successfully>

# Submission

## Downloading the test data

In [13]:
filename = "test.csv"
fileid = "1VM_ofPP6sTVUU9Nws4VgQDJWWgb4BQVx"
!wget -O $filename 'https://docs.google.com/uc?export=download&id='$fileid -q

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


In [15]:
df_test = pd.read_csv("/kaggle/input/m-2-bdia-dl-project-2024/test.csv", sep=";")
X_test = df_test.values


#Normalization above
X_test = scaler.fit_transform(X_test)

#Transforming into test tensor and loader
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
test_dataset = TensorDataset(X_test_tensor)

In [16]:
with torch.no_grad():
    y_test_pred = model(X_test_tensor).squeeze()


In [17]:
preds_list = y_test_pred.tolist()

df_submission = pd.DataFrame({"ID": range(len(preds_list)), "TARGET": preds_list})
df_submission.to_csv("./submission.csv", index=False)

In [18]:
from sklearn.metrics import accuracy_score

# Calculate and print accuracy
accuracy = accuracy_score(y_test_numpy, y_test_pred_numpy)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.8095
