# Custom scikit-learn model using PyTorch

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader


class MyDataset(Dataset):
    """Map-style dataset pairing a feature matrix with its targets.

    Both inputs are converted to ``float32`` tensors once, at construction.

    Args:
        X: Features, either a pandas ``DataFrame`` or any array-like that
            ``torch.tensor`` accepts (NumPy array, nested list, ...).
        y: Targets, either a pandas ``Series``/``DataFrame`` or an
            array-like. 1-D targets are stored as a column of shape
            ``(n_samples, 1)``; targets that already have more than one
            dimension are stored as given.
    """

    def __init__(self, X, y):
        self.x_train = torch.tensor(self._as_array(X), dtype=torch.float32)
        targets = torch.tensor(self._as_array(y), dtype=torch.float32)
        # Only 1-D targets need the extra axis; an (n, 1) input is already
        # a column and must not become (n, 1, 1).
        self.y_train = targets.unsqueeze(1) if targets.dim() == 1 else targets

    @staticmethod
    def _as_array(data):
        """Return the underlying array of a pandas object; pass anything else through."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        return data

    def __len__(self):
        """Number of samples (rows of the target tensor)."""
        return len(self.y_train)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.x_train[idx], self.y_train[idx]


# Toy data: two numeric feature columns ("a", "b") and a binary "target" column.
df = pd.DataFrame(
    {
        "a": [1, 1, 0.5],
        "b": [2.5, 2.7, 8],
        "target": [0, 0, 1],
    }
)
# Separate the label series from the feature frame.
y = df["target"]
X = df.drop(columns="target")

# testing the dataloader
# dataset = MyDataset(X, y)
# data_loader = DataLoader(dataset, batch_size = 32, shuffle = False)

In [2]:
import lightning.pytorch as pl
from torch import optim, nn
import torchmetrics


class LightningDeepLearning(pl.LightningModule):
    """
    LightningModule that trains a binary classifier with binary cross-entropy.

    Args:
        architecture: Network applied to each batch of features. Its output
            is fed directly to ``nn.BCELoss``, so it is expected to produce
            probabilities (e.g. end with ``nn.Sigmoid``).
        lr: Learning rate handed to the Adam optimizer.
    """
    def __init__(self, architecture, lr=1e-3):
        super().__init__()
        self.architecture = architecture
        self.lr = lr
        self.loss = nn.BCELoss()
        self.accuracy = torchmetrics.Accuracy(task="binary")

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log the step accuracy.

        ``batch`` is unpacked as an ``(x, y)`` pair; ``batch_idx`` is unused.
        Returns the loss tensor.
        """
        # training_step defines the train loop.
        # it is independent of forward
        x, y = batch
        pred = self.architecture(x)
        loss = self.loss(pred, y)

        # log step metric
        self.accuracy(pred, y)  # compute metrics
        self.log('train_acc_step', self.accuracy)  # log metric object

        return loss

    def forward(self, x):
        """Run the wrapped architecture on ``x`` and return its output."""
        pred = self.architecture(x)
        return pred

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters, using ``self.lr``."""
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

# testing the model
# input_size = 2
# output_size = 1
# architecture = nn.Sequential(
#     nn.Linear(input_size, 64), nn.ReLU(),
#     nn.Linear(64, output_size), nn.Sigmoid()
# )

# model = LightningDeepLearning(architecture)

In [3]:
# architecture(torch.tensor([[0, 0], [0, 1]], dtype=torch.float32))

In [4]:
# train the model (hint: here are some helpful Trainer arguments for rapid idea iteration)
# trainer = pl.Trainer(limit_train_batches = 100, max_epochs = 5)
# trainer.fit(model = model, train_dataloaders = data_loader)

In [5]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class TemplateClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style binary classifier backed by a small PyTorch network.

    The network is ``Linear(input_size, 64) -> ReLU -> Linear(64, 1) -> Sigmoid``
    and is trained through ``LightningDeepLearning``.

    Args:
        input_size: Number of input features.
        batch_size: Mini-batch size used by the training ``DataLoader``.
        max_epochs: Number of epochs passed to ``pl.Trainer``.

    Attributes:
        classes_: Unique labels seen in ``y``; set by ``fit``.
    """

    def __init__(self, input_size, batch_size=32, max_epochs=10):
        self.input_size = input_size
        self.batch_size = batch_size
        self.max_epochs = max_epochs
        # NOTE(review): the network is created once, here. A second call to
        # fit() therefore keeps training the same weights, and changing
        # input_size after construction does not rebuild the network.
        self.architecture = nn.Sequential(
            nn.Linear(input_size, 64), nn.ReLU(),
            nn.Linear(64, 1), nn.Sigmoid()
        )
        self.base = LightningDeepLearning(self.architecture)

    def fit(self, X, y):
        """Train the network on ``(X, y)``.

        ``X`` and ``y`` are handed to ``MyDataset`` unchanged.

        Returns:
            self, so calls can be chained as the scikit-learn API expects.
        """
        # Create dataloader
        dataset = MyDataset(X, y)
        data_loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)
        # Train the model
        trainer = pl.Trainer(max_epochs=self.max_epochs)
        trainer.fit(model=self.base, train_dataloaders=data_loader)
        # Record the observed labels, as scikit-learn classifiers do.
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        """Predict a 0/1 label for every row of ``X``.

        Args:
            X: pandas ``DataFrame`` or array-like of shape
                ``(n_samples, input_size)``.

        Returns:
            1-D ``int32`` NumPy array of rounded network outputs.
        """
        x = torch.tensor(np.asarray(X), dtype=torch.float32)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            pred = self.base(x)
        return pred.reshape(-1).numpy().round().astype(np.int32)

In [6]:
# Smoke-test the sklearn-style wrapper on the toy data.
# The input width is taken from X so it cannot drift from the feature count.
my_estimator = TemplateClassifier(input_size=X.shape[1])
my_estimator.fit(X, y)
my_estimator.predict(X)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name         | Type           | Params
------------------------------------------------
0 | architecture | Sequential     | 257   
1 | loss         | BCELoss        | 0     
2 | accuracy     | BinaryAccuracy | 0     
------------------------------------------------
257       Trainable params
0         Non-trainable params
257       Total params
0.001     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Epoch 9: 100%|██████████| 1/1 [00:00<00:00, 84.40it/s, v_num=34] 

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1/1 [00:00<00:00, 56.61it/s, v_num=34]


array([0, 0, 1], dtype=int32)

In [7]:
# Baseline for comparison: a stock scikit-learn model fitted on the same data.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=0).fit(X, y)
model.predict(X)

array([0, 0, 1])