In [57]:
from platform import python_version
print(python_version())

3.6.13


In [58]:
torch.cuda.is_available()

True

In [20]:
# !pip install transformers
# !pip install pytorch_lightning

import pytorch_lightning as pl
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

pd.set_option('display.max_colwidth', None)
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'  # This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2.
BATCH_SIZE = 60
N_EPOCHS = 3  

In [21]:
def join_columns(row):
    final = []
    for col in df.columns[:4]:
      aux = []
      aux.append(col)
      aux.append(str(row[col]))
      final.append(' '.join(aux))
    return ', '.join(final)

In [22]:
def build_range(num):
  pattern = "{:03d}-{:03d}"
  aux = int(num * 10)
  res = aux % 10
  if res < 3:
    return pattern.format(aux - res, aux - res + 2)
  if res < 6:
    return pattern.format(aux - res + 3, aux - res + 5)
  return pattern.format(aux - res + 6, aux - res + 10)

In [23]:
build_range(5.5)

'053-055'

In [24]:
# iris = datasets.load_breast_cancer()
# iris = datasets.load_wine()
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# df['sepal length'] = df['sepal length (cm)'].apply(lambda x: build_range(x))
# df['sepal width'] = df['sepal width (cm)'].apply(lambda x: build_range(x))
# df['petal length'] = df['petal length (cm)'].apply(lambda x: build_range(x))
# df['petal width'] = df['petal width (cm)'].apply(lambda x: build_range(x))

print(len(df))
df.head()

150


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [25]:
df['text'] = df.apply(join_columns, axis=1)
df_text = df[['text', 'target']].copy()
df_text.head()

Unnamed: 0,text,target
0,"sepal length (cm) 5.1, sepal width (cm) 3.5, petal length (cm) 1.4, petal width (cm) 0.2",0
1,"sepal length (cm) 4.9, sepal width (cm) 3.0, petal length (cm) 1.4, petal width (cm) 0.2",0
2,"sepal length (cm) 4.7, sepal width (cm) 3.2, petal length (cm) 1.3, petal width (cm) 0.2",0
3,"sepal length (cm) 4.6, sepal width (cm) 3.1, petal length (cm) 1.5, petal width (cm) 0.2",0
4,"sepal length (cm) 5.0, sepal width (cm) 3.6, petal length (cm) 1.4, petal width (cm) 0.2",0


In [26]:
set(df['target'])

{0, 1, 2}

In [27]:
df_text['text'][0]

'sepal length (cm) 5.1, sepal width (cm) 3.5, petal length (cm) 1.4, petal width (cm) 0.2'

In [28]:
text = df_text['text']
labels = df_text['target']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(text, labels,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=labels)

In [30]:
X_train[0]

'sepal length (cm) 5.1, sepal width (cm) 3.5, petal length (cm) 1.4, petal width (cm) 0.2'

In [31]:
MAX_LEN = X_train.apply(lambda s: len([x for x in s.split()])).max()
MAX_LEN

16

In [32]:
#define a tokenizer object
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)


#tokenize the text (padding to max sequence in batch)
train_encodings = tokenizer(list(X_train.values), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test.values), truncation=True, padding=True)

In [33]:
#print the first paragraph and its transformation
print(f'First paragraph: \'{X_train[0]}\'')
print(f'Input ids: {train_encodings["input_ids"][0]}')
print(f'Attention mask: {train_encodings["attention_mask"][0]}')

First paragraph: 'sepal length (cm) 5.1, sepal width (cm) 3.5, petal length (cm) 1.4, petal width (cm) 0.2'
Input ids: [101, 19802, 2389, 3091, 1006, 4642, 1007, 1018, 1012, 1018, 1010, 19802, 2389, 9381, 1006, 4642, 1007, 1016, 1012, 1023, 1010, 9004, 2389, 3091, 1006, 4642, 1007, 1015, 1012, 1018, 1010, 9004, 2389, 9381, 1006, 4642, 1007, 1014, 1012, 1016, 102]
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [34]:
y_train.to_numpy()

array([0, 2, 1, 0, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 2, 2, 0, 1, 0,
       2, 0, 1, 2, 2, 0, 2, 0, 0, 1, 1, 0, 2, 2, 1, 1, 2, 1, 0, 1, 0, 2,
       0, 0, 2, 0, 0, 0, 0, 1, 2, 1, 0, 2, 1, 2, 0, 2, 0, 1, 2, 0, 1, 1,
       2, 1, 1, 2, 0, 0, 0, 2, 1, 2, 1, 2, 2, 1, 0, 2, 1, 0, 2, 0, 2, 1,
       1, 0, 1, 2, 0, 0, 2, 2, 2, 1, 2, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 1,
       0, 2, 1, 1, 0, 0, 0, 0, 1, 0])

In [35]:
from torch.utils.data import Dataset, DataLoader

class TabularToTextDataset(Dataset):

  def __init__(self, data, label):
    self.data = data 
    self.label = torch.tensor(label.to_numpy(), dtype=torch.long)
    self.keys = list(self.data.keys())

  def __getitem__(self, idx):
    instance = {key: self.data[key][idx] for key in self.keys}
    label = self.label[idx]
    return instance, label
  
  def __len__(self):
    return len(self.label)


In [36]:
class TabularToTextDM(pl.LightningDataModule):
  
  def __init__(self, train_encodings, y_train, test_encodings, y_test):
    super().__init__()
    self.train_encodings = train_encodings
    self.y_train = y_train 
    self.test_encodings = test_encodings
    self.y_test = y_test

  def train_dataloader(self,):
    train_dataset = TabularToTextDataset(self.train_encodings, self.y_train)
    train_loader = DataLoader(train_dataset, batch_size=12)
    return train_loader
  
  def test_dataloader(self,):
    test_dataset = TabularToTextDataset(self.test_encodings, self.y_test)
    test_loader = DataLoader(test_dataset, batch_size=12)
    return test_loader

  def val_dataloader(self,):
    test_dataset = TabularToTextDataset(self.test_encodings, self.y_test)
    test_loader = DataLoader(test_dataset, batch_size=12)
    return test_loader


In [37]:
data_module = TabularToTextDM(train_encodings, y_train, test_encodings, y_test)
data_module.train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x7f4c9a0f7630>

In [38]:
for x, y in data_module.train_dataloader():
  print(x)
  print(y)
  break

{'input_ids': [tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101]), tensor([19802, 19802, 19802, 19802, 19802, 19802, 19802, 19802, 19802, 19802,
        19802, 19802]), tensor([2389, 2389, 2389, 2389, 2389, 2389, 2389, 2389, 2389, 2389, 2389, 2389]), tensor([3091, 3091, 3091, 3091, 3091, 3091, 3091, 3091, 3091, 3091, 3091, 3091]), tensor([1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006]), tensor([4642, 4642, 4642, 4642, 4642, 4642, 4642, 4642, 4642, 4642, 4642, 4642]), tensor([1007, 1007, 1007, 1007, 1007, 1007, 1007, 1007, 1007, 1007, 1007, 1007]), tensor([1018, 1018, 1020, 1018, 1019, 1020, 1019, 1020, 1021, 1021, 1021, 1020]), tensor([1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012]), tensor([1018, 1023, 1022, 1023, 1019, 1017, 1020, 1017, 1021, 1021, 1020, 1014]), tensor([1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010]), tensor([19802, 19802, 19802, 19802, 19802, 19802, 19802, 19802, 19802, 1980

In [39]:
# train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
#                                                     list(y_train.values)))

# test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
#                                                     list(y_test.values)))

In [40]:
import torch 
from torch import nn 
from torch import optim 
import torchmetrics 


In [41]:
class DistilBertTabular(pl.LightningModule):
  def __init__(self, num_classes):
    super().__init__()
    self.model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
    self.model.classifier = nn.Linear(768, num_classes)
    self.model.dropout = nn.Identity()

    self.train_acc = torchmetrics.Accuracy()
    self.val_acc = torchmetrics.Accuracy()

    self.loss_fn = nn.CrossEntropyLoss()
  
  def forward(self, **kwargs):
    preds = self.model(**kwargs)
    return preds.logits

  def training_step(self, batch, batch_id):
    batch, labels = batch
    keys = batch.keys()
    instance = {}
    for key in keys:
      instance[key] = torch.stack(batch[key], dim=0).transpose(1, 0).cuda()
    pred = self.forward(**instance)
    loss_value = self.loss_fn(pred, labels)
    acc = self.train_acc(pred, labels)
    self.log("train_loss", loss_value, on_step=True, on_epoch=True, prog_bar=True, logger=True)
    return loss_value
  
  def training_epoch_end(self, training_step_outputs):
    train_acc = self.train_acc.compute()
    self.log("train_acc", train_acc, on_epoch=True, prog_bar=True, logger=True)
  
  def validation_step(self, batch, batch_id):
    batch, labels = batch
    keys = batch.keys()
    instance = {}
    for key in keys:
      instance[key] = torch.stack(batch[key], dim=0).transpose(1, 0).cuda()
    pred = self.forward(**instance)
    loss_value = self.loss_fn(pred, labels)
    acc = self.val_acc(pred, labels)
    self.log('val_loss', loss_value, on_step=True, on_epoch=True, prog_bar=True, logger=True)
    return pred
  
  def test_step(self, batch, batch_id):
    return self.validation_step(batch, batch_id)
  
  def validation_epoch_end(self, validation_step_outputs):
    acc = self.val_acc.compute()
    print('validation acc', acc)
    self.log("val_acc", acc, on_epoch=True, prog_bar=True, logger=True)
    return acc 

  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=1e-5)


In [42]:
model = DistilBertTabular(3)
model = model.cuda()

In [43]:
trainer = pl.Trainer(gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [44]:
trainer.test(model, data_module.val_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'val_loss': 1.0836148262023926, 'val_loss_epoch': 1.0836148262023926}
--------------------------------------------------------------------------------


[{'val_loss': 1.0836148262023926, 'val_loss_epoch': 1.0836148262023926}]

In [45]:
trainer.fit(model=model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type                                | Params
------------------------------------------------------------------
0 | model     | DistilBertForSequenceClassification | 67.0 M
1 | train_acc | Accuracy                            | 0     
2 | val_acc   | Accuracy                            | 0     
3 | loss_fn   | CrossEntropyLoss                    | 0     
------------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.823   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

validation acc tensor(0.4074, device='cuda:0')


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

validation acc tensor(0.5119, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.5965, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.6458, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.6782, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.7010, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.7350, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.7576, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.7755, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.7901, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8023, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8125, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8213, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8288, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8354, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8413, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8464, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8511, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8552, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8590, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8624, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8670, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8711, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8737, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8760, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8781, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8801, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8819, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8837, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8842, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8847, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8852, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8856, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8851, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8855, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8859, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8854, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8857, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8869, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8864, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8860, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8855, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8858, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8862, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8865, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8875, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8870, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8873, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8876, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8871, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8874, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8876, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8879, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8875, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8877, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8879, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8875, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8872, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8874, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8871, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8873, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8875, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8877, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8879, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8880, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8882, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8884, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8886, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8892, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8894, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8900, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8897, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8898, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8899, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8901, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8902, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8907, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8913, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8918, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8923, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8928, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8933, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8938, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8943, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8947, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8952, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8956, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8956, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8957, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8961, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8962, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8962, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8962, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8963, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8963, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8963, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8964, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8964, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8965, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8965, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8965, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8966, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8966, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8966, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8970, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8973, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8976, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8977, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8977, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8977, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8977, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8978, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8978, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8978, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8978, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8978, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8978, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8979, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8979, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8979, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8979, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8979, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8980, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8980, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8980, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8980, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8980, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8983, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8983, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8983, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8983, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8983, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8984, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8985, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8986, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8988, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8991, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8993, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8995, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8997, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.8999, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9001, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9002, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9003, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9005, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9006, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9006, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9006, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9006, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9007, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9009, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9010, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9011, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9013, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9014, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9015, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9016, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9018, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9019, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9019, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9020, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9021, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9023, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9024, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9025, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9026, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9027, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9027, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9028, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9030, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9031, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9032, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9033, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9034, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9035, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9036, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9037, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9039, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9040, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9041, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9042, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9043, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9044, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9045, device='cuda:0')


Validating: 0it [00:00, ?it/s]

validation acc tensor(0.9046, device='cuda:0')


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [48]:
from scipy.interpolate import interp1d

In [49]:
inter = interp1d(df.min().to_numpy(), df.max().to_numpy())

TypeError: '>' not supported between instances of 'numpy.ndarray' and 'str'

In [50]:
df.min(), df.max()

(sepal length (cm)                                                                                         4.3
 sepal width (cm)                                                                                            2
 petal length (cm)                                                                                           1
 petal width (cm)                                                                                          0.1
 target                                                                                                      0
 text                 sepal length (cm) 4.3, sepal width (cm) 3.0, petal length (cm) 1.1, petal width (cm) 0.1
 dtype: object,
 sepal length (cm)                                                                                         7.9
 sepal width (cm)                                                                                          4.4
 petal length (cm)                                                                              

### KERAS

In [None]:
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE), 
          epochs=40,
          batch_size=BATCH_SIZE)

In [None]:
model.evaluate(test_dataset.shuffle(len(X_test)).batch(BATCH_SIZE), return_dict=True, batch_size=BATCH_SIZE)

In [None]:
def predict_proba(text_list, model, tokenizer):
  """
  To get array with predicted probabilities for 0 - instructions, 1- ingredients classes 
  for each paragraph in the list of strings
  :param text_list: list[str]
  :param model: transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification
  :param tokenizer: transformers.models.distilbert.tokenization_distilbert.DistilBertTokenizer
  :return res: numpy.ndarray
  """
     
  encodings = tokenizer(text_list, max_length=MAX_LEN, truncation=True, padding=True)
  dataset = tf.data.Dataset.from_tensor_slices((dict(encodings))) 
  preds = model.predict(dataset.batch(1)).logits
  res = tf.nn.softmax(preds, axis=1).numpy()
    
  return res

In [None]:
string = [X_test[38]]
pred = predict_proba(string, model, tokenizer)
print(pred)
print(np.argmax(pred))

In [None]:
iris = datasets.load_breast_cancer()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
df.head()