In [133]:
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import transformers
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
import numpy as np

In [134]:
# Load the Avicenna training CSV, decoding it as ISO-8859-1.
df = pd.read_csv(
    'Avicenna_Train.csv',
    encoding='ISO-8859-1',
)

In [135]:
# Peek at the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,Premise 1,Premise 2,Syllogistic relation,Conclusion
0,"unchecked imbalances in the society, will see ...",correct these imbalances requires in-depth kno...,no,No conclusion
1,"Chronic diseases are heart attacks and stroke,...",In populations that eat a regular high-fiber d...,yes,In populations that eat a regular high-fiber d...
2,Formative assessment encourages children to en...,An ideal learning environment uses formative a...,yes,An ideal learning environment encourages child...
3,Underrepresented female labor force in some pr...,Job discrimination comes with underrepresented...,yes,Job discrimination comes with not being able t...
4,damaged mentality in an individual brings seri...,Aggression harms the mentality of person.,yes,Aggression brings brings serious health proble...


In [136]:
# Binary target: 1 where the syllogistic relation is 'yes', 0 otherwise.
is_syllogism = df['Syllogistic relation'] == 'yes'
df['label'] = is_syllogism.astype('int64')

In [137]:
# Model input: both premises joined into one string with a " : " separator.
df['text'] = df['Premise 1'].add(" : ").add(df['Premise 2'])

In [138]:
# Number of rows in an 80% training split.
int(0.8 * len(df))

3840

In [139]:
# Ordered 80/20 split: the leading 80% of rows train the model, the rest validate it.
# The boundary is derived from the frame itself instead of a hard-coded row count,
# so the split stays correct if the CSV changes size.
split_idx = int(len(df) * 0.8)

train_texts = df.iloc[:split_idx]['text'].values
train_labels = df.iloc[:split_idx]['label'].values

valid_texts = df.iloc[split_idx:]['text'].values
valid_labels = df.iloc[split_idx:]['label'].values

In [140]:
# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
)

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf

In [141]:
# Tokenize both splits; each call pads to the longest sequence in that split
# and truncates anything beyond the model's maximum length.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts.tolist(), truncation=True, padding=True)

In [142]:
class SyllogismDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (anything exposing ``.items()``).
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``'labels'`` entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [143]:
# Wrap each split's encodings and labels in a torch Dataset.
train_dataset = SyllogismDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = SyllogismDataset(encodings=valid_encodings, labels=valid_labels)

In [144]:
# Use the stable DataLoader API: `DataLoader2` was a short-lived prototype that is
# no longer available in torch.utils.data. Only the training loader is shuffled;
# validation order is kept fixed so evaluation is deterministic.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=False)

In [145]:
# Sequence-classification model initialised from the 'distilbert-base-uncased' checkpoint.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.20.1",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9

In [146]:
# Device for the manually built inference tensors further down.
# Fall back to CPU so the notebook still runs on machines without a GPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [147]:
# Optimizer handed to the Trainer, and the accuracy metric used by compute_metrics.
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)
metrics = load_metric('accuracy')

# Put the model in training mode.
model.train()

In [148]:
def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level ``metrics`` object.

    Args:
        eval_pred: Two-item iterable of ``(logits, labels)``.

    Returns:
        Whatever ``metrics.compute`` returns for the argmax predictions
        measured against the reference labels.
    """
    scores, references = eval_pred

    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metrics.compute(predictions=predicted, references=references)

In [149]:
# Fine-tuning configuration: 3 epochs, batch size 16 for both train and eval,
# with the training loss logged every 72 steps.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=72,
)

# The optimizer is supplied explicitly; the scheduler slot is left as None.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optim, None),
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [150]:
# Run fine-tuning and display the resulting summary.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 3840
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 720


Step,Training Loss
72,0.6511
144,0.5034
216,0.417
288,0.3511
360,0.2741
432,0.2465
504,0.2019
576,0.1383
648,0.1183
720,0.1051


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=720, training_loss=0.30067731738090514, metrics={'train_runtime': 111.7538, 'train_samples_per_second': 103.084, 'train_steps_per_second': 6.443, 'total_flos': 289110097566720.0, 'train_loss': 0.30067731738090514, 'epoch': 3.0})

In [151]:
# Evaluate on the Trainer's configured eval dataset and display the metrics.
validation_metrics = trainer.evaluate()
validation_metrics

***** Running Evaluation *****
  Num examples = 960
  Batch size = 16


{'eval_loss': 0.42222920060157776,
 'eval_accuracy': 0.878125,
 'eval_runtime': 2.3711,
 'eval_samples_per_second': 404.88,
 'eval_steps_per_second': 25.305,
 'epoch': 3.0}

In [188]:
# Load the held-out test CSV and apply the same preprocessing as the training data.
df_test = pd.read_csv('Avicenna_Test.csv', encoding='ISO-8859-1')

# Binary target and " : "-joined premise text, built the same way as for training.
df_test['label'] = df_test['Syllogistic relation'].eq('yes').mul(1)
df_test['text'] = (df_test['Premise 1'] + " : " + df_test['Premise 2'])

test_texts = df_test['text'].values
test_labels = df_test['label'].values

test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

test_dataset = SyllogismDataset(test_encodings, test_labels)

# Evaluation data is not shuffled: order has no effect on the metrics, and a fixed
# order keeps per-example predictions aligned with df_test rows.
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [154]:
# Score the fine-tuned model on the held-out test set.
trainer.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 1200
  Batch size = 16


{'eval_loss': 0.5125078558921814,
 'eval_accuracy': 0.86,
 'eval_runtime': 4.5547,
 'eval_samples_per_second': 263.466,
 'eval_steps_per_second': 16.467,
 'epoch': 3.0}

In [239]:
# A single hand-written premise pair in the same "premise : premise" format.
sample_text = [
    'Socrates is a man : all men are mortal',
]

In [240]:
# Tokenize the hand-written sample with the same options as the datasets.
sample_encoded = tokenizer(
    sample_text,
    truncation=True,
    padding=True,
)

In [241]:
# `sample_label` is not defined by any earlier cell, so a fresh Restart & Run All
# would raise NameError here. Define it explicitly, one label per sample text.
# NOTE(review): 1 ("yes") is the assumed expected class for this example — confirm.
sample_label = [1]
sample_dataset = SyllogismDataset(sample_encoded, sample_label)

In [250]:
# Build the two model inputs as tensors and move each one to DEVICE.
input_ids, attention_mask = (
    torch.tensor(sample_encoded[field]).to(DEVICE)
    for field in ('input_ids', 'attention_mask')
)

In [260]:
# Inference only: force eval mode (disables dropout) and skip gradient tracking.
model.eval()
with torch.no_grad():
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

# Take the argmax over the class axis so each row gets its own prediction;
# without `dim`, argmax flattens the whole batch into a single index.
torch.argmax(logits, dim=-1)

tensor(1, device='cuda:0')

In [264]:
# Second hand-written example, encoded the same way as the first.
sample_text_2 = ['If the streets are wet, it has rained recently : The streets are wet.']

sample_encoded = tokenizer(sample_text_2, truncation=True, padding=True)

# NOTE(review): reuses the existing `sample_label`; confirm it is the intended
# label for this example, since it determines the accuracy reported below.
sample_dataset = SyllogismDataset(sample_encoded, sample_label)
trainer.evaluate(sample_dataset)

input_ids = torch.tensor(sample_encoded['input_ids']).to(DEVICE)
attention_mask = torch.tensor(sample_encoded['attention_mask']).to(DEVICE)

# Inference only: force eval mode (disables dropout) and skip gradient tracking.
model.eval()
with torch.no_grad():
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

# Argmax over the class axis gives one prediction per row instead of a
# single index into the flattened batch.
torch.argmax(logits, dim=-1)

***** Running Evaluation *****
  Num examples = 1
  Batch size = 16


tensor(0, device='cuda:0')