#### Electra-large for classification

In [1]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer, TrainingArguments, Trainer
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset

In [2]:
model_name = "google/electra-large-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Get the data
train_data = pd.read_csv("./data/train_data.csv")
val_data = pd.read_csv("./data/val_data.csv")
test_data = pd.read_csv("./data/test_data.csv")

In [4]:
def preprocess_data(data):
    return Dataset.from_pandas(data[['text', 'label']])

train_dataset = preprocess_data(train_data)
val_dataset = preprocess_data(val_data)
test_dataset = preprocess_data(test_data)

print(train_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 7328
})


In [5]:
for i in range(test_dataset.num_rows):
    if type(test_dataset[i]['text']) != str:
        print(i, test_dataset[i])
        

In [6]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7328 [00:00<?, ? examples/s]

Map:   0%|          | 0/1047 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

In [9]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results/electra_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Compute the f1 score
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions),
    }

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)



In [10]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 15.70 GiB of which 88.38 MiB is free. Process 2717538 has 376.00 MiB memory in use. Process 2728548 has 3.95 GiB memory in use. Including non-PyTorch memory, this process has 10.72 GiB memory in use. Of the allocated memory 10.44 GiB is allocated by PyTorch, and 7.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)