In [1]:
from datetime import datetime
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, Subset
from torch.utils.tensorboard import SummaryWriter

2024-07-08 17:25:14.574529: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 17:25:14.574588: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 17:25:14.575984: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 17:25:14.583902: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class EFCamDatSet(Dataset):
    def __init__(self, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.efcamdat = pd.read_csv("./efcamdat_cleaned2.csv")

    def __len__(self):
        return len(self.efcamdat)

    def __getitem__(self, index):
        cefr_numeric = self.efcamdat.loc[index, "cefr_numeric"]
        text = self.efcamdat.loc[index, "text"]
        text_corrected = self.efcamdat.loc[index, "text_corrected"]

        #print(cefr_numeric)
        #print(text_corrected)
        tokenized = tokenizer(text_corrected,
                            return_tensors='pt',
                            padding='max_length', max_length=max_length,
                           truncation=True)
        inputs = {}
        inputs["input_ids"] = torch.squeeze(torch.tensor(tokenized["input_ids"], dtype=torch.long))
        inputs["attention_mask"] = torch.squeeze(torch.tensor(tokenized["attention_mask"], dtype=torch.long))
        cefr_level = torch.tensor(cefr_numeric - 1)

        return inputs, cefr_level

In [3]:
class CEFRClassifier(nn.Module):
    def __init__(self, num_cefr_levels):
        super(CEFRClassifier, self).__init__()
        
        self.bert = AutoModel.from_pretrained("distilbert/distilbert-base-uncased")
                                             #torch_dtype=torch.float16,
                                             #attn_implementation="flash_attention_2")

        # Freeze distilBERT params
        for param in self.bert.parameters():
            param.requires_grad = False
        
        # self.global_avg_pool = nn.AdaptiveAvgPool1d(1)

        # Classifier layers
        self.pre_classifier = nn.Linear(768, 768)
        self.fc2 = nn.Linear(768, 128)
        self.output = nn.Linear(128, num_cefr_levels)

        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        # self.softmax = nn.Softmax()
    
    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = output.last_hidden_state
        pooled_output = sequence_output[:, 0]

        #outputs_text_corrected = self.bert(input_ids=input_ids[:,1], attention_mask=attention_mask[:,1])
        #sequence_outputs_text_corrected = outputs_text_corrected.last_hidden_state
        #pooled_outputs_text_corrected = sequence_outputs_text_corrected[:, 0]

        #combined_output = torch.cat((pooled_outputs_text, pooled_outputs_text_corrected), dim=1)
    
        x = self.pre_classifier(pooled_output)
        x = self.dropout(x)
        x = self.relu(x)
        
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.relu(x)
        
        logits = self.output(x)
        # probs = self.softmax(logits)
        return logits


In [4]:
device = 'cuda'
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
max_length = 450
# max_length = 512
dataset = EFCamDatSet(tokenizer, max_length)

num_cefr_levels = 5
num_samples = 100
model = CEFRClassifier(num_cefr_levels)
model.to(device)

generator = torch.Generator().manual_seed(42)
#sampled_indices = torch.randperm(len(dataset), generator=generator)[:num_samples]
#dataset = Subset(dataset, sampled_indices.tolist())
train_set, validation_set, test_set = torch.utils.data.random_split(dataset, [0.7,0.2,0.1], generator=generator)

bs = 32

training_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=bs, shuffle=False)

# Training
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)
scaler = torch.cuda.amp.GradScaler()

def train_step(epoch_idx, writer):
    running_loss = 0.0
    last_loss = 0.0

    for i, train_data in enumerate(training_loader):
        train_inputs, train_labels = train_data

        input_ids = train_inputs["input_ids"].to(device)
        attention_mask = train_inputs["attention_mask"].to(device)
        train_labels = train_labels.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass and loss computation
        with torch.cuda.amp.autocast():
            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, train_labels)
        
        # Backward pass and optimization
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        #loss.backward()
        #optimizer.step()
        
        running_loss += loss.item()

        if i % 100 == 99:
            last_loss = running_loss / 100
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_idx * len(training_loader) + i + 1
            writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.0

    return last_loss


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/emfcamdat_cefr_{}'.format(timestamp))

validation_scaler = torch.cuda.amp.GradScaler()

EPOCHS = 30
best_vloss = 1_000_000.0

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch + 1))

    model.train(True)
    avg_loss = train_step(epoch, writer)

    running_vloss = 0.0
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, validation_data in enumerate(validation_loader):
            validation_inputs, validation_labels = validation_data

            input_ids = validation_inputs["input_ids"].to(device)
            attention_mask = validation_inputs["attention_mask"].to(device)
            validation_labels = validation_labels.to(device)
         
            with torch.cuda.amp.autocast():
                validation_logits = model(input_ids, attention_mask)
                validation_loss = loss_fn(validation_logits, validation_labels)
                
            running_vloss += validation_loss

    avg_vloss = running_vloss / len(validation_loader)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'runs/model_{}_{}'.format(timestamp, epoch)
        torch.save(model.state_dict(), model_path)


EPOCH 1:


  inputs["input_ids"] = torch.squeeze(torch.tensor(tokenized["input_ids"], dtype=torch.long))
  inputs["attention_mask"] = torch.squeeze(torch.tensor(tokenized["attention_mask"], dtype=torch.long))


  batch 100 loss: 0.9038429141044617
  batch 200 loss: 0.5141716095805168
  batch 300 loss: 0.4408052898943424
  batch 400 loss: 0.39013179406523707
  batch 500 loss: 0.33955736495554445
  batch 600 loss: 0.2985284306108952
  batch 700 loss: 0.31322404034435747
  batch 800 loss: 0.2664891039580107
  batch 900 loss: 0.27265708826482293
  batch 1000 loss: 0.2661874966323376
  batch 1100 loss: 0.22351632364094257
  batch 1200 loss: 0.2176860886067152
  batch 1300 loss: 0.21116969250142575
  batch 1400 loss: 0.20908818699419499
  batch 1500 loss: 0.18641429483890534
  batch 1600 loss: 0.1942133314907551
  batch 1700 loss: 0.18957546710968018
  batch 1800 loss: 0.1732452769204974
  batch 1900 loss: 0.1733451260626316
  batch 2000 loss: 0.18112654656171798
  batch 2100 loss: 0.16530756156891585
  batch 2200 loss: 0.19407229084521532
  batch 2300 loss: 0.17308582104742526
  batch 2400 loss: 0.1609939555823803
  batch 2500 loss: 0.16635374892503024
  batch 2600 loss: 0.15788882218301295
  batc

In [24]:
eval_model = CEFRClassifier(5)
eval_model.load_state_dict(torch.load("runs/model_20240620_204416_2"))

bs = 32
test_loader = DataLoader(test_set, batch_size=bs, shuffle=True)

device = "cuda"

def compute_accuracy(big_idx, targets):
    return (big_idx==targets).sum().item()

def validate(model, testing_loader):
    model.eval()
    model.to(device)
    
    n_correct = 0
    n_wrong = 0
    total = 0
    total_loss = 0
    nb_tr_examples = 0
    
    with torch.no_grad():
        for i, data in enumerate(testing_loader):
            test_inputs, test_labels = data

            ids = test_inputs['input_ids'].to(device, dtype = torch.long)
            mask = test_inputs['attention_mask'].to(device, dtype = torch.long)
            targets = test_labels.to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            
            loss = loss_fn(outputs, targets)
            total_loss += loss.item()
            
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += compute_accuracy(big_idx, targets)

            nb_tr_examples+=targets.size(0)
            
            if i%100==0:
                loss_step = total_loss/(i+1)
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
                
    epoch_loss = total/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

validate(eval_model, test_loader)

  inputs["input_ids"] = torch.squeeze(torch.tensor(tokenized["input_ids"], dtype=torch.long))
  inputs["attention_mask"] = torch.squeeze(torch.tensor(tokenized["attention_mask"], dtype=torch.long))


Validation Loss per 100 steps: 0.061803024262189865
Validation Accuracy per 100 steps: 96.875
Validation Loss per 100 steps: 0.03677838033734876
Validation Accuracy per 100 steps: 98.66955445544555
Validation Loss per 100 steps: 0.03475033012824017
Validation Accuracy per 100 steps: 98.75621890547264
Validation Loss per 100 steps: 0.03642535056720451
Validation Accuracy per 100 steps: 98.79568106312293
Validation Loss per 100 steps: 0.03509403241739058
Validation Accuracy per 100 steps: 98.80766832917706
Validation Loss per 100 steps: 0.0352492569752308
Validation Accuracy per 100 steps: 98.81487025948104
Validation Loss per 100 steps: 0.036752852275799074
Validation Accuracy per 100 steps: 98.74168053244593
Validation Loss per 100 steps: 0.03810991224944598
Validation Accuracy per 100 steps: 98.6938302425107
Validation Loss per 100 steps: 0.03809177700826781
Validation Accuracy per 100 steps: 98.65792759051186
Validation Loss per 100 steps: 0.03843660812986344
Validation Accuracy per 

NameError: name 'nb_tr_steps' is not defined