In [1]:
# Data sources:
#   - CEFR-SP: https://github.com/yukiar/CEFR-SP?tab=readme-ov-file
#   - CEFR Levelled English Texts (Kaggle): https://www.kaggle.com/datasets/amontgomerie/cefr-levelled-english-texts

In [2]:
# Make sure the required packages are installed before running this notebook.

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

from transformers import DebertaV2Model, DebertaV2Tokenizer
from transformers import PreTrainedModel, AutoConfig, AutoTokenizer
from transformers.modeling_outputs import SequenceClassifierOutput

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class CustomModelWithCEFR(PreTrainedModel):
    """DeBERTa-v2 encoder with a linear classification head over CEFR levels.

    The hidden state at sequence position 0 (the CLS token) is passed through
    dropout and a single linear layer that produces ``num_labels`` logits.

    Args:
        config: Model configuration. It is handed to ``DebertaV2Model`` and must
            provide ``hidden_dropout_prob`` and ``hidden_size``.
        model_name_or_path: Not used by the constructor. It is accepted (and is
            optional) only so that callers which pass it keep working.
        num_labels: Number of output classes of the classification head.
        two_labels: Selects how ``forward`` computes the loss. When true, the
            loss is the mean of the cross-entropy against ``label_a`` and
            against ``label_b``; when false, it is the cross-entropy against
            ``labels``.
    """

    def __init__(self, config, model_name_or_path=None, num_labels=6, two_labels=True):
        super().__init__(config)
        self.backbone = DebertaV2Model(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)

        self.num_labels = num_labels
        self.two_labels = two_labels

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        label_a=None,
        label_b=None,
        labels=None,
    ):
        """Run the encoder and classification head, optionally computing a loss.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded unchanged to
                the DeBERTa backbone.
            label_a, label_b: Targets used when ``two_labels`` is true. Both
                must be given for a loss to be computed.
                (Presumably class-index tensors of shape (batch,) -- confirm
                against the training code.)
            labels: Target used when ``two_labels`` is false.

        Returns:
            SequenceClassifierOutput with ``logits`` always set and ``loss`` set
            only when the targets required by the active mode were supplied.
        """
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled_output = outputs.last_hidden_state[:, 0]  # CLS token
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        loss_fn = nn.CrossEntropyLoss()
        if self.two_labels:
            if label_a is not None and label_b is not None:
                # Average the loss against both reference labels.
                loss = 0.5 * (loss_fn(logits, label_a) + loss_fn(logits, label_b))
        elif labels is not None:
            loss = loss_fn(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=getattr(outputs, 'hidden_states', None),
            attentions=getattr(outputs, 'attentions', None),
        )

    @classmethod
    def from_pretrained_custom(cls, model_path, **kwargs):
        """Load the model from ``model_path`` with its configuration.

        The configuration is read from ``model_path`` unless the caller passes
        ``config=...``. Remaining keyword arguments (for example
        ``two_labels`` or ``num_labels``) are forwarded to ``from_pretrained``.
        """
        # Honour caller-supplied values instead of failing with a duplicate
        # keyword argument error.
        config = kwargs.pop('config', None)
        if config is None:
            config = AutoConfig.from_pretrained(model_path)
        kwargs.setdefault('model_name_or_path', model_path)
        return cls.from_pretrained(model_path, config=config, **kwargs)

In [5]:
# Checkpoint name the tokenizer is loaded from.
model_name = 'microsoft/deberta-v3-small'
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Label names; a predicted class index is used as an index into this list.
cefr_levels = [
    'A1', 'A2',
    'B1', 'B2',
    'C1', 'C2',
]

In [7]:
# Placeholder input: a small set of example simplified sentences.
# Point this path at the actual data before running.

simplified_sent_file = 'data/simplified-sentences.csv'
df = pd.read_csv(
    simplified_sent_file,
    usecols=['id', 'sentence'],
    encoding='ISO-8859-1',
)

In [8]:
# Build one group per 'base' row: the base sentence plus every variant row
# that follows it, up to the next 'base' row.

grouped_sentences = []
current_group = None

for row_id, text in zip(df['id'], df['sentence']):
    if row_id != 'base':
        # Variant row: attach it to the open group. Rows that appear before
        # the first 'base' row have no group and are skipped.
        if current_group is not None:
            current_group['variants'].append({'id': int(row_id), 'sentence': text})
        continue

    # A new 'base' row closes the previous group (if any) and opens a new one.
    if current_group is not None:
        grouped_sentences.append(current_group)
    current_group = {'base': text, 'variants': []}

# The final group is never closed by a following 'base' row, so flush it here.
if current_group is not None:
    grouped_sentences.append(current_group)

In [None]:
# Before running this cell, download the model from
# https://www.kaggle.com/models/vinaxue/cefr-classifier-bert/pyTorch/deberta
# and place it in the `model` directory. Make sure it is DeBERTa version 5.

model_path = 'model'
model = CustomModelWithCEFR.from_pretrained_custom(
    model_path,
    two_labels=False,
)
model.to(device)

CustomModelWithCEFR(
  (backbone): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

In [11]:
model.eval()

# `truncation=True` has no effect unless a maximum length is known; the
# tokenizer otherwise warns and falls back to no truncation. Take the limit
# from the model configuration, defaulting to 512 if it is not set.
max_length = getattr(model.config, 'max_position_embeddings', 512)


def predict_cefr_level(sentence):
    """Return the CEFR label the model predicts for a single sentence."""
    inputs = tokenizer(
        sentence,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    ).to(model.device)

    with torch.no_grad():
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            token_type_ids=inputs.get('token_type_ids'),
        )

    # Softmax is monotonic, so the argmax of the logits is the predicted class.
    pred_idx = torch.argmax(outputs.logits, dim=1).item()
    return cefr_levels[pred_idx]


for group in grouped_sentences:
    base_pred = predict_cefr_level(group['base'])
    variant_preds = [
        predict_cefr_level(variant['sentence']) for variant in group['variants']
    ]

    print(f"Base: {base_pred}")
    print(f"Variants: {variant_preds}")
    print("-" * 30)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Base: B1
Variants: ['B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1']
------------------------------
Base: B1
Variants: ['A2', 'B1', 'A1', 'A1', 'A2', 'A1', 'A2', 'A1', 'A2', 'B1', 'B1', 'B1']
------------------------------
Base: C1
Variants: ['B2', 'B2', 'B2', 'B2', 'B2', 'B2']
------------------------------
Base: C1
Variants: ['C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1']
------------------------------
