In [None]:
from datasets import load_dataset
from transformers import BertTokenizerFast
import numpy as np

# Initialize tokenizer (e.g., BERT tokenizer)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

dataset = load_dataset("Alex123321/english_cefr_dataset")
dataset

In [None]:
# Split into train and temporary dataset (for validation and test)
train_ds, temp_ds = dataset['train'].train_test_split(test_size=0.2).values()

# Split the temporary dataset into validation and test
val_ds, test_ds = temp_ds.train_test_split(test_size=0.5).values()

# Verify the splits
print(f"Train size: {len(train_ds)}")
print(f"Validation size: {len(val_ds)}")
print(f"Test size: {len(test_ds)}")

# Create a DatasetDict to organize the splits
split_dataset = {
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds
}

# Alternatively, if you want to have it in a DatasetDict format:
from datasets import DatasetDict
dataset_split = DatasetDict(split_dataset)

In [None]:
dataset_split["train"][0]

In [None]:
unique_labels = set(dataset['train']['ud_word_level'])  # Assuming you have the full dataset
label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
label2id

In [None]:
def preprocess_function(examples):
    # Tokenize the word (this will split compound words into subwords if necessary)
    tokenized_inputs = tokenizer(examples['ud_word'], padding=True, truncation=True, is_split_into_words=False)

    # Convert labels to numeric IDs for the ud_word_level
    word_ids = tokenized_inputs.word_ids()  # This gives the mapping from token to word ID
    
    # Assign the same label to all tokens belonging to the same word
    labels = []
    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)  # Padding tokens should be ignored, hence the label -100
        else:
            # Assign the label for the entire word (based on the ud_word_level)
            labels.append(label2id[examples['ud_word_level']])  # Use the level label for the word

    # Add the labels to the tokenized inputs
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [None]:
# Apply preprocessing to each split of the dataset (train, validation, and test)
train_ds = dataset_split['train'].map(preprocess_function, batched=False)
val_ds = dataset_split['validation'].map(preprocess_function, batched=False)
test_ds = dataset_split['test'].map(preprocess_function, batched=False)

# Check the first processed item
print(train_ds[0])

In [None]:
from transformers import DataCollatorForTokenClassification
# Initialize the data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

# Initialize the model (BERT for token classification)
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label2id))

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for model checkpoints
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=32,  # Batch size for training
    per_device_eval_batch_size=32,  # Batch size for evaluation
    num_train_epochs=5,  # Number of training epochs
    weight_decay=0.01,  # Weight decay
    save_total_limit=1,               # keep only the best model (deletes older checkpoints)
    load_best_model_at_end=True,      # load the best model after training
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Start training
trainer.train()


In [None]:
eval = trainer.evaluate()
print(eval)

### load model 

In [93]:
'''
from transformers import AutoModelForTokenClassification
import torch 
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("results/checkpoint-950")
model = AutoModelForTokenClassification.from_pretrained("results/checkpoint-950")
with torch.no_grad():
    logits = model(**inputs).logits
# Example input text for token classification
text = "All you need is guigui"

# Tokenize the text using the loaded tokenizer
inputs = tokenizer(text, return_tensors="pt")

# Get the model's predictions (logits) from the tokenized input
outputs = model(**inputs)

# The logits are typically in shape (batch_size, sequence_length, num_labels)
logits = outputs.logits

# Get the predicted labels by taking the argmax of the logits
predictions = logits.argmax(dim=-1)

# Convert the predictions to label indices (if needed)
predicted_labels = predictions[0].tolist()

print(predicted_labels)
'''

[2, 2, 2, 1, 2, 2, 2, 2, 2, 1]


In [99]:
text = "I love this game. Goodbye."

In [100]:
from transformers import pipeline

classifier = pipeline("ner", model="results/checkpoint-950/")
classifier(text)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'LABEL_2',
  'score': 0.32772133,
  'index': 1,
  'word': 'I',
  'start': 0,
  'end': 1},
 {'entity': 'LABEL_2',
  'score': 0.31449705,
  'index': 2,
  'word': 'love',
  'start': 2,
  'end': 6},
 {'entity': 'LABEL_2',
  'score': 0.31998727,
  'index': 3,
  'word': 'this',
  'start': 7,
  'end': 11},
 {'entity': 'LABEL_2',
  'score': 0.29040098,
  'index': 4,
  'word': 'game',
  'start': 12,
  'end': 16},
 {'entity': 'LABEL_2',
  'score': 0.26514834,
  'index': 5,
  'word': '.',
  'start': 16,
  'end': 17},
 {'entity': 'LABEL_1',
  'score': 0.27017993,
  'index': 6,
  'word': 'Goodbye',
  'start': 18,
  'end': 25},
 {'entity': 'LABEL_0',
  'score': 0.2647442,
  'index': 7,
  'word': '.',
  'start': 25,
  'end': 26}]

In [13]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("LordCoffee/bert-base-cased-cefr")
model = AutoModelForTokenClassification.from_pretrained("LordCoffee/bert-base-cased-cefr")

OOV_LABEL_ID = "UNK"
# Example input text
text = "I love this game, goodbye"

# Tokenize the text
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Get model predictions
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Apply softmax to get probabilities
probabilities = torch.nn.functional.softmax(logits, dim=-1)
# Get predicted labels (most likely label)
predicted_labels = torch.argmax(probabilities, dim=-1)
# Set a confidence threshold for uncertain predictions
confidence_threshold = 0

# Map predicted labels to token strings and check confidence
predicted_labels = predicted_labels[0].tolist()
for idx, label_id in enumerate(predicted_labels):
    token_str = tokenizer.decode(inputs['input_ids'][0][idx]).strip()
    confidence = probabilities[0, idx, label_id].item()
    
    # If confidence is below threshold, classify as "UNK"
    if confidence < confidence_threshold:
        label_id = OOV_LABEL_ID  # Assign special label "UNK"
    
    print(f"Token: {token_str}, Predicted label: {label_id}, Confidence: {confidence:.4f}")

Token: [CLS], Predicted label: 3, Confidence: 0.2537
Token: I, Predicted label: 1, Confidence: 0.1926
Token: love, Predicted label: 2, Confidence: 0.2095
Token: this, Predicted label: 5, Confidence: 0.2808
Token: game, Predicted label: 2, Confidence: 0.2358
Token: ,, Predicted label: 3, Confidence: 0.2345
Token: goodbye, Predicted label: 5, Confidence: 0.1992
Token: [SEP], Predicted label: 1, Confidence: 0.3825
