# Importing Fundamental Libraries

In [17]:
import torch
import sentencepiece as spm
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification, Trainer, TrainingArguments
import ast
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [41]:
# Pick the compute device once: Apple's Metal backend ("mps") when this
# PyTorch build reports it as available, otherwise the CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

In [31]:
# Load the raw CSV table; the path is relative to the notebook's working directory.
file_path = '../data/data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
# df

In [32]:
# Columns removed for the variant that does NOT keep nationality information.
columns_to_drop_without_nationality = [
    'file_name',
    'USER_NUMBER',
    'begin',
    'end',
    'wrong_word',
    'correct_word',
    'wrong_sentence',
    'Gender',
    'correct_sentence',
    'process',
    'main_category',
    'sub_category_1',
    'sub_category_2',
    'Training Center',
    'Nationality',
    'nationality_encoded',
    'word_tokens_correct',
]

# Columns removed for the variant that keeps the nationality columns
# (only referenced by the commented-out line below).
columns_to_drop_with_nationality = [
    'USER_NUMBER',
    'begin',
    'end',
    'wrong_word',
    'correct_word',
    'wrong_sentence',
    'Gender',
    'correct_sentence',
    'process',
    'main_category',
    'sub_category_1',
    'sub_category_2',
    'Training Center',
    'word_tokens_correct',
]

# `drop` returns a new frame, so `df` itself is left untouched.
Training_df_without_nationality = df.drop(
    columns=columns_to_drop_without_nationality,
)
# Training_df_with_nationality =df.drop(columns=columns_to_drop_with_nationality)

In [4]:
# Training_df_without_nationality

In [5]:
# Training_df_with_nationality

In [33]:
def is_list_of_zeros(lst):
    """Return True when ``lst`` is a list whose elements are all equal to 0.

    The value may be a real ``list`` or the text form of one (for example
    ``"[0, 0, 0]"``). The notebook loads its frame with ``pd.read_csv``, which
    hands list-like cells back as strings; without parsing them the
    ``isinstance(..., list)`` guard below would reject every cell and the
    predicate could never be True.

    Args:
        lst: Candidate value, typically one cell of the ``error_index`` column.

    Returns:
        bool: True for a list containing only zeros (an empty list also counts,
        as it did before); False for anything else, including strings that are
        not a valid Python literal and non-list values such as ``None`` or NaN.
    """
    if isinstance(lst, str):
        try:
            # literal_eval only builds Python literals; it never executes code.
            lst = ast.literal_eval(lst)
        except (ValueError, SyntaxError):
            # Not the text form of a Python literal, so not a list of zeros.
            return False
    return isinstance(lst, list) and all(element == 0 for element in lst)

In [34]:
# Keep only the rows whose `error_index` cell is NOT flagged by is_list_of_zeros.
Training_df_without_nationality = Training_df_without_nationality.loc[
    ~Training_df_without_nationality['error_index'].apply(is_list_of_zeros)
]

In [8]:
# Training_df_without_nationality

In [35]:
# Main error category -> integer code. Codes are handed out in listing order
# starting at 1, so the missing category (None) receives the last code, 5.
main_category_errors = {
    category: code
    for code, category in enumerate(
        ("Main signs", "Diacritic signs", "Form", "Punctuation", None),
        start=1,
    )
}

# First-level sub-category -> integer code. Both a missing value (None) and
# the literal string "None" share code 0; the named sub-categories follow
# from 1 upward in listing order.
sub_category_1 = {None: 0, "None": 0}
sub_category_1.update(
    (sub_category, code)
    for code, sub_category in enumerate(
        (
            "Boundary",
            "Consonants",
            "Vowels",
            "Tashdid",
            "Madd",
            "Tanwin",
            "Alif Lam",
            "Place",
            "Dot",
        ),
        start=1,
    )
)

class NERDataset(Dataset):
    """Token-classification dataset that tokenizes one example per lookup.

    Every item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    each ``max_length`` long. Positions that carry no real label (padding,
    special tokens, non-initial sub-word pieces) are set to ``IGNORE_INDEX``
    so the loss skips them instead of learning them as class 0.

    Args:
        texts: Indexable collection of examples; each one is passed to the
            tokenizer with ``is_split_into_words=True``.
        labels: Indexable collection of per-example label sequences, parallel
            to ``texts``.
        tokenizer: Callable Hugging Face style tokenizer.
        label_map: Mapping from ``str(label)`` to an integer class id. Labels
            missing from the mapping fall back to class id 0.
        max_length: Length every returned sequence is padded/truncated to.
    """

    # Default ``ignore_index`` of PyTorch's cross-entropy loss, which the
    # Hugging Face token-classification heads rely on to skip positions.
    IGNORE_INDEX = -100

    def __init__(self, texts, labels, tokenizer, label_map, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_length = max_length

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    @staticmethod
    def _align_to_tokens(word_ids, word_label_ids, ignore_index):
        """Spread word-level label ids over token positions.

        Only the first token of each word keeps the word's label. Special
        tokens (word id ``None``), continuation pieces of the same word, and
        words without a matching label all receive ``ignore_index``.
        """
        aligned = []
        previous_word = None
        for word_id in word_ids:
            if (
                word_id is None
                or word_id == previous_word
                or word_id >= len(word_label_ids)
            ):
                aligned.append(ignore_index)
            else:
                aligned.append(word_label_ids[word_id])
            previous_word = word_id
        return aligned

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` as produced by the
            tokenizer (batch dimension removed) and ``labels`` as a tensor of
            ``max_length`` class ids / ``IGNORE_INDEX`` markers.
        """
        text = self.texts[idx]
        labels = self.labels[idx]

        # Tokenize input text and pad/truncate to max_length.
        encoding = self.tokenizer(
            text,
            is_split_into_words=True,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_length
        )

        # Drop only the leading batch axis added by return_tensors='pt'.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Word-level class ids; unknown labels fall back to class 0.
        word_label_ids = [self.label_map.get(str(label), 0) for label in labels]

        if getattr(self.tokenizer, 'is_fast', False):
            # Fast tokenizers expose the token -> word mapping, so each label
            # can be attached to the first sub-word piece of its word.
            label_ids = self._align_to_tokens(
                encoding.word_ids(batch_index=0),
                word_label_ids,
                self.IGNORE_INDEX,
            )
        else:
            # Slow tokenizers have no word_ids(); keep the positional layout.
            # NOTE(review): in this branch labels are NOT aligned with
            # sub-word or special tokens; use a fast tokenizer for that.
            label_ids = word_label_ids

        # Truncate, then pad with IGNORE_INDEX (never with a real class id).
        label_ids = label_ids[:self.max_length]
        label_ids = label_ids + [self.IGNORE_INDEX] * (self.max_length - len(label_ids))

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label_ids)
        }


In [36]:
def _parse_list_cell(cell):
    """Turn the text form of a list (as stored in a CSV cell) back into a list.

    Non-string values, and strings that are not a valid Python literal, are
    returned unchanged so such cells behave exactly as they did before.
    """
    if isinstance(cell, str):
        try:
            # literal_eval only builds Python literals; it never executes code.
            return ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            return cell
    return cell


def _label_sort_key(name):
    """Order label names numerically when they are integers, else alphabetically."""
    try:
        return (0, int(name), "")
    except ValueError:
        return (1, 0, name)


# Split the prepared training frame (not the raw `df`), so the column
# dropping and the row filter applied in the earlier cells take effect.
# List-valued cells come back from read_csv as text, so they are parsed first.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    [_parse_list_cell(cell) for cell in Training_df_without_nationality['word_tokens_wrong']],
    [_parse_list_cell(cell) for cell in Training_df_without_nationality['error_index']],
    test_size=0.2,  # Adjust the test size as needed
    random_state=42
)

# Label vocabulary used by the datasets and by the model's num_labels.
# Keys are str(label) because that is how NERDataset looks labels up.
# NOTE(review): derived from the labels observed in the data; confirm it
# matches the label scheme the project intends.
_label_names = {
    str(label)
    for sequence in train_labels + test_labels
    if isinstance(sequence, (list, tuple))
    for label in sequence
}
label_map = {
    name: class_id
    for class_id, name in enumerate(sorted(_label_names, key=_label_sort_key))
}
assert label_map, "no label sequences could be read from 'error_index'"

In [37]:
# Tokenizer for the pretrained checkpoint, then one dataset wrapper per split
# (train first, test second). `label_map` must already be defined by an
# earlier cell before this one runs.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
train_dataset, test_dataset = (
    NERDataset(split_texts, split_labels, tokenizer, label_map)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

In [38]:
# Load the pretrained checkpoint with a token-classification head that has
# one output per entry in `label_map`.
model = XLMRobertaForTokenClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=len(label_map),
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Settings for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # output and logging locations
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # schedule and regularisation
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Trainer wired to the training split only; no evaluation dataset is attached here.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
)


In [43]:
# Run the fine-tuning loop; as the cell's last expression, the call's return
# value becomes the cell output.
# NOTE(review): the run recorded below this cell ended with an MPS
# "Placeholder storage has not been allocated" RuntimeError. That usually
# points at the model and the batches living on different devices; confirm
# device placement before re-running.
trainer.train()

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [None]:
# # Define your compute_metrics function
# def compute_metrics(p):
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)
#     true_labels = labels

#     # Flatten arrays
#     predictions_flat = predictions.flatten()
#     labels_flat = true_labels.flatten()

#     # Compute classification report
#     return classification_report(labels_flat, predictions_flat, output_dict=True)

# # Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,  # Use the same arguments as in training
#     eval_dataset=test_dataset,
#     compute_metrics=compute_metrics  # Pass the compute_metrics function
# )

# # Evaluate the model
# results = trainer.evaluate()

# # Print results
# print("Evaluation Results:")
# print(results)