BERT

In [None]:
import torch
import gc
from transformers import BertTokenizer, EncoderDecoderModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

# Step 1: Data Preprocessing Function
def preprocess_data(df):
    df = df.dropna(subset=['Error word & consecutive word', 'Corrected words & its'])
    return df

# Load Data
df = pd.read_csv('Error Annotated Corpus.csv')

# Preprocess Data
df = preprocess_data(df)

# Define the Dataset
class TamilGrammarDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        error_sentence = self.data.iloc[idx, 0]
        corrected_sentence = self.data.iloc[idx, 1]

        encoding = self.tokenizer(error_sentence, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        target_encoding = self.tokenizer(corrected_sentence, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# Initialize Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
dataset = TamilGrammarDataset(df, tokenizer, max_length=128)

# Define a model for sequence-to-sequence tasks
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-multilingual-cased",
    "bert-base-multilingual-cased"
)
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

# Ensure device is defined
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# K-Fold Cross-Validation
kf = KFold(n_splits=5)

def train_and_evaluate(train_index, val_index):
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_index)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_index)

    train_loader = DataLoader(dataset, batch_size=16, sampler=train_subsampler, num_workers=2)  # Increased batch size, adjusted num_workers
    val_loader = DataLoader(dataset, batch_size=16, sampler=val_subsampler, num_workers=2)      # Increased batch size, adjusted num_workers

    optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
    total_steps = len(train_loader) * 3  # Assuming 3 epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    scaler = torch.cuda.amp.GradScaler()  # Mixed precision training

    best_loss = float('inf')
    patience = 3  # Early stopping patience
    patience_counter = 0

    for epoch in range(3):
        print(f'Epoch {epoch + 1}/3')
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, device, scheduler, scaler)
        val_loss, val_acc = eval_model(model, val_loader, device)
        print(f'Train loss: {train_loss}, Train accuracy: {train_acc}')
        print(f'Validation loss: {val_loss}, Validation accuracy: {val_acc}')

        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
            # Save the model
            torch.save(model.state_dict(), 'best_model_state.bin')
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping")
            break

        # Explicit garbage collection
        gc.collect()
        torch.cuda.empty_cache()

    return best_loss, val_acc

# Training Function
def train_epoch(model, data_loader, optimizer, device, scheduler, scaler):
    model = model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():  # Mixed precision
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        losses.append(loss.item())

        # Calculate accuracy
        preds = outputs.logits.argmax(dim=-1)
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.numel()

        # Print progress every 100 batches
        if batch_idx % 100 == 0:
            accuracy = correct_predictions / total_predictions
            print(f"Batch {batch_idx}/{len(data_loader)} - Loss: {loss.item()}, Accuracy: {accuracy}")

    accuracy = correct_predictions / total_predictions
    return np.mean(losses), accuracy

# Evaluation Function
def eval_model(model, data_loader, device):
    model = model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch_idx, batch in enumerate(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():  # Mixed precision
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            losses.append(loss.item())

            # Calculate accuracy
            preds = outputs.logits.argmax(dim=-1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.numel()

            # Print progress every 100 batches
            if batch_idx % 100 == 0:
                accuracy = correct_predictions / total_predictions
                print(f"Validation Batch {batch_idx}/{len(data_loader)} - Loss: {loss.item()}, Accuracy: {accuracy}")

    accuracy = correct_predictions / total_predictions
    return np.mean(losses), accuracy

# Run K-Fold Cross-Validation
best_losses = []
best_accuracies = []
for train_index, val_index in kf.split(dataset):
    best_loss, best_accuracy = train_and_evaluate(train_index, val_index)
    best_losses.append(best_loss)
    best_accuracies.append(best_accuracy)

print(f'Best losses from each fold: {best_losses}')
print(f'Mean loss: {np.mean(best_losses)}')
print(f'Best accuracies from each fold: {best_accuracies}')
print(f'Mean accuracy: {np.mean(best_accuracies)}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bia

Epoch 1/3


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Batch 0/251 - Loss: 23.435760498046875, Accuracy: 0.0
Batch 100/251 - Loss: 0.6229305267333984, Accuracy: 0.7284189356435643
Batch 200/251 - Loss: 0.4551798701286316, Accuracy: 0.8264876787935324
Validation Batch 0/63 - Loss: 0.4332714378833771, Accuracy: 0.9326171875
Train loss: 1.9377543494045972, Train accuracy: 0.8475259119770916
Validation loss: 0.37566926413112217, Validation accuracy: 0.9377178784860558
Epoch 2/3
Batch 0/251 - Loss: 0.4783584475517273, Accuracy: 0.9267578125
Batch 100/251 - Loss: 0.4137267768383026, Accuracy: 0.9348458771658416
Batch 200/251 - Loss: 0.38140401244163513, Accuracy: 0.9347039217972637
Validation Batch 0/63 - Loss: 0.3505668342113495, Accuracy: 0.9375
Train loss: 0.39918417011599144, Train accuracy: 0.9347823549551793
Validation loss: 0.3555280708131336, Validation accuracy: 0.9387216757968128
Epoch 3/3
Batch 0/251 - Loss: 0.36182236671447754, Accuracy: 0.935546875
Batch 100/251 - Loss: 0.3805635869503021, Accuracy: 0.9360448251856436
Batch 200/251 

T5

In [None]:
pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
pip install transformers[torch] accelerate -U


Collecting transformers[torch]
  Downloading transformers-4.41.0-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers[torch])
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (8

In [None]:
pip install datasets


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import torch

# Load the dataset
df = pd.read_csv('Error Annotated Corpus.csv')

# Handle missing values
df.dropna(subset=['Error word & consecutive word', 'Corrected words & its'], inplace=True)

# Ensure the columns are strings
df['Error word & consecutive word'] = df['Error word & consecutive word'].astype(str)
df['Corrected words & its'] = df['Corrected words & its'].astype(str)

# Use a smaller subset of the dataset
df = df.sample(frac=0.1, random_state=42)  # Use 10% of the dataset for faster training

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face Datasets format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Data collator for sequence-to-sequence tasks
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model='t5-small'
)

# Preprocess the data
def preprocess_function(examples):
    inputs = tokenizer(examples['Error word & consecutive word'], max_length=128, truncation=True, padding='max_length')
    targets = tokenizer(examples['Corrected words & its'], max_length=128, truncation=True, padding='max_length')
    inputs['labels'] = targets['input_ids']
    return inputs

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Remove columns not needed for the model
train_dataset = train_dataset.remove_columns(['Annotation'])
val_dataset = val_dataset.remove_columns(['Annotation'])

# Set the format for PyTorch
train_dataset.set_format('torch')
val_dataset.set_format('torch')

# Load the model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Define a function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    logits = torch.tensor(logits)  # Convert logits to a tensor
    predictions = torch.argmax(logits, dim=-1)

    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Calculate token-level accuracy
    correct = 0
    total = 0
    for pred, label in zip(decoded_preds, decoded_labels):
        pred_tokens = tokenizer(pred, truncation=True, padding='max_length', max_length=128)['input_ids']
        label_tokens = tokenizer(label, truncation=True, padding='max_length', max_length=128)['input_ids']
        for p, l in zip(pred_tokens, label_tokens):
            if l != tokenizer.pad_token_id:  # Ignore padding tokens
                if p == l:
                    correct += 1
                total += 1

    accuracy = correct / total if total != 0 else 0
    return {'accuracy': accuracy}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,  # Reduce the number of epochs
    per_device_train_batch_size=8,  # Increase batch size if possible
    per_device_eval_batch_size=8,
    save_steps=1_000,
    save_total_limit=1,
    evaluation_strategy="epoch",
    fp16=True  # Enable mixed precision training
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/451 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.148452,0.943396


{'eval_loss': 0.14845219254493713, 'eval_accuracy': 0.9433962264150944, 'eval_runtime': 23.2895, 'eval_samples_per_second': 2.19, 'eval_steps_per_second': 0.301, 'epoch': 1.0}


mBART

In [None]:
pip install accelerate -U



In [None]:
pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
import pandas as pd
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader, random_split

# Load the mBART model and tokenizer
model_name = 'facebook/mbart-large-50-many-to-many-mmt'
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="ta_IN", tgt_lang="ta_IN")
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Load the dataset
dataset_path = 'Error Annotated Corpus.csv'
df = pd.read_csv(dataset_path)

# Ensure the CSV has the required columns
if 'Error word & consecutive word' not in df.columns or 'Corrected words & its' not in df.columns:
    raise ValueError("The dataset must contain 'Error word & consecutive word' and 'Corrected words & its' columns.")

# Drop rows where 'Corrected words & its' is NaN
df = df.dropna(subset=['Corrected words & its'])


# Define the dataset class
class GrammarCorrectionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source = str(self.data.iloc[index, 0])
        target = str(self.data.iloc[index, 1])

        source_encodings = tokenizer(source, truncation=True, padding='max_length', max_length=self.max_len)
        target_encodings = tokenizer(target, truncation=True, padding='max_length', max_length=self.max_len)

        return {
            'input_ids': torch.tensor(source_encodings['input_ids']),
            'attention_mask': torch.tensor(source_encodings['attention_mask']),
            'labels': torch.tensor(target_encodings['input_ids'])
        }

# Prepare the dataset
max_len = 64  # Reduced sequence length
dataset = GrammarCorrectionDataset(df, tokenizer, max_len)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_steps=10,
    load_best_model_at_end=True
)

# Define the Trainer class
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

# Evaluate the fine-tuned model
eval_results = trainer.evaluate()

# Print evaluation results
print("Evaluation results:", eval_results)

# Calculate and print final accuracy on the validation set
def calculate_accuracy(dataset, tokenizer, model, batch_size=2, device='cpu'):
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=2, pin_memory=True)
    num_correct = 0
    total_samples = len(dataset)

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            input_sentences = batch['input_ids'].to(device)
            attention_masks = batch['attention_mask'].to(device)
            target_sentences = batch['labels'].to(device)

            # Perform grammatical error correction
            outputs = model.generate(input_ids=input_sentences, attention_mask=attention_masks, num_beams=5)
            corrected_sentences = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            target_sentences = tokenizer.batch_decode(target_sentences, skip_special_tokens=True)

            # Check accuracy
            for corrected, target in zip(corrected_sentences, target_sentences):
                if corrected.strip() == target.strip():
                    num_correct += 1

    accuracy = num_correct / total_samples
    return accuracy

# Move model to appropriate device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Calculate and print final accuracy
accuracy = calculate_accuracy(val_dataset, tokenizer, model, batch_size=2, device=device)  # Small batch size
print("Final Accuracy on validation set:", accuracy)




Epoch,Training Loss,Validation Loss
1,0.0453,0.049532
2,0.0395,0.048387
3,0.0007,0.048946


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluation results: {'eval_loss': 0.04838728904724121, 'eval_runtime': 30.3284, 'eval_samples_per_second': 33.104, 'eval_steps_per_second': 16.552, 'epoch': 3.0}


  self.pid = os.fork()
  self.pid = os.fork()


Final Accuracy on validation set: 0.6264940239043825


BiLSTM

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('Error Annotated Corpus.csv')

# Remove rows with missing values
df = df.dropna()

# Define preprocessing functions
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Apply preprocessing to the dataset
df['Error word & consecutive word'] = df['Error word & consecutive word'].apply(preprocess_text)
df['Corrected words & its'] = df['Corrected words & its'].apply(preprocess_text)

# Tokenize the sentences
all_sentences = df['Error word & consecutive word'].tolist() + df['Corrected words & its'].tolist()
all_words = [word for sentence in all_sentences for word in sentence.split()]

# Create word-to-index and index-to-word mappings
word_counts = Counter(all_words)
sorted_words = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_index = {word: idx + 1 for idx, word in enumerate(sorted_words)}  # Reserve index 0 for padding
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Encode the labels
label_encoder = LabelEncoder()
df['Annotation'] = label_encoder.fit_transform(df['Annotation'])

# Split the dataset into train and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    df['Error word & consecutive word'].tolist(),
    df['Annotation'].tolist(),
    test_size=0.1,  # Adjust as needed
    random_state=42
)

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, sentences, labels, word_to_index):
        self.sentences = sentences
        self.labels = labels
        self.word_to_index = word_to_index

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]

        # Convert words to indices
        indexed_sentence = [self.word_to_index[word] for word in sentence.split()]

        return {
            'indexed_sentence': indexed_sentence,
            'label': label
        }

# Create datasets and dataloaders with padding
train_dataset = CustomDataset(train_sentences, train_labels, word_to_index)
val_dataset = CustomDataset(val_sentences, val_labels, word_to_index)

# Pad sequences
train_collate_fn = lambda batch: {
    'indexed_sentence': torch.nn.utils.rnn.pad_sequence([torch.tensor(item['indexed_sentence']) for item in batch], batch_first=True),
    'label': torch.tensor([item['label'] for item in batch])
}

val_collate_fn = lambda batch: {
    'indexed_sentence': torch.nn.utils.rnn.pad_sequence([torch.tensor(item['indexed_sentence']) for item in batch], batch_first=True),
    'label': torch.tensor([item['label'] for item in batch])
}

# Define the DataLoader with padding
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=train_collate_fn)  # Increased batch size
val_dataloader = DataLoader(val_dataset, batch_size=64, collate_fn=val_collate_fn)  # Increased batch size

# Define the BiLSTM model with dropout and regularization
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.5):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)  # Applying dropout to the embeddings
        output, _ = self.lstm(embedded)
        output = torch.relu(self.fc1(output[:, -1, :]))  # Use ReLU activation function for the first fully connected layer
        out = self.fc2(output)
        return out

# Instantiate the model with dropout
vocab_size = len(word_to_index) + 1  # Add 1 for padding
embedding_dim = 128  # Adjusted embedding dimension
hidden_dim = 256  # Adjusted hidden dimension
output_dim = len(label_encoder.classes_)
dropout = 0.4  # Adjusted dropout rate
model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, dropout=dropout)

# Move model to appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Added L2 regularization with weight decay

# Training loop
num_epochs = 30  # Increased number of epochs
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        inputs, labels = batch['indexed_sentence'].to(device), batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for batch in val_dataloader:
            inputs, labels = batch['indexed_sentence'].to(device), batch['label'].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

    val_accuracy = correct_preds / total_preds
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {epoch_loss / len(train_dataloader):.4f}, Val Loss: {val_loss / len(val_dataloader):.4f}, Val Acc: {val_accuracy:.2%}")

print('Training finished.')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Epoch 1/30: 100%|██████████| 71/71 [00:03<00:00, 22.19it/s]


Epoch 1/30, Train Loss: 2.4783, Val Loss: 2.2007, Val Acc: 25.10%


Epoch 2/30: 100%|██████████| 71/71 [00:03<00:00, 18.25it/s]


Epoch 2/30, Train Loss: 2.1468, Val Loss: 2.1946, Val Acc: 30.68%


Epoch 3/30: 100%|██████████| 71/71 [00:03<00:00, 22.14it/s]


Epoch 3/30, Train Loss: 2.0322, Val Loss: 2.0900, Val Acc: 34.06%


Epoch 4/30: 100%|██████████| 71/71 [00:03<00:00, 22.26it/s]


Epoch 4/30, Train Loss: 1.8974, Val Loss: 2.0208, Val Acc: 36.45%


Epoch 5/30: 100%|██████████| 71/71 [00:03<00:00, 20.25it/s]


Epoch 5/30, Train Loss: 1.7821, Val Loss: 1.9217, Val Acc: 40.84%


Epoch 6/30: 100%|██████████| 71/71 [00:03<00:00, 20.18it/s]


Epoch 6/30, Train Loss: 1.6330, Val Loss: 1.8602, Val Acc: 40.64%


Epoch 7/30: 100%|██████████| 71/71 [00:03<00:00, 21.86it/s]


Epoch 7/30, Train Loss: 1.5018, Val Loss: 1.8399, Val Acc: 43.43%


Epoch 8/30: 100%|██████████| 71/71 [00:03<00:00, 22.15it/s]


Epoch 8/30, Train Loss: 1.3750, Val Loss: 1.8284, Val Acc: 42.63%


Epoch 9/30: 100%|██████████| 71/71 [00:03<00:00, 18.25it/s]


Epoch 9/30, Train Loss: 1.2337, Val Loss: 1.8349, Val Acc: 44.82%


Epoch 10/30: 100%|██████████| 71/71 [00:03<00:00, 22.17it/s]


Epoch 10/30, Train Loss: 1.0949, Val Loss: 1.7993, Val Acc: 48.21%


Epoch 11/30: 100%|██████████| 71/71 [00:03<00:00, 22.28it/s]


Epoch 11/30, Train Loss: 0.9632, Val Loss: 1.8873, Val Acc: 47.01%


Epoch 12/30: 100%|██████████| 71/71 [00:03<00:00, 21.04it/s]


Epoch 12/30, Train Loss: 0.8789, Val Loss: 1.8581, Val Acc: 51.20%


Epoch 13/30: 100%|██████████| 71/71 [00:03<00:00, 19.50it/s]


Epoch 13/30, Train Loss: 0.7687, Val Loss: 2.1101, Val Acc: 50.00%


Epoch 14/30: 100%|██████████| 71/71 [00:03<00:00, 21.87it/s]


Epoch 14/30, Train Loss: 0.6852, Val Loss: 2.0493, Val Acc: 51.39%


Epoch 15/30: 100%|██████████| 71/71 [00:03<00:00, 19.81it/s]


Epoch 15/30, Train Loss: 0.5855, Val Loss: 2.1189, Val Acc: 51.39%


Epoch 16/30: 100%|██████████| 71/71 [00:03<00:00, 18.06it/s]


Epoch 16/30, Train Loss: 0.5340, Val Loss: 2.2054, Val Acc: 51.99%


Epoch 17/30: 100%|██████████| 71/71 [00:03<00:00, 18.84it/s]


Epoch 17/30, Train Loss: 0.4772, Val Loss: 2.2144, Val Acc: 52.19%


Epoch 18/30: 100%|██████████| 71/71 [00:04<00:00, 14.92it/s]


Epoch 18/30, Train Loss: 0.4164, Val Loss: 2.4902, Val Acc: 52.39%


Epoch 19/30: 100%|██████████| 71/71 [00:06<00:00, 10.57it/s]


Epoch 19/30, Train Loss: 0.3613, Val Loss: 2.6054, Val Acc: 53.98%


Epoch 20/30: 100%|██████████| 71/71 [00:09<00:00,  7.50it/s]


Epoch 20/30, Train Loss: 0.3457, Val Loss: 2.5204, Val Acc: 54.58%


Epoch 21/30: 100%|██████████| 71/71 [00:12<00:00,  5.89it/s]


Epoch 21/30, Train Loss: 0.3022, Val Loss: 2.6184, Val Acc: 53.98%


Epoch 22/30: 100%|██████████| 71/71 [00:18<00:00,  3.86it/s]


Epoch 22/30, Train Loss: 0.2845, Val Loss: 2.6524, Val Acc: 56.37%


Epoch 23/30: 100%|██████████| 71/71 [00:24<00:00,  2.90it/s]


Epoch 23/30, Train Loss: 0.2589, Val Loss: 3.0733, Val Acc: 54.98%


Epoch 24/30: 100%|██████████| 71/71 [00:25<00:00,  2.76it/s]


Epoch 24/30, Train Loss: 0.2536, Val Loss: 3.1325, Val Acc: 54.58%


Epoch 25/30: 100%|██████████| 71/71 [00:24<00:00,  2.87it/s]


Epoch 25/30, Train Loss: 0.2084, Val Loss: 2.9564, Val Acc: 54.58%


Epoch 26/30: 100%|██████████| 71/71 [00:24<00:00,  2.86it/s]


Epoch 26/30, Train Loss: 0.2020, Val Loss: 3.3480, Val Acc: 54.18%


Epoch 27/30: 100%|██████████| 71/71 [00:26<00:00,  2.65it/s]


Epoch 27/30, Train Loss: 0.2012, Val Loss: 3.4419, Val Acc: 53.98%


Epoch 28/30: 100%|██████████| 71/71 [00:26<00:00,  2.69it/s]


Epoch 28/30, Train Loss: 0.1778, Val Loss: 3.3694, Val Acc: 55.58%


Epoch 29/30: 100%|██████████| 71/71 [00:25<00:00,  2.74it/s]


Epoch 29/30, Train Loss: 0.1806, Val Loss: 3.2710, Val Acc: 56.18%


Epoch 30/30: 100%|██████████| 71/71 [00:25<00:00,  2.80it/s]


Epoch 30/30, Train Loss: 0.1766, Val Loss: 3.2101, Val Acc: 55.78%
Training finished.
