In [1]:
import torch
from CBHG import CBHGModel
from diacritizer import Diacritizer
from dataset import DiacriticsDataset
import pandas as pd
from baseline import BaseLineModel


In [2]:
# test_dataset_path = 'test_no_diacritics.txt'
test_dataset_path = 'dataset/test_no_diacritics.txt'
model_path = 'models/CBHG_EP20_BS256.pth'
# input_csv_path = 'test_set_without_labels.csv'
output_csv_path = 'output/labels.csv'
test_dataset_diacritized_path = 'output/diacritized.txt'

In [3]:
train_dataset = DiacriticsDataset()
train_dataset.load('dataset/train.txt')

model_CBHG = CBHGModel(
    inp_vocab_size=len(train_dataset.arabic_letters) + 1,
    targ_vocab_size=len(train_dataset.diacritic_classes),
)
print(model_CBHG)

CBHGModel(
  (embedding): Embedding(37, 512)
  (prenet): Prenet(
    (layers): ModuleList(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): Linear(in_features=512, out_features=256, bias=True)
    )
    (relu): ReLU()
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (cbhg): CBHG(
    (relu): ReLU()
    (conv1d_banks): ModuleList(
      (0): BatchNormConv1d(
        (conv1d): Conv1d(256, 256, kernel_size=(1,), stride=(1,), bias=False)
        (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): ReLU()
      )
      (1): BatchNormConv1d(
        (conv1d): Conv1d(256, 256, kernel_size=(2,), stride=(1,), padding=(1,), bias=False)
        (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): ReLU()
      )
      (2): BatchNormConv1d(
        (conv1d): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
        (bn): BatchNorm1d(256,

In [4]:
# # train the model on a small batch to test the code
model_CBHG.train_(train_dataset)
# save the model
torch.save(model_CBHG.state_dict(),'models/CBHG_EP10_BS16_LR0.0009.pth')

  0%|          | 0/3955 [00:00<?, ?it/s]

In [6]:
from torch import nn
from tqdm import tqdm
# evaluate the model
test_dataset = DiacriticsDataset()
test_dataset.load('dataset/val.txt')
# model_CBHG.eval()

def evaluate(model, test_dataset, batch_size=16):
    """
    Evaluate the model on the test set.
    Args:
        test_dataset (torch.utils.data.Dataset): The test set.
        batch_size (int): The batch size.
    Returns:
        float: The accuracy of the model on the test set.
    """
    # Create a data loader from the test set
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    # Define the loss function
    criterion = nn.CrossEntropyLoss()

    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
      model = model.cuda()

    # Calculate accuracy on the test set
    with torch.no_grad():
        total = 0
        correct = 0
        for data in tqdm(test_loader):
            # Get the inputs
            inputs, labels = data[0], data[1]
            inputs = inputs.to(torch.long)
            labels = labels.to(torch.long)

            inputs = inputs.to(device)

            # (5) move the train label to the device
            labels = labels.to(device)

            # Forward pass
            outputs = model.forward(inputs)

            # Calculate accuracy
            acc = (torch.argmax(outputs['diacritics'],dim=-1) == labels).sum().item()
            total += acc
    total /= (len(test_dataset) * test_dataset[0][0].shape[0])

    print(f'\nTest Accuracy: {total}')
    return total

evaluate(model_CBHG,test_dataset)

100%|██████████| 200/200 [00:14<00:00, 13.94it/s]


Test Accuracy: 0.9694641736870407





0.9694641736870407

In [None]:
# loaded model
model_CBHG_loaded = CBHGModel(
    inp_vocab_size=37,
    targ_vocab_size=15,
)

In [None]:
'models/CBHG_val_EP5_BS32_LR0.001.pthmodel_CBHG_loaded.load_state_dict(torch.load('))

In [None]:
evaluate(model_CBHG_loaded,test_dataset)

# ==============================================================================================

In [38]:
test_dataset = DiacriticsDataset()
test_dataset.load(test_dataset_path, train=False)

inputs = test_dataset.character_sentences

In [None]:
# model = CBHGModel(
#     inp_vocab_size = 37,
#     targ_vocab_size = 15,
# )

# state_dict = torch.load(model_path, map_location=torch.device('cpu'))
# model.load_state_dict(state_dict)



In [39]:
# model_CBHG.eval()
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    model_CBHG = model_CBHG.cuda()
    
with torch.no_grad():
    inputs = inputs.to(device)
    outputs = model_CBHG(inputs)
diacritics = torch.argmax(outputs['diacritics'], dim=-1)

In [40]:
mask_no_pad = inputs != test_dataset.pad_char
output_diacritics = diacritics[mask_no_pad]
output_diacritics = output_diacritics.cpu()

df = pd.DataFrame(output_diacritics.numpy(), columns=["label"])
df = df.rename_axis('ID').reset_index()
df.to_csv(output_csv_path, index=False)

In [41]:
with open(test_dataset_path, 'r', encoding='utf-8') as file:
    corpus = file.read()
    
diacritizer = Diacritizer()
diacritized_corpus = diacritizer.diacritize(corpus, output_diacritics)

with open(test_dataset_diacritized_path, 'w', encoding='utf-8') as file:
    corpus = file.write(diacritized_corpus)