## Custom DL Model (PyTorch)

We also train a  Transformer encoder model to automatically perform normalisation. The model takes as input an raw string and  trains on the respective  CLEAN TEXT string as the target. We use character level tokenization, and the torch framework to train a small 4 layer encoder transformer on this task. We use cross entropy loss on an unembed (embed to vocab) layer to predict the appropriate  letters of the output based on the clean text. 

There is a lot room for improvement as we dont have a lot of data, or a lot of compute either to train or to evaluate. Apart from more data  and compute, we suspect that a better mapping between the input and output data would be beneficial (where potentially each letter maps if possible directly in input and output poisitions). 



In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import numpy as np
import pandas as pd

# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the data
df = pd.read_csv("data/raw/normalization_assesment_dataset_10k.csv")  

# Split into train, val, test
train_df, val_df, test_df = np.split(df.sample(frac=1, random_state=42), 
                                    [int(0.5 * len(df)), int(0.75 * len(df))])


train_df["raw_comp_writers_text"] = train_df["raw_comp_writers_text"].fillna("").astype(str)
train_df["CLEAN_TEXT"] = train_df["CLEAN_TEXT"].fillna("").astype(str)

val_df["raw_comp_writers_text"] = val_df["raw_comp_writers_text"].fillna("").astype(str)
val_df["CLEAN_TEXT"] = val_df["CLEAN_TEXT"].fillna("").astype(str)

test_df["raw_comp_writers_text"] = test_df["raw_comp_writers_text"].fillna("").astype(str)
test_df["CLEAN_TEXT"] = test_df["CLEAN_TEXT"].fillna("").astype(str)

# Character-level vocabulary
chars = sorted(set("".join(train_df["raw_comp_writers_text"]) + "".join(train_df["CLEAN_TEXT"])))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}
print(len(chars), chars)

# Tokenization
def tokenize(text, char_to_idx, max_len=100):
    tokenized = [char_to_idx[ch] for ch in text if ch in char_to_idx]
    return tokenized[:max_len] + [0] * (max_len - len(tokenized))

class TextDataset(Dataset):
    def __init__(self, df, char_to_idx, max_len=100):
        self.raw_texts = df["raw_comp_writers_text"].tolist()
        self.clean_texts = df["CLEAN_TEXT"].tolist()
        self.char_to_idx = char_to_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.raw_texts)

    def __getitem__(self, idx):
        raw = tokenize(self.raw_texts[idx], self.char_to_idx, self.max_len)
        clean = tokenize(self.clean_texts[idx], self.char_to_idx, self.max_len)
        return torch.tensor(raw, dtype=torch.long), torch.tensor(clean, dtype=torch.long)

# Create datasets and dataloaders
max_len = 100
batch_size=32
train_dataset = TextDataset(train_df, char_to_idx, max_len)
val_dataset = TextDataset(val_df, char_to_idx, max_len)
test_dataset = TextDataset(test_df, char_to_idx, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Transformer Encoder Model
class TransformerEncoderModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len):
        super(TransformerEncoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim))
        # Default not causal, as we want in this case
        encoder_layer = nn.TransformerEncoderLayer(embed_dim, num_heads, ff_dim) 
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src):
        embed = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        encoded = self.transformer_encoder(embed)
        output = self.fc(encoded)
        return output

# Model configuration
vocab_size = len(chars)
embed_dim = 128
num_heads = 8
ff_dim = 512
num_layers = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:","cuda" if torch.cuda.is_available() else "cpu")
model = TransformerEncoderModel(vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.AdamW(model.parameters(), lr=0.001)

# Training loop
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for raw, clean in dataloader:
        raw, clean = raw.to(device), clean.to(device)
        optimizer.zero_grad()
        output = model(raw)
        loss = criterion(output.view(-1, vocab_size), clean.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Validation loop
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for raw, clean in dataloader:
            raw, clean = raw.to(device), clean.to(device)
            output = model(raw)
            loss = criterion(output.view(-1, vocab_size), clean.view(-1))
            total_loss += loss.item()
    return total_loss / len(dataloader)

# Training the model
epochs = 10
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Save the model?
#torch.save(model.state_dict(), "models/transformer_encoder_model.pth")


  return bound(*args, **kwds)


843 [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '«', '»', 'Á', 'Â', 'Ä', 'Å', 'É', 'Í', 'Ó', 'Ö', 'Ø', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'ï', 'ñ', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'ü', 'ý', 'ă', 'ć', 'Č', 'č', 'Đ', 'ě', 'Ģ', 'ĩ', 'ī', 'ı', 'Ł', 'ł', 'ń', 'ņ', 'ř', 'Ş', 'ş', 'Š', 'š', 'ũ', 'ū', 'Ż', 'Ə', 'ơ', 'ư', 'ə', '́', 'Α', 'Γ', 'Δ', 'Ε', 'Η', 'Θ', 'Ι', 'Κ', 'Λ', 'Μ', 'Ξ', 'Ο', 'Ρ', 'Σ', 'Τ', 'Υ', 'Ω', 'ά', 'α', 'κ', 'ο', 'ρ', 'ς', 'τ', 'χ', 'ύ', 'І', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'З', 'И', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'Х', 'Ц', 'Ч', 

Perform inference on validation dataset, and show predictions and targets. Notice that the output captures key characters of the input but has a lot of noise in irrelevant positions:

In [17]:
# Validation inference function
def infer_on_validation_set(model, dataloader, idx_to_char, device, num_samples=10):
    model.eval()
    predictions = []
    with torch.no_grad():
        for raw, clean in dataloader:
            raw = raw.to(device)
            output = model(raw)
            # Get the most likely token at each position (move tensor back to cpu to perform numpy operations)
            pred = torch.argmax(output, dim=-1).cpu().numpy()
            raw = raw.cpu().numpy()
            clean = clean.cpu().numpy()
            # Decode and append
            for i in range(min(len(raw), num_samples)):
                raw_text = "".join(idx_to_char[idx] for idx in raw[i] if idx in idx_to_char)
                pred_text = "".join(idx_to_char[idx] for idx in pred[i] if idx in idx_to_char)
                clean_text = "".join(idx_to_char[idx] for idx in clean[i] if idx in idx_to_char)
                predictions.append((raw_text, pred_text, clean_text))
            if len(predictions) >= num_samples:
                break
    return predictions[:num_samples]

# Run inference on the validation set
num_samples_to_inspect = 10
validation_predictions = infer_on_validation_set(model, val_loader, idx_to_char, device, num_samples_to_inspect)

# Display the predictions
print(f"{'Raw Input':<50} | {'Predicted Output':<50} | {'Ground Truth':<50}")
print("=" * 150)
for raw_text, pred_text, clean_text in validation_predictions:
    print(f"{raw_text:<50} | {pred_text:<50} | {clean_text:<50}")





Raw Input                                          | Predicted Output                                   | Ground Truth                                      
Ihor Vitsinskyy (BMI IPI#767439101)                                                                  | Ihor/Vitsinskyya(BMIaIPI#767439101)aaaaaaaa/aaa//aaaaaaaa/aaaaaaaaaa//aaaaaa/aa/a/aa/aaaa//a/aaaa/aa | Ihor Vitsinskyy                                                                                     
Daniel Kim (김니)                                                                                      | Daniel/Kim/(Pu)aa/aaaaaaaaaaaaaaaa/aaaaaaaa/aaa//aaaaaaaa/aaaaaaaaaa//aaaaaa/aa/a/aa/aaaa//a/aaaa/aa | Daniel Kim                                                                                          
Tatsuji Kimura/Miyako Koda                                                                           | Tatsuji/Kimura/MiyakoaKodaaaaaaaaa/aaaaaaaa/aaa//aaaaaaaa/aaaaaaaaaa//aaaaaa/aa/a/aa/aaaa//a/aaaa/aa | Tatsuji Kimura/Miyako K

In [16]:
# Run inference on the validation set
num_samples_to_inspect = 2500
test_predictions = infer_on_validation_set(model, test_loader, idx_to_char, device, num_samples=num_samples_to_inspect)

raw_list,clean_list,preds_list = [],[],[]
for raw_text, pred_text, clean_text in test_predictions:
    raw_list.append(raw_text)
    clean_list.append(clean_text)
    preds_list.append(pred_text)

dl_df =  pd.DataFrame({
    "RAW_TEXT": raw_list,
    "CLEAN_TEXT": clean_list,
    "DL_OUT": preds_list
})


output_file_path = "output_file_dl.csv"  
dl_df.to_csv(output_file_path, index=False)
