In [1]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.optim import AdamW
import pandas as pd

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example email addresses
incorrect_email = "john.doe@example.con"
correct_email = "john.doe@example.com"

# Tokenize the email address
inputs = tokenizer(incorrect_email, return_tensors='pt', padding=True, truncation=True)
targets = tokenizer(correct_email, return_tensors='pt', padding=True, truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import random

# Define a list of common names and domains to generate correct email addresses
names = ["john.doe", "jane.smith", "michael.jones", "emily.davis", "daniel.brown"]
domains = ["example.com", "gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]

# Function to generate a correct email address
def generate_correct_email():
    name = random.choice(names)
    domain = random.choice(domains)
    return f"{name}@{domain}"

# Function to introduce common errors in an email address and return the label
def introduce_error(email):
    error_type = random.choice(["typo", "domain_misspelling", "missing_character", "swapped_characters"])

    if error_type == "typo":
        # Replace a character with a random one
        index = random.randint(0, len(email) - 1)
        email = email[:index] + random.choice("abcdefghijklmnopqrstuvwxyz") + email[index + 1:]
        label = "typo"

    elif error_type == "domain_misspelling":
        # Replace the domain extension with a common misspelling
        if ".com" in email:
            email = email.replace(".com", ".con")
        elif ".org" in email:
            email = email.replace(".org", ".ogr")
        label = "domain_misspelling"

    elif error_type == "missing_character":
        # Remove a random character
        index = random.randint(0, len(email) - 1)
        email = email[:index] + email[index + 1:]
        label = "missing_character"

    elif error_type == "swapped_characters":
        # Swap two adjacent characters
        index = random.randint(0, len(email) - 2)
        email = email[:index] + email[index + 1] + email[index] + email[index + 2:]
        label = "swapped_characters"

    return email, label

# Function to generate the dataset
def generate_email_dataset(num_samples=1000):
    dataset = []
    for _ in range(num_samples):
        correct_email = generate_correct_email()
        incorrect_email, error_label = introduce_error(correct_email)
        dataset.append((incorrect_email, correct_email))
    return dataset

# Generate a dataset of 1000 email pairs
email_dataset = generate_email_dataset(10000)

# Display a few examples
for i in range(5):
    print(f"Incorrect: {email_dataset[i][0]}, Correct: {email_dataset[i][1]}")


Incorrect: john.do@example.com, Correct: john.doe@example.com
Incorrect: jane.smith@gmail.com, Correct: jane.smith@gmail.com
Incorrect: caniel.brown@outlook.com, Correct: daniel.brown@outlook.com
Incorrect: jane.smith@gmail.gom, Correct: jane.smith@gmail.com
Incorrect: daniel.brown@outlook.con, Correct: daniel.brown@outlook.com


In [None]:
# Assuming `email_dataset` is your dataset of email pairs
df = pd.DataFrame(email_dataset, columns=["incorrect_email", "correct_email"])

# Define the path where you want to save the CSV file
save_path = "/content/drive/MyDrive/projectdatacleaning/email_correction_dataset.csv"

# Save the DataFrame to the specified path
df.to_csv(save_path, index=False)

In [None]:
# Define the path to your CSV file
load_path = "/content/drive/MyDrive/projectdatacleaning/email_correction_dataset.csv"

# Load the dataset from the CSV file
df = pd.read_csv(load_path)

# Display the first few rows of the dataset to verify it loaded correctly
df

Unnamed: 0,incorrect_email,correct_email
0,john.do@example.com,john.doe@example.com
1,jane.smith@gmail.com,jane.smith@gmail.com
2,caniel.brown@outlook.com,daniel.brown@outlook.com
3,jane.smith@gmail.gom,jane.smith@gmail.com
4,daniel.brown@outlook.con,daniel.brown@outlook.com
...,...,...
9995,johnd.oe@outlook.com,john.doe@outlook.com
9996,michae.ljones@outlook.com,michael.jones@outlook.com
9997,daniel.brown@example.con,daniel.brown@example.com
9998,jane.smth@example.com,jane.smith@example.com


In [None]:
import pandas as pd
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the emails
df['incorrect_email_tokenized'] = df['incorrect_email'].apply(lambda x: tokenizer(x, max_length=128, padding='max_length', truncation=True, return_tensors='pt'))
df['correct_email_tokenized'] = df['correct_email'].apply(lambda x: tokenizer(x, max_length=128, padding='max_length', truncation=True, return_tensors='pt'))

In [None]:
from torch.utils.data import Dataset, DataLoader

class EmailCorrectionDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        incorrect_email = self.data['incorrect_email_tokenized'].iloc[idx]
        correct_email = self.data['correct_email_tokenized'].iloc[idx]

        return {
            "input_ids": incorrect_email['input_ids'].squeeze(),
            "attention_mask": incorrect_email['attention_mask'].squeeze(),
            "labels": correct_email['input_ids'].squeeze()  # For seq2seq tasks
        }

In [None]:
from sklearn.model_selection import train_test_split

# Split the data: 80% train, 20% remaining for validation and test
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Further split the remaining data into 50% validation and 50% test
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [None]:
# Create Dataset objects
train_dataset1 = EmailCorrectionDataset(train_df)
val_dataset = EmailCorrectionDataset(val_df)
test_dataset = EmailCorrectionDataset(test_df)

In [None]:
train_dataset[1]

{'input_ids': tensor([  101,  1046,  7295,  1012, 18629,  1030,  2742,  1012,  4012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [None]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

from torch.nn import CrossEntropyLoss
import torch

# Assuming class 0 is for padding tokens, ignore it
criterion = CrossEntropyLoss(ignore_index=0)  # This ignores the padding index in the labels

model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=tokenizer.vocab_size)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").cuda()
        attention_mask = inputs.get("attention_mask").cuda()

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Compute the loss using CrossEntropyLoss and ignoring the padding tokens
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Apply the attention mask to ignore padding tokens
        loss = loss * attention_mask.view(-1)

        # Take the mean of the non-padding losses
        loss = loss.sum() / attention_mask.sum()

        return (loss, outputs) if return_outputs else loss

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Initialize the trainer with the custom loss function
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset1,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0938,0.040978
2,0.0192,0.005837
3,0.0034,0.002974
4,0.0012,0.000607
5,0.0098,0.002033
6,0.0084,0.000334
7,0.0009,0.00022
8,0.0003,0.000164
9,0.0003,0.000131
10,0.0002,0.000107


TrainOutput(global_step=7500, training_loss=0.21971544262172343, metrics={'train_runtime': 3493.7012, 'train_samples_per_second': 34.348, 'train_steps_per_second': 2.147, 'total_flos': 1.000188684288e+16, 'train_loss': 0.21971544262172343, 'epoch': 15.0})

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Define your training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset1,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch,Training Loss,Validation Loss
1,11.0284,7.459532
2,6.8876,2.238815
3,2.6085,0.420852
4,1.2805,0.443094
5,0.5033,0.366209


TrainOutput(global_step=250, training_loss=5.293392892837525, metrics={'train_runtime': 82.1239, 'train_samples_per_second': 48.707, 'train_steps_per_second': 3.044, 'total_flos': 135341801472000.0, 'train_loss': 5.293392892837525, 'epoch': 5.0})

In [None]:
# Evaluate on the test set
trainer.evaluate(test_dataset)

{'eval_loss': 6.321370892692357e-05,
 'eval_runtime': 8.678,
 'eval_samples_per_second': 115.234,
 'eval_steps_per_second': 1.844,
 'epoch': 15.0}

In [None]:
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Ensure the model is on the correct device

predicted_emails = []

model.eval()  # Set the model to evaluation mode

for item in test_dataset:
    # Move input tensors to the same device as the model
    input_ids = item['input_ids'].unsqueeze(0).to(device)
    attention_mask = item['attention_mask'].unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get predictions using beam search (for better sequence generation)
        predictions = torch.argmax(logits, dim=2)  # Consider using beam search here

        # Decode the predicted token IDs back into a sequence of text
        predicted_email = tokenizer.decode(predictions.squeeze().tolist(), skip_special_tokens=True).strip()

        # Post-process the predicted email to remove unwanted spaces or repeated tokens
        predicted_email = " ".join(predicted_email.split())  # Removes extra spaces
        predicted_emails.append(predicted_email)

# Append the predictions to the test dataframe for comparison
test_df['predicted_email'] = predicted_emails

# Display a few examples of predictions vs actual
for i in range(100):
    print(f"Incorrect: {test_df['incorrect_email'].iloc[i]}")
    print(f"Correct: {test_df['correct_email'].iloc[i]}")
    print(f"Predicted: {test_df['predicted_email'].iloc[i]}\n")

Incorrect: john.doe@yahoo.cm
Correct: john.doe@yahoo.com
Predicted: john. doe @ yahoo. com.. doe @ yahoo..... michael.. doe @ yahoo... john. doe yahoo yahoo. doe @ yahoo.... doe @ yahoo. john

Incorrect: danigl.brown@hotmail.com
Correct: daniel.brown@hotmail.com
Predicted: daniel. brown @ hotmail. com hotmailmailmailmail michael daniel. brown @ hotmailmailmail commail hotmailmail com yahoo daniel. brownmailmail. brown hot hotmailmail hotmail hot hotmail com yahoo daniel hotmail. @ hotmail. commail hotmail com yahoo.. brown hot hotmail. com hot yahoo

Incorrect: micahel.jones@example.com
Correct: michael.jones@example.com
Predicted: michael. jones @ example. com example.. com michael.. jones @ example example... com. example. michael.. michael.. jones @ example. com com.. example. com michael. jones @ example.. jones @ example. com. example.... jones @ example.. example com com. jones

Incorrect: jane.smith@gmail.ocm
Correct: jane.smith@gmail.com
Predicted: jane. smith @ gmail. comilil.