In [None]:
# Initialization: install the Hugging Face libraries imported by the cells below
# and mount Google Drive at /content/drive, where later cells read and write
# their model and CSV files.
# NOTE(review): the packages are installed without version pins.
!pip install transformers datasets sentencepiece
from google.colab import drive
drive.mount('/content/drive')



MessageError: Error: credential propagation was unsuccessful

In [None]:
# Model Fine-tune
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    MarianTokenizer, MarianMTModel,
    Seq2SeqTrainingArguments, Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

from google.colab import files
# Upload the labelled word CSV through the Colab file picker and load the first
# uploaded file.
uploaded = files.upload()

filename = list(uploaded.keys())[0]
# keep_default_na=False keeps words such as "null" or "nan" as text; with the
# pandas defaults they are parsed as missing values and silently dropped.
df = pd.read_csv(filename, keep_default_na=False)


def _label_is_correct(label):
    """Map a raw label cell to True (correct), False (incorrect) or None.

    Accepts the numeric 1/0 encoding as well as the "correct"/"incorrect"
    strings written by the data-processing cells of this notebook.
    """
    text = str(label).strip().lower()
    if text in {"1", "1.0", "true", "correct"}:
        return True
    if text in {"0", "0.0", "false", "incorrect"}:
        return False
    return None


# Build (misspelling -> correct word) pairs. The pairing relies on row order:
# every "incorrect" row is paired with the most recent "correct" row above it.
translation_pairs = []
current_correct = None

for word, label in zip(df["word"], df["label"]):
    word = str(word)
    is_correct = _label_is_correct(label)
    if is_correct is True:
        # An empty correct word must not be used as a target for later rows.
        current_correct = word if word.strip() else None
    elif is_correct is False and current_correct and word.strip():
        translation_pairs.append({"src": word, "tgt": current_correct})

if not translation_pairs:
    # Fail here with a clear message instead of a KeyError on df_pairs["src"].
    raise ValueError(
        "No training pairs were built: the 'label' column must contain 1/0 or "
        "'correct'/'incorrect', and each misspelling must follow its correct word."
    )

df_pairs = pd.DataFrame(translation_pairs, columns=["src", "tgt"])
df_pairs = df_pairs.dropna()
df_pairs["src"] = df_pairs["src"].astype(str)
df_pairs["tgt"] = df_pairs["tgt"].astype(str)

# preserve_index=False keeps the pandas index out of the dataset columns.
dataset = Dataset.from_pandas(df_pairs, preserve_index=False)

# Pretrained MarianMT checkpoint that is fine-tuned below.
model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def tokenize_fn(batch):
    """Tokenize a batch of source/target word pairs for seq2seq training.

    Args:
        batch: Mapping with "src" (misspelled words) and "tgt" (correct words),
            each a list of strings.

    Returns:
        The tokenizer output for the sources with an added "labels" entry
        holding the target token ids.

    No padding is applied here. Padding labels to max_length with the pad
    token id makes the loss count padding positions as real targets; leaving
    the sequences unpadded lets DataCollatorForSeq2Seq pad each batch and fill
    label padding with -100, which the loss ignores.
    """
    model_inputs = tokenizer(batch["src"], max_length=16, truncation=True)
    # text_target routes the targets through the tokenizer's target-side
    # processing instead of treating them as source text.
    labels = tokenizer(text_target=batch["tgt"], max_length=16, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every pair; batched=True hands tokenize_fn lists of examples.
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/My Drive/Final_MarianMT/marianmt-finetuned",
    per_device_train_batch_size=1048,
    num_train_epochs=300,
    logging_dir="./logs",
    save_total_limit=1,
    eval_strategy="no",
    # Disable external loggers (e.g. Weights & Biases) here. The previous
    # approach set the deprecated WANDB_DISABLED environment variable only
    # after the arguments and trainer had already been created.
    report_to="none",
)

# Pads inputs and labels per batch; passing the model lets the collator build
# the decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

# Save the fine-tuned weights and the tokenizer side by side so the checker
# cells can load both from one directory.
model.save_pretrained("/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker")
tokenizer.save_pretrained("/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker")

Saving Final_MarianMT_Dataset.csv to Final_MarianMT_Dataset (10).csv




Map:   0%|          | 0/5887 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


Step,Training Loss
500,0.0802
1000,0.0023
1500,0.0017




('/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker/tokenizer_config.json',
 '/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker/special_tokens_map.json',
 '/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker/vocab.json',
 '/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker/source.spm',
 '/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker/target.spm',
 '/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker/added_tokens.json')

In [None]:
# Check Spelling
import csv
import torch
from transformers import MarianMTModel, MarianTokenizer

# Load the fine-tuned model and tokenizer
# NOTE(review): this loads "marianmt-spellchecker-best", while the training
# cell saves to "marianmt-spellchecker" - confirm which directory is intended.
model_path = "/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker-best"
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input and output file paths
input_csv = "/content/drive/MyDrive/Final_MarianMT/word_list_lower.csv"  # Replace with your actual file
output_csv = "/content/drive/My Drive/Final_MarianMT/spellcheck_results_correct_words_only.csv"

results = []

# Load and process each word.
# newline="" is what the csv module expects for file objects; the explicit
# encoding keeps reads independent of the platform default.
with open(input_csv, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    # Skip the first row. The default stops an empty file raising StopIteration.
    # NOTE(review): the row is skipped unconditionally, while other cells read
    # word_list_lower.csv as header-less - confirm the file really has a header.
    next(reader, None)
    for row in reader:
        if not row:
            continue
        word = row[0].strip()
        if not word:
            continue

        # Tokenize and generate corrected spelling
        inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        output = model.generate(**inputs)
        corrected = tokenizer.decode(output[0], skip_special_tokens=True)

        # A word counts as correct when the model reproduces it (case-insensitive).
        if word.lower() == corrected.lower():
            results.append([word, "", ""])  # Correct word
        else:
            results.append(["", word, corrected])  # Incorrect word + correction

# Save results to output CSV
with open(output_csv, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["correct", "incorrect", "corrected"])  # Header
    writer.writerows(results)

print(f"Spellcheck results saved to {output_csv}")


Spellcheck results saved to /content/drive/My Drive/Final_MarianMT/spellcheck_results_correct_words_only.csv


In [None]:
import torch
from transformers import MarianMTModel, MarianTokenizer

# Load the fine-tuned spell-checker saved by the training cell.
model_path = "/content/drive/My Drive/Final_MarianMT/marianmt-spellchecker"
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

def check_spelling():
    """Interactively check words typed by the user until they enter 'exit'.

    Each entry is passed through the model; when the generated text equals the
    input (case-insensitive) the word is reported as correct, otherwise the
    generated text is shown as the suggested spelling. Uses the module-level
    ``model`` and ``tokenizer``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    while True:
        try:
            text = input("Enter a word to check (or type 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # Closing the prompt or interrupting the cell ends the session
            # without a traceback.
            break

        # Surrounding whitespace would otherwise make a correct word mismatch
        # the model output and stop " exit " from being recognised.
        text = text.strip()
        if text.lower() == "exit":
            break
        if not text:
            continue  # nothing to check

        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        output = model.generate(**inputs)
        corrected = tokenizer.decode(output[0], skip_special_tokens=True)

        if text.lower() == corrected.lower():
            print(f"✅ Correct: {text}\n")
        else:
            print(f"❌ Incorrect: {text} → Correct spelling: {corrected}\n")

check_spelling()



Enter a word to check (or type 'exit' to quit): a
✅ Correct: a

Enter a word to check (or type 'exit' to quit): b
✅ Correct: b

Enter a word to check (or type 'exit' to quit): c
✅ Correct: c

Enter a word to check (or type 'exit' to quit): d
✅ Correct: d

Enter a word to check (or type 'exit' to quit): e
✅ Correct: e

Enter a word to check (or type 'exit' to quit): f
✅ Correct: f

Enter a word to check (or type 'exit' to quit): g
✅ Correct: g

Enter a word to check (or type 'exit' to quit): h
✅ Correct: h

Enter a word to check (or type 'exit' to quit): i
✅ Correct: i

Enter a word to check (or type 'exit' to quit): j
✅ Correct: j

Enter a word to check (or type 'exit' to quit): k
✅ Correct: k

Enter a word to check (or type 'exit' to quit): l
✅ Correct: l

Enter a word to check (or type 'exit' to quit): m
✅ Correct: m

Enter a word to check (or type 'exit' to quit): n
✅ Correct: n

Enter a word to check (or type 'exit' to quit): o
✅ Correct: o

Enter a word to check (or type 'exit' to

# *DATA PROCESSING*

In [None]:
import pandas as pd

def convert_csv_to_lowercase(input_file: str, output_file: str):
    """Write a copy of a CSV file with every string cell lower-cased.

    Args:
        input_file: Path of the CSV to read. Its first row is read as the
            header and written back unchanged.
        output_file: Path of the CSV to write (without the index column).

    Cells are read as text and default NA detection is disabled, so entries
    such as "NULL", "NA" or "nan" are kept as ordinary words instead of being
    parsed as missing values and written out as empty cells.
    """
    df = pd.read_csv(input_file, dtype=str, keep_default_na=False)

    # Lower-case column by column with Series.map; this avoids the deprecated
    # DataFrame.applymap while leaving any non-string value untouched.
    df = df.apply(
        lambda column: column.map(
            lambda cell: cell.lower() if isinstance(cell, str) else cell
        )
    )

    df.to_csv(output_file, index=False)
    print(f"Converted CSV saved to: {output_file}")


# Example usage
if __name__ == "__main__":
    input_csv = "word_list.csv"
    output_csv = "word_list_lower.csv"
    convert_csv_to_lowercase(input_csv, output_csv)


In [None]:
import random
import csv

# File containing correct words (one per line)
correct_words_file = "word_list_lower.csv"

# Read correct words from CSV.
# newline="" is what the csv module expects for file objects, and the explicit
# encoding keeps the read independent of the platform default.
correct_words = []
with open(correct_words_file, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    for row in reader:
        # Every cell of every row is taken as a word, so both one-word-per-line
        # and several-words-per-row layouts are accepted.
        correct_words.extend(row)

# Define spelling errors
# Single-vowel substitutions: each key vowel maps to the vowel that replaces it.
VOWEL_SWAPS = {'a': 'e', 'e': 'i', 'i': 'y', 'o': 'u', 'u': 'o'}
# Single-consonant substitutions, listed in both directions for each pair.
CONSONANT_SWAPS = {'c': 'k', 'k': 'c', 's': 'z', 'z': 's', 'b': 'p', 'p': 'b', 'd': 't', 't': 'd', 'v': 'f', 'f': 'v',
                   'g': 'j', 'j': 'g'}
# Substitutions between digraphs and single letters; keys are a mix of
# two-letter and one-letter strings. The 'j'/'g' entries repeat the pair that
# is already present in CONSONANT_SWAPS.
PHONETIC_SWAPS = {'ph': 'f', 'f': 'ph', 'th': 'd', 'd': 'th', 'ch': 'k', 'k': 'ch', 'sh': 's', 's': 'sh', 'j': 'g',
                  'g': 'j'}


def replace_random(word):
    """Swap one character of ``word`` using a randomly chosen swap table.

    One of VOWEL_SWAPS / CONSONANT_SWAPS is picked at random, then one random
    position whose character is a key of that table is replaced by its mapped
    value. The word comes back unchanged when the chosen table has no key
    matching any character.
    """
    table = random.choice([VOWEL_SWAPS, CONSONANT_SWAPS])
    letters = list(word)

    # Positions that the chosen table is able to rewrite.
    swappable = [pos for pos, letter in enumerate(letters) if letter in table]
    if not swappable:
        return ''.join(letters)

    pos = random.choice(swappable)
    letters[pos] = table[letters[pos]]
    return ''.join(letters)


def replace_phonetic(word, swaps=None):
    """Replace one phonetic pattern in ``word`` with its counterpart.

    Args:
        word: The word to misspell.
        swaps: Optional mapping of pattern -> replacement. Defaults to the
            module-level PHONETIC_SWAPS table.

    Returns:
        ``word`` with the first occurrence of one randomly chosen applicable
        pattern replaced, or ``word`` unchanged when no pattern occurs in it.

    The pattern is chosen at random among all keys found in the word. Always
    taking the first matching key made repeated calls return the same
    misspelling and kept later keys from ever being applied.
    """
    if swaps is None:
        swaps = PHONETIC_SWAPS

    applicable = [key for key in swaps if key in word]
    if not applicable:
        return word

    key = random.choice(applicable)
    return word.replace(key, swaps[key], 1)


def random_insert(word):
    """Return ``word`` with one random lowercase letter inserted.

    The insertion point is drawn uniformly from 0..len(word), so the letter
    may also land before the first or after the last character.
    """
    position = random.randint(0, len(word))
    letter = random.choice("abcdefghijklmnopqrstuvwxyz")
    head, tail = word[:position], word[position:]
    return head + letter + tail


def random_duplicate(word):
    """Return ``word`` with one randomly chosen character doubled.

    Words of length 0 or 1 are returned unchanged.
    """
    if len(word) <= 1:
        return word

    position = random.randint(0, len(word) - 1)
    doubled = word[position] * 2
    return ''.join((word[:position], doubled, word[position + 1:]))


def conditional_omit(word):
    """Drop one letter at the first doubled letter or adjacent vowel pair.

    Scans neighbouring character pairs from the left; at the first pair that
    is either two identical characters or two vowels, the left character of
    the pair is removed. The word is returned unchanged when no pair matches.
    """
    vowels = "aeiou"
    for position, (left, right) in enumerate(zip(word, word[1:])):
        is_double = left == right
        is_vowel_pair = left in vowels and right in vowels
        if is_double or is_vowel_pair:
            return word[:position] + word[position + 1:]
    return word


# Error functions applied to every correct word
error_functions = [replace_random, replace_phonetic, random_insert, random_duplicate, conditional_omit]

# Seed the generator so that re-running the cell regenerates the same dataset.
# Use random.seed(None) instead to draw a fresh sample on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate dataset: rows are (word, label, original_word)
dataset = []

for raw_word in correct_words:
    word = raw_word.strip()  # Remove whitespace
    if not word:
        continue  # Skip empty lines

    dataset.append((word, "correct", word))  # Store correct word

    for error_function in error_functions:
        for _ in range(5):  # change range depending on how many times you want to produce the error
            misspelled_word = error_function(word)
            dataset.append((misspelled_word, "incorrect", word))

# Sort by the original correct word. list.sort is stable, so within each word
# the "correct" row stays ahead of the misspellings generated from it.
dataset.sort(key=lambda x: x[2].lower())

# Save to CSV
output_csv = "MarianMT_Dataset_Dirty.csv"
with open(output_csv, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["word", "label", "original_word"])
    writer.writerows(dataset)

# Nothing is deduplicated in this cell, so the message no longer says "unique".
print(f"Generated {len(dataset)} sorted spelling verification samples in {output_csv} (duplicates are not removed yet).")
print(dataset[:10])  # Preview first few entries


In [None]:
import pandas as pd

def clean_and_deduplicate_csv(input_file: str, output_file: str):
    """Drop no-op misspellings and duplicate rows from a generated dataset.

    Args:
        input_file: CSV with 'word', 'label' and 'original_word' columns.
        output_file: Path of the cleaned CSV to write (without the index).

    A row is removed when it is labelled 'incorrect' (case-insensitive) but
    its 'word' equals its 'original_word'; exact duplicate rows are removed
    afterwards.

    Cells are read as text with default NA detection disabled. Otherwise
    words such as "null" or "nan" are parsed as missing values, never compare
    equal to themselves, and are written out as empty cells.
    """
    df = pd.read_csv(input_file, dtype=str, keep_default_na=False)

    # "Incorrect" rows whose word is actually the unchanged original.
    is_noop_error = (df['word'] == df['original_word']) & (df['label'].str.lower() == 'incorrect')

    df_cleaned = df[~is_noop_error].drop_duplicates()

    df_cleaned.to_csv(output_file, index=False)
    print(f"Cleaned and deduplicated CSV saved to: {output_file}")

# Example usage
if __name__ == "__main__":
    input_csv = "MarianMT_Dataset_Dirty.csv"              # Replace with your actual file
    output_csv = "MarianMT_Dataset_Clean.csv"
    clean_and_deduplicate_csv(input_csv, output_csv)


In [None]:
import pandas as pd

def main(words_file="word_list_lower.csv",
         dataset_file="MarianMT_Dataset_Clean.csv",
         output_file="Final_MarianMT_Dataset.csv"):
    """Remove 'incorrect' rows whose word is actually a valid word.

    Args:
        words_file: Header-less CSV whose first column lists the valid words.
        dataset_file: CSV with at least 'word' and 'label' columns.
        output_file: Path of the filtered dataset to write.

    A generated misspelling can coincide with another real word; such rows
    are printed and dropped so a valid word is not kept with the 'incorrect'
    label. Read errors and missing columns are reported on stdout and end the
    function early.

    Both files are read as text with default NA detection disabled so that
    words such as "null" or "nan" are not turned into missing values.
    """
    # Load the word list (no header row)
    try:
        words_df = pd.read_csv(words_file, header=None, dtype=str, keep_default_na=False)
        valid_words = words_df.iloc[:, 0].str.lower().unique()
    except Exception as e:
        print(f"❌ Error reading words file: {e}")
        return

    # Load the dataset (with headers)
    try:
        dataset_df = pd.read_csv(dataset_file, dtype=str, keep_default_na=False)
    except Exception as e:
        print(f"❌ Error reading dataset file: {e}")
        return

    # Check required columns
    if "word" not in dataset_df.columns or "label" not in dataset_df.columns:
        print("❌ 'word' and/or 'label' column not found in dataset CSV.")
        return

    # Normalize for case-insensitive comparison without adding helper columns
    # to the frame that is written out.
    word_lower = dataset_df['word'].str.lower()
    label_lower = dataset_df['label'].str.lower()

    # Identify rows to redact
    to_redact = word_lower.isin(valid_words) & (label_lower == 'incorrect')
    redacted_rows = dataset_df[to_redact]

    # Print redacted rows
    if not redacted_rows.empty:
        print("\n=== Redacted Rows ===")
        print(redacted_rows.to_string(index=False))
    else:
        print("\n✅ No rows to redact.")

    # Remove redacted rows and save cleaned dataset
    cleaned_df = dataset_df[~to_redact]
    cleaned_df.to_csv(output_file, index=False)
    print(f"\n✅ Cleaned dataset saved as: {output_file}")
    print(f"Original rows: {len(dataset_df)}, After redaction: {len(cleaned_df)}")


if __name__ == "__main__":
    main()
