In [None]:
!pip install transformers datasets torch
!pip install sacremoses

In [None]:
import pandas as pd

# Load the parallel corpus: line i of the English file is paired with line i of the Khasi file.
with open('/kaggle/input/khasi-english-dataset/eng1.txt', 'r', encoding='utf-8') as f:
    english_sentences = f.readlines()

with open('/kaggle/input/khasi-english-dataset/khasi1.txt', 'r', encoding='utf-8') as f:
    khasi_sentences = f.readlines()

# The pairing is purely positional, so a length mismatch means the pairs cannot be trusted.
assert len(english_sentences) == len(khasi_sentences), (
    f"Line count mismatch: {len(english_sentences)} English lines vs "
    f"{len(khasi_sentences)} Khasi lines"
)

# Build the DataFrame from the whitespace-stripped lines.
data = {'en': [en.strip() for en in english_sentences],
        'kha': [kha.strip() for kha in khasi_sentences]}

df = pd.DataFrame(data)

# Drop pairs where either side is blank. After .strip() a blank line is the empty
# string '' (never None/NaN), so dropna() would remove nothing here.
has_text = (df['en'] != '') & (df['kha'] != '')
df = df[has_text].reset_index(drop=True)

# Save to a CSV file (written relative to the current working directory).
df.to_csv('translation_dataset.csv', index=False)

print(f"Dataset saved with {len(df)} valid rows.")


In [None]:
# Inspect the DataFrame built in the previous cell (a bare last expression is rendered by the notebook).
df

In [None]:
import os

import wandb

# Authenticate with Weights & Biases without embedding the API key in the notebook.
# The key is read from the WANDB_API_KEY environment variable; when the variable is
# unset, key=None is passed and wandb uses its own credential lookup / prompt.
# SECURITY: an API key used to be hardcoded in this cell in plain text; that key
# should be considered leaked and be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer, MarianMTModel, Trainer, TrainingArguments

# Load the sentence pairs from the CSV file. dtype=str keeps both columns textual even
# when a sentence happens to look like a number.
df = pd.read_csv('/kaggle/working/translation_dataset.csv', dtype=str)

# pd.read_csv parses empty fields (and literals such as "NA" or "null") as NaN, so a
# pair can lose one side during the CSV round trip. Drop those rows so that both the
# train and the test split contain only real string pairs.
df = df.dropna(subset=['en', 'kha']).reset_index(drop=True)

# 80/20 train/test split; the fixed random_state makes the split reproducible.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict holding both splits
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Initialize the tokenizer and model from the same pretrained checkpoint.
# NOTE(review): the checkpoint name reads as English -> multilingual, while the
# preprocessing in this notebook uses Khasi as the source and English as the target.
# Confirm that starting from this checkpoint is intended.
model_name = 'Helsinki-NLP/opus-mt-en-mul'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def preprocess_function(examples, max_length=512):
    """Tokenize a batch of Khasi/English pairs for seq2seq training.

    Args:
        examples: Batch mapping with a 'kha' column (source sentences) and an
            'en' column (target sentences), as passed by ``Dataset.map`` with
            ``batched=True``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        A dict with 'input_ids', 'attention_mask' and 'labels', one entry per
        valid pair. Pairs where either side is not a string are skipped (and
        reported with a print), so the output may have fewer rows than the input.

    Uses the module-level ``tokenizer``.
    """
    inputs = examples['kha']
    targets = examples['en']

    valid_inputs = []
    valid_targets = []

    for i, (inp, tgt) in enumerate(zip(inputs, targets)):
        if isinstance(tgt, str) and isinstance(inp, str):
            valid_inputs.append(inp)
            valid_targets.append(tgt)
        else:
            print(f"Skipping invalid input/target pair at index {i}: {inp}, {tgt}")

    # Nothing usable in this batch: return empty columns instead of calling the
    # tokenizer on an empty list.
    if not valid_inputs:
        return {'input_ids': [], 'attention_mask': [], 'labels': []}

    # Tokenize inputs and targets to a fixed length.
    # NOTE(review): the targets go through the same tokenizer call as the sources;
    # confirm whether target-side tokenization (text_target=...) is wanted here.
    model_inputs = tokenizer(valid_inputs, max_length=max_length, truncation=True, padding="max_length")
    labels = tokenizer(valid_targets, max_length=max_length, truncation=True, padding="max_length")

    # Replace padding positions in the labels with -100 so the loss ignores them;
    # otherwise the (mostly padding) fixed-length labels dominate the training loss.
    pad_id = tokenizer.pad_token_id
    masked_labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels['input_ids']
    ]

    batch = {key: value for key, value in model_inputs.items()}
    batch['labels'] = masked_labels
    return batch

# Run preprocess_function over both splits in batches; the raw columns are removed so
# only the columns returned by preprocess_function remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

# Fine-tuning hyperparameters, grouped by purpose.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='/kaggle/working/results',
    # schedule
    num_train_epochs=50,
    evaluation_strategy='epoch',
    # optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # checkpointing
    save_total_limit=1,
)

# Wire the model, the arguments and the two tokenized splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dataset['test'],
    train_dataset=tokenized_dataset['train'],
)

# Start fine-tuning.
trainer.train()

In [None]:
!pip install sacrebleu

In [None]:
# Save the tokenizer files next to the fine-tuned weights so the directory can be
# reloaded on its own; the Trainer was created without a tokenizer, so save_model()
# alone does not write them.
tokenizer.save_pretrained('/kaggle/working/trained_model')
trainer.save_model('/kaggle/working/trained_model')

In [None]:
import torch
from transformers import MarianTokenizer, MarianMTModel, pipeline
from datasets import load_metric

# Where things come from: fine-tuned weights from the working directory, tokenizer
# from the original pretrained checkpoint.
model_path = '/kaggle/working/trained_model'
model_name = 'Helsinki-NLP/opus-mt-en-mul'

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_path)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Preprocess test dataset
def preprocess_test_function(examples, max_length=512):
    """Tokenize the Khasi source sentences of a test batch.

    Args:
        examples: Batch mapping with a 'kha' column of source sentences.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output for the batch, with one row per input row.

    Entries that are not strings (e.g. None/NaN from missing CSV fields) are
    tokenized as the empty string instead of being passed to the tokenizer
    as-is, so the output keeps the same number of rows as the input.
    Uses the module-level ``tokenizer``.
    """
    inputs = [text if isinstance(text, str) else "" for text in examples['kha']]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    return model_inputs

# Tokenize the test split in batches, replacing its original columns with the tokenizer outputs.
tokenized_test_dataset = test_dataset.map(
    preprocess_test_function,
    remove_columns=test_dataset.column_names,
    batched=True,
)

# Generate predictions
def generate_predictions(batch):
    """Run generation on a batch of tokenized inputs and decode the result.

    Args:
        batch: Mapping from model input name to a nested list of values; each
            value is turned into a tensor on the module-level ``device`` and
            passed to ``model.generate`` as a keyword argument.

    Returns:
        The list of decoded strings produced by ``tokenizer.batch_decode``
        with special tokens skipped.
    """
    tensors = {}
    for name, values in batch.items():
        tensors[name] = torch.tensor(values).to(device)

    # Gradient tracking is disabled for generation.
    with torch.no_grad():
        generated = model.generate(**tensors)

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded

predictions = []
references = []

# Translate the test set one example at a time and collect the reference translations.
for i in range(len(tokenized_test_dataset)):
    raw_example = test_dataset[i]
    source_text = raw_example['kha']
    reference_text = raw_example['en']

    # A pair with a missing side cannot be scored (and .split() would fail on a
    # non-string reference), so skip it.
    if not (isinstance(source_text, str) and isinstance(reference_text, str)):
        continue

    encoded = tokenized_test_dataset[i]
    batch = {'input_ids': [encoded['input_ids']]}
    # The inputs are padded to a fixed length, so pass the attention mask along
    # explicitly rather than leaving generate() to work it out.
    if 'attention_mask' in encoded:
        batch['attention_mask'] = [encoded['attention_mask']]

    prediction = generate_predictions(batch)
    predictions.append(prediction[0])
    references.append(reference_text)

if not predictions:
    raise ValueError("No valid test pairs to evaluate.")

# Calculate BLEU score on whitespace-tokenized predictions and references.
# NOTE(review): `load_metric` is deprecated in `datasets` and absent from newer
# releases - confirm the installed version still provides it.
bleu = load_metric("bleu")
results = bleu.compute(predictions=[pred.split() for pred in predictions],
                       references=[[ref.split()] for ref in references])

print(f"BLEU score: {results['bleu']}")


In [None]:
import torch
from transformers import MarianTokenizer, MarianMTModel

# Reload for inference: fine-tuned weights from the working directory, tokenizer from
# the original pretrained checkpoint.
model_path = '/kaggle/working/trained_model'
model_name = 'Helsinki-NLP/opus-mt-en-mul'

model = MarianMTModel.from_pretrained(model_path)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Function to translate Khasi text to English
def translate_khasi_to_english(khasi_text):
    """Translate one Khasi sentence with the module-level ``model`` and ``tokenizer``.

    Args:
        khasi_text: The sentence to translate.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        skipped.
    """
    # Tokenize the input text. A single sentence needs no padding, so it is only
    # truncated to at most 512 tokens instead of being padded up to 512.
    encoded = tokenizer(khasi_text, return_tensors="pt", max_length=512, truncation=True)

    # Move the tensors to wherever the model lives, so this also works when the
    # model has been moved to a GPU.
    target_device = model.device
    inputs = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    # Generate translation using the model; gradient tracking is disabled.
    with torch.no_grad():
        translated_tokens = model.generate(**inputs)

    # Decode the tokens to get the translated text
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Demo: translate one sample sentence and show the source next to the result.
khasi_text = "Nga don u soh apple uba saw"
english_translation = translate_khasi_to_english(khasi_text)
print(f"Khasi: {khasi_text}", f"English: {english_translation}", sep="\n")
