In [1]:
!pip install transformers datasets torch
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
import pandas as pd

# Load your data
with open('/kaggle/input/khasi-english-dataset/eng1.txt', 'r', encoding='utf-8') as f:
    english_sentences = f.readlines()

with open('/kaggle/input/khasi-english-dataset/khasi1.txt', 'r', encoding='utf-8') as f:
    khasi_sentences = f.readlines()

# Ensure both files have the same number of lines
assert len(english_sentences) == len(khasi_sentences)

# Create a DataFrame
data = {'en': [en.strip() for en in english_sentences],
        'kha': [kha.strip() for kha in khasi_sentences]}

df = pd.DataFrame(data)

# Remove rows with None values
df = df.dropna()

# Save to a CSV file
df.to_csv('translation_dataset.csv', index=False)

print(f"Dataset saved with {len(df)} valid rows.")


Dataset saved with 26001 valid rows.


In [3]:
df

Unnamed: 0,en,kha
0,"Behold , therefore I will bring strangers upon...",ngan wanrah ki nongshun kiba sniew ban tur ïal...
1,Now when Jesus was risen early the first day o...,Hadien ba U Jisu u la mihpat na ka jingïap dan...
2,"If men strive , and hurt a woman with child , ...",Lada ki rangbah kiba ïashoh ki pynmynsaw ïa ka...
3,On the eighth day he sent the people away : an...,Ha ka sngi kaba phra u Solomon u phah noh sha ...
4,And they of Ephraim shall be like a mighty man...,Ki paid Israel kin long kiba khlaiñ kum ki shi...
...,...,...
25996,It is sown in dishonour ; it is raised in glor...,"Haba la tep , ka long kaba ijli bad kaba tlot ..."
25997,"That I may know him , and the power of his res...",Baroh kaba nga kwah ka long ba ngan ithuh ïa U...
25998,For I testify again to every man that is circu...,Sa shisien pat nga maham ba uno uno u briew ub...
25999,And shall not uncircumcision which is by natur...,Bad kumta ma phi ki Jiw phin shah pynrem ha ki...


In [4]:
import wandb

# Initialize wandb
wandb.login(key="aca9cf829fa45dbb446b7c861f28378794d2fee7")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer, MarianMTModel, Trainer, TrainingArguments

# Load the data from the CSV file
df = pd.read_csv('/kaggle/working/translation_dataset.csv')

# Split the data into train and test sets
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Initialize the tokenizer and model
model_name = 'Helsinki-NLP/opus-mt-en-mul'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def preprocess_function(examples):
    inputs = examples['en']
    targets = examples['kha']
    
    valid_inputs = []
    valid_targets = []

    for i, (inp, tgt) in enumerate(zip(inputs, targets)):
        if isinstance(tgt, str) and isinstance(inp, str):
            valid_inputs.append(inp)
            valid_targets.append(tgt)
        else:
            print(f"Skipping invalid input/target pair at index {i}: {inp}, {tgt}")

    # Tokenize inputs and targets
    model_inputs = tokenizer(valid_inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(valid_targets, max_length=512, truncation=True, padding="max_length")

    # Ensure labels are correctly aligned with model inputs
    model_inputs['labels'] = labels['input_ids']

    # Ensure all lists in the dictionary have the same length
    min_length = min(len(v) for v in model_inputs.values())
    return {k: v[:min_length] for k, v in model_inputs.items()}

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)


# Define training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/results',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    save_total_limit=1,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

# Train the model
trainer.train()

Map:   0%|          | 0/20801 [00:00<?, ? examples/s]

Skipping invalid input/target pair at index 881: None, None


Map:   0%|          | 0/5200 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.8981,1.571173
2,1.4474,1.42969


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


TrainOutput(global_step=2600, training_loss=1.8351048161433294, metrics={'train_runtime': 211.7162, 'train_samples_per_second': 196.489, 'train_steps_per_second': 12.281, 'total_flos': 176271497625600.0, 'train_loss': 1.8351048161433294, 'epoch': 2.0})

In [8]:
!pip install sacrebleu

  pid, fd = os.forkpty()


Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.0-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.10.0 sacrebleu-2.4.2


In [9]:
trainer.save_model('/kaggle/working/trained_model')

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


In [13]:
import torch
from transformers import MarianTokenizer, MarianMTModel, pipeline
from datasets import load_metric

# Load the trained model
model_name = 'Helsinki-NLP/opus-mt-en-mul'
model_path = '/kaggle/working/trained_model'
model = MarianMTModel.from_pretrained(model_path)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Move the model to CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Preprocess test dataset
def preprocess_test_function(examples):
    inputs = examples['en']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    return model_inputs

tokenized_test_dataset = test_dataset.map(preprocess_test_function, batched=True, remove_columns=test_dataset.column_names)

# Generate predictions
def generate_predictions(batch):
    inputs = {k: torch.tensor(v).to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

predictions = []
references = []

for i in range(len(tokenized_test_dataset)):
    input_text = tokenized_test_dataset[i]['input_ids']
    prediction = generate_predictions({'input_ids': [input_text]})
    predictions.append(prediction[0])
    references.append(test_dataset[i]['kha'])

# Calculate BLEU score
bleu = load_metric("bleu")
results = bleu.compute(predictions=[pred.split() for pred in predictions],
                       references=[[ref.split()] for ref in references])

print(f"BLEU score: {results['bleu']}")


Map:   0%|          | 0/5200 [00:00<?, ? examples/s]

  bleu = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

BLEU score: 0.0031127745988714164


In [18]:
import torch
from transformers import MarianTokenizer, MarianMTModel

# Load the trained model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-mul'
model_path = '/kaggle/working/trained_model'  # Path to the trained model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_path)

# Function to translate Khasi text to English
def translate_khasi_to_english(khasi_text):
    # Tokenize the input text
    inputs = tokenizer(khasi_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    
    # Generate translation using the model
    with torch.no_grad():
        translated_tokens = model.generate(**inputs)
    
    # Decode the tokens to get the translated text
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

# Example usage
english_text = "i have a red apple"
khasi_translation = translate_khasi_to_english(english_text)
print(f"English: {english_text}")
print(f"Khasi: {khasi_translation}")


English: i have a red apple
Khasi: Nga don u masi khyndiat,
