In [113]:
import torch
import torch.nn as nn
import sentencepiece as spm
from encoder_rnn import Encoder
from decoder_rnn import Decoder
from seq2seq_rnn import Seq2Seq


In [114]:
sp_en = spm.SentencePieceProcessor()
sp_en_w = spm.SentencePieceProcessor()
sp_en.load("../data/processed/spm_eng_n.model")
sp_en_w.load("../data/processed/spm_eng_word.model") 

sp_npi = spm.SentencePieceProcessor()
sp_npi_w = spm.SentencePieceProcessor()
sp_npi.load("../data/processed/spm_npi_e.model")
sp_npi_w.load("../data/processed/spm_npi_word.model")

True

In [115]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cpu")
device

device(type='cpu')

In [118]:
# INPUT_DIM = 3000
# OUTPUT_DIM =3000
# ENC_EMB_DIM = 128
# DEC_EMB_DIM = 128
# HID_DIM = 256

# Define parameters
INPUT_DIM = 4000   # size of src vocab
OUTPUT_DIM = 4000  # size of trg vocab
INPUT_DIM_WORD = 3200   # size of src vocab for word-level
OUTPUT_DIM_WORD = 3200  # size of trg vocab for word-level
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 512

In [119]:
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM)
# For word-level models
encoder_w = Encoder(INPUT_DIM_WORD, ENC_EMB_DIM, HIDDEN_DIM)
decoder_w = Decoder(OUTPUT_DIM_WORD, DEC_EMB_DIM, HIDDEN_DIM)

model = Seq2Seq(encoder, decoder, device).to(device)
model2 = Seq2Seq(encoder_w, decoder_w, device).to(device)


In [120]:
model.load_state_dict(torch.load("seq2seq_gru_eng_npi_model.pt", map_location=device))
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4000, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(256, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(4000, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(256, 512, batch_first=True)
    (fc_out): Linear(in_features=512, out_features=4000, bias=True)
  )
)

In [121]:
model2.load_state_dict(torch.load("seq2seq_gru_eng_npi_model_word.pt", map_location=device))
model2.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(3200, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(256, 512, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(3200, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(256, 512, batch_first=True)
    (fc_out): Linear(in_features=512, out_features=3200, bias=True)
  )
)

In [107]:
def translate_sentence(sentence, model, sp_en, sp_de, device, max_len=20):
    model.eval()
    tokens = sp_en.encode(sentence, out_type=int)
    tokens = [2] + tokens + [3]  # BOS and EOS tokens
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)  # batch_size=1, seq_len=...

    with torch.no_grad():
        hidden = model.encoder(src_tensor)

    trg_indexes = [2]  # BOS token

    for _ in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden)
            pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        if pred_token == 3:  # EOS token
            break

    translated_text = sp_de.decode(trg_indexes[1:-1])  # remove BOS and EOS
    return translated_text


In [137]:
english_sentence = "Today is my birthday"

try:
    translation = translate_sentence(english_sentence, model, sp_en, sp_npi, device)
    print("English:", english_sentence)
    print("Nepali:", translation)
    
except Exception as e:
    print("Error during translation:", e)

English: Today is my birthday
Nepali: मेरो जन्मदिन मेरो नाम हो।


BLEU score

In [71]:
# !pip install nltk

In [72]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Saurav
[nltk_data]     Karki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [127]:
import pandas as pd

# Path to your file
file_path = "../data/raw/npi-eng/npi.txt"

# Read the file, split by tabs, and keep only the first two columns (English and German)
df = pd.read_csv(file_path, sep="\t", header=None, usecols=[0,1], names=["English", "Nepali"])

df.head()


Unnamed: 0,English,Nepali
0,Who?,को?
1,Hide.,लुकाउनुहोस्।
2,Hide.,लुक।
3,Stay.,बस्नुहोस्।
4,Hello!,नमस्ते!


In [128]:
new_df = df.sample(500, random_state=42)  # Randomly sample 500 rows
new_df.reset_index(drop=True, inplace=True)
new_df.head()

Unnamed: 0,English,Nepali
0,A friend lent me that book.,त्यो किताब साथिबाट उधारो पाएको हो।
1,What color is the roof of your house?,तपाईको घरको छानो कस्तो रङको छ?
2,I was very upset.,म निकै उदास थिएँ ।
3,I went sightseeing.,म घुम्न गएँ ।
4,Japan began to import rice from the United Sta...,जापानले अमेरिकाबाट चामल आयात गर्न थाल्यो।


In [129]:
test_sentences = new_df['English'].tolist()  # list of English sentences (source)
reference_sentences = new_df['Nepali'].tolist()  # list of German sentences (reference)

print(test_sentences[:3])

print(reference_sentences[:3])
# 

['A friend lent me that book.', 'What color is the roof of your house?', 'I was very upset.']
['त्यो किताब साथिबाट उधारो पाएको हो।', 'तपाईको घरको छानो कस्तो रङको छ?', 'म निकै उदास थिएँ ।']


In [130]:
def translate_sentences(sentences, model, sp_en, sp_de, device):
    translations = []
    for sent in sentences:
        # print(f"Translating: {sent}")
        translation = translate_sentence(sent, model, sp_en, sp_de, device)
        # print(f"Translation: {translation}")
        translations.append(translation)
    return translations

In [131]:
input_data = new_df['English'].tolist()  # list of English sentences (source)
input_data[:3]

['A friend lent me that book.',
 'What color is the roof of your house?',
 'I was very upset.']

In [132]:
#making References
# Original list
refs = new_df['Nepali'].tolist()

# Convert to list of lists (each inner list with one reference string)
references = [[ref] for ref in refs]

print(references[:3])


[['त्यो किताब साथिबाट उधारो पाएको हो।'], ['तपाईको घरको छानो कस्तो रङको छ?'], ['म निकै उदास थिएँ ।']]


In [133]:
# candidates are model translations

candidates = translate_sentences(input_data, model, sp_en, sp_npi, device)
print("Candidates:", candidates[:3])



Candidates: ['हामीले मलाई गर्न सजिलो त्यो गर्न दिनुहोस्।', 'संसारको सबैभन्दा अग्लो कति छ?', 'धेरै धेरै धेरै आए।']


### **BLEU, METEOR, and chrF (with NLTK and SacreBLEU)**

In [23]:
# !pip install sacrebleu

In [134]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from sacrebleu.metrics import CHRF



# Tokenize (basic whitespace tokenization)
tokenized_refs = [[ref.split() for ref in refs] for refs in references]
tokenized_cands = [cand.split() for cand in candidates]



In [81]:
# 1. BLEU Score
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


smooth = SmoothingFunction().method1

# Calculate BLEU with weights to only consider 1-gram
bleu_1gram = corpus_bleu(references, candidates, weights=(1, 0, 0, 0) , smoothing_function=smooth)
bleu_2gram = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
bleu_3gram = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth)
bleu_4gram = corpus_bleu(references, candidates, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)

print(f"1-gram BLEU score: {bleu_1gram:.4f}")
print(f"2-gram BLEU score: {bleu_2gram:.4f}")
print(f"3-gram BLEU score: {bleu_3gram:.4f}")
print(f"4-gram BLEU score: {bleu_4gram:.4f}")

1-gram BLEU score: 0.1262
2-gram BLEU score: 0.0612
3-gram BLEU score: 0.0431
4-gram BLEU score: 0.0319


In [135]:
# 2. CHRF Score

from sacrebleu.metrics import CHRF


chrf_metric = CHRF()
chrf_score = chrf_metric.corpus_score(candidates, references)
print(f"CHRF SCORE: {chrf_score.score:.4f}")



CHRF SCORE: 45.0758
