In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import sentencepiece as spm

In [2]:
def load_and_preprocess_data(csv_path, max_length=128, batch_size=64, test_size=0.1, data_fraction=0.1):
    # Load CSV file
    df = pd.read_csv(csv_path)

    # Convert all entries to strings and remove any rows with NaN values
    df['english'] = df['english'].astype(str)
    df['hindi'] = df['hindi'].astype(str)
    df = df.dropna()

    # Reduce the dataset size

    english_texts = df['english'].tolist()
    hindi_texts = df['hindi'].tolist()

    # Function to write texts to a temporary file
    def write_to_temp_file(texts, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            for text in texts:
                f.write(text + '\n')

    # Train SentencePiece models
    write_to_temp_file(english_texts, 'temp_english.txt')
    write_to_temp_file(hindi_texts, 'temp_hindi.txt')

    # Train SentencePiece models without explicit padding token
    spm.SentencePieceTrainer.train(input='temp_english.txt', model_prefix='en_model', vocab_size=8000)
    spm.SentencePieceTrainer.train(input='temp_hindi.txt', model_prefix='hi_model', vocab_size=8000)

    en_tokenizer = spm.SentencePieceProcessor()
    hi_tokenizer = spm.SentencePieceProcessor()
    en_tokenizer.load('en_model.model')
    hi_tokenizer.load('hi_model.model')

    # Split the data
    en_train, en_test, hi_train, hi_test = train_test_split(english_texts, hindi_texts, test_size=test_size, random_state=42)

    # Create datasets
    train_dataset = TranslationDataset(en_train, hi_train, en_tokenizer, hi_tokenizer, max_length)
    test_dataset = TranslationDataset(en_test, hi_test, en_tokenizer, hi_tokenizer, max_length)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Return the loaders and tokenizers
    return train_loader, test_loader, en_tokenizer, hi_tokenizer
class TranslationDataset(Dataset):
    def __init__(self, english_texts, hindi_texts, en_tokenizer, hi_tokenizer, max_length):
        self.english_texts = english_texts
        self.hindi_texts = hindi_texts
        self.en_tokenizer = en_tokenizer
        self.hi_tokenizer = hi_tokenizer
        self.max_length = max_length
        self.pad_id = 0 

    def __len__(self):
        return len(self.english_texts)

    def tokenize_and_pad(self, text, tokenizer):
        tokens = tokenizer.encode(text)
        tokens = [tokenizer.bos_id()] + tokens + [tokenizer.eos_id()]
        padding = [self.pad_id] * (self.max_length - len(tokens))
        return (tokens + padding)[:self.max_length]



    def __getitem__(self, idx):
        en_text = self.english_texts[idx]
        hi_text = self.hindi_texts[idx]

        en_tokens = self.tokenize_and_pad(en_text, self.en_tokenizer)
        hi_tokens = self.tokenize_and_pad(hi_text, self.hi_tokenizer)

        return {
            'en_input': torch.tensor(en_tokens),
            'hi_input': torch.tensor(hi_tokens[:-1]),  # Input for decoder (without EOS)
            'hi_output': torch.tensor(hi_tokens[1:])   # Target for decoder (without BOS)
        }

In [3]:
csv_path = 'hindi_english_parallel.csv'
train_loader, test_loader, en_tokenizer, hi_tokenizer = load_and_preprocess_data(csv_path)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: temp_english.txt
  input_format: 
  model_prefix: en_model
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  d

In [4]:
for i, batch in enumerate(train_loader):
    if i == 0:  # Print only for the first batch
        print("English input shape:", batch['en_input'].shape)
        print("English input sample:", batch['en_input'][0])
        print("Hindi input shape:", batch['hi_input'].shape)
        print("Hindi input sample:", batch['hi_input'][0])
        break

English input shape: torch.Size([64, 128])
English input sample: tensor([    1,    82,    12,  4636,    15,   124,   684,     9,   775,   239,
          680,     6,     3,  1220,     7,  1062,     9,   722, 11972,  8197,
            7,    24,   563,    14,  6039,   232,   235,   708,     4,     2,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,

In [5]:
# Vocabulary sizes for the Transformer model
en_vocab_size = len(en_tokenizer)
hi_vocab_size = len(hi_tokenizer)
print(f"English vocabulary size: {en_vocab_size}")
print(f"Hindi vocabulary size: {hi_vocab_size}")

English vocabulary size: 12000
Hindi vocabulary size: 12000


In [4]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer


class TranslationTransformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=4, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=1024, dropout=0.1):
        super(TranslationTransformer, self).__init__()

        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_encoder_layers)

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_decoder_layers)

        self.output_layer = nn.Linear(d_model, tgt_vocab_size)

        self.d_model = d_model
        self.nhead = nhead

    def forward(self, src, tgt):
        #print(f"Input shapes: src = {src.shape}, tgt = {tgt.shape}")

        src_embedded = self.positional_encoding(self.src_embedding(src))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt))

        #print(f"Embedded shapes: src = {src_embedded.shape}, tgt = {tgt_embedded.shape}")

        src_padding_mask = (src == 0)  # Shape: (batch_size, src_len)
        tgt_padding_mask = (tgt == 0)  # Shape: (batch_size, tgt_len)

        #print(f"Padding mask shapes: src = {src_padding_mask.shape}, tgt = {tgt_padding_mask.shape}")

        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        #print(f"Target mask shape: {tgt_mask.shape}")

        memory = self.transformer_encoder(src_embedded.transpose(0, 1), src_key_padding_mask=src_padding_mask)
        #print(f"Memory shape: {memory.shape}")

        output = self.transformer_decoder(tgt_embedded.transpose(0, 1), memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_padding_mask)
        #print(f"Decoder output shape: {output.shape}")

        return self.output_layer(output)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.optim.lr_scheduler import OneCycleLR



BATCH_SIZE = 32
MAX_LENGTH = 128
LEARNING_RATE = 0.0005
EPOCHS = 1
D_MODEL = 256  # Reduced from 512
NHEAD = 8  # Reduced from 8
NUM_ENCODER_LAYERS = 3  # Reduced from 6
NUM_DECODER_LAYERS = 3  # Reduced from 6
DIM_FEEDFORWARD = 1024  # Reduced from 2048
DROPOUT = 0.1
PATIENCE = 5




# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TranslationTransformer(
    src_vocab_size=len(en_tokenizer),
    tgt_vocab_size=len(hi_tokenizer),
    d_model=D_MODEL,
    nhead=NHEAD,
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    dim_feedforward=DIM_FEEDFORWARD,
    dropout=DROPOUT
).to(device)



# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming 0 is the padding index
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, epochs=EPOCHS, steps_per_epoch=len(train_loader))





In [8]:
import os
import torch
import time
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F

def calculate_accuracy(output, target):
    output_flat = output.argmax(dim=-1).contiguous().view(-1)
    target_flat = target.contiguous().view(-1)
    mask = target_flat != 0  # Ignore padding tokens
    correct = torch.sum(output_flat[mask] == target_flat[mask]).item()
    total = torch.sum(mask).item()
    return correct, total

def train(model, train_loader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    total_correct = 0
    total_tokens = 0
    start_time = time.time()

    print("Starting training...")
    print(f"Number of batches: {len(train_loader)}")

    for i, batch in enumerate(train_loader):
        src = batch['en_input'].to(device)
        tgt = batch['hi_input'].to(device)
        tgt_output = batch['hi_output'].to(device)

        optimizer.zero_grad()
        try:
            output = model(src, tgt)
            output = output.transpose(0, 1)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
            loss.backward()
            
            # Add gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            
            total_loss += loss.item()

            # Calculate accuracy
            correct, total = calculate_accuracy(output, tgt_output)
            total_correct += correct
            total_tokens += total

            if i == 0:
                print(f"Processed first batch in {time.time() - start_time:.2f} seconds")
                print(f"First batch loss: {loss.item():.4f}")

        except Exception as e:
            print(f"Error in batch {i}:")
            print(f"src shape: {src.shape}")
            print(f"tgt shape: {tgt.shape}")
            print(f"tgt_output shape: {tgt_output.shape}")
            print(f"output shape: {output.shape}")
            raise e

        if (i + 1) % 10 == 0:  # Print every 10 batches
            current_time = time.time() - start_time
            print(f"Processed {i + 1} batches. Time elapsed: {current_time:.2f} seconds")
            print(f"Current loss: {loss.item():.4f}")
            print(f"Current accuracy: {correct/total:.4f}")
            print(f"Current learning rate: {optimizer.param_groups[0]['lr']:.6f}")

    epoch_time = time.time() - start_time
    avg_loss = total_loss / len(train_loader)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0
    print(f"Epoch completed in {epoch_time:.2f} seconds")
    return avg_loss, accuracy, epoch_time

def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_tokens = 0
    start_time = time.time()
    
    with torch.no_grad():
        for batch in test_loader:
            src = batch['en_input'].to(device)
            tgt = batch['hi_input'].to(device)
            tgt_output = batch['hi_output'].to(device)

            output = model(src, tgt)
            output = output.transpose(0, 1)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
            total_loss += loss.item()

            # Calculate accuracy
            correct, total = calculate_accuracy(output, tgt_output)
            total_correct += correct
            total_tokens += total

    avg_loss = total_loss / len(test_loader)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0
    eval_time = time.time() - start_time
    print(f"Evaluation completed in {eval_time:.2f} seconds")
    return avg_loss, accuracy

# Create a directory to store the models
checkpoint_dir = 'model_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Initialize variables for training from scratch
start_epoch = 0
best_val_loss = float('inf')
patience_counter = 0

# Main training loop
total_start_time = time.time()
print(f"Starting training from scratch with full dataset")
print(f"Total number of epochs: {EPOCHS}")
print(f"Patience for early stopping: {PATIENCE}")

for epoch in range(start_epoch, EPOCHS):
    epoch_start_time = time.time()
    print(f"\n{'='*20} Epoch {epoch+1}/{EPOCHS} {'='*20}")
    
    # Train
    train_loss, train_acc, train_time = train(model, train_loader, optimizer, scheduler, criterion, device)
    
    # Validate
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)
    
    scheduler.step(val_loss)  # Step the scheduler

    
    epoch_time = time.time() - epoch_start_time
    
    # Print epoch results
    print(f"\nEpoch {epoch+1} Summary:")
    print(f"Train loss: {train_loss:.4f} | Train accuracy: {train_acc:.4f}")
    print(f"Val loss: {val_loss:.4f} | Val accuracy: {val_acc:.4f}")
    print(f"Epoch time: {epoch_time:.2f} seconds")
    print(f"Learning rate: {optimizer.param_groups[0]['lr']:.6f}")
    
    # Save model after every epoch
    epoch_checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch+1}.pth')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_loss': train_loss,
        'val_loss': val_loss,
        'train_acc': train_acc,
        'val_acc': val_acc,
    }, epoch_checkpoint_path)
    print(f"Saved model checkpoint for epoch {epoch+1}")
    
    # Check for improvement
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        print("New best model found. Saving best model checkpoint...")
        best_model_path = os.path.join(checkpoint_dir, 'best_model.pth')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'train_acc': train_acc,
            'val_acc': val_acc,
        }, best_model_path)
    else:
        patience_counter += 1
        print(f"No improvement in validation loss. Patience: {patience_counter}/{PATIENCE}")
    
    # Early stopping
    if patience_counter >= PATIENCE:
        print(f"Early stopping triggered after {epoch+1} epochs")
        break

total_time = time.time() - total_start_time
print(f"\nTotal training time: {total_time:.2f} seconds ({total_time/3600:.2f} hours)")

# Final evaluation
print("\nPerforming final evaluation...")
best_model_path = os.path.join(checkpoint_dir, 'best_model.pth')
model.load_state_dict(torch.load(best_model_path)['model_state_dict'])
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f'Final Test loss: {test_loss:.4f} | Test accuracy: {test_acc:.4f}')

# Save the final model
final_model_path = os.path.join(checkpoint_dir, 'final_translation_model.pth')
torch.save(model.state_dict(), final_model_path)
print("Final model saved.")

Starting training from scratch with full dataset
Total number of epochs: 1
Patience for early stopping: 5

Starting training...
Number of batches: 21964




Processed first batch in 1.01 seconds
First batch loss: 9.5711
Processed 10 batches. Time elapsed: 4.46 seconds
Current loss: 9.3535
Current accuracy: 0.0012
Current learning rate: 0.000020
Processed 20 batches. Time elapsed: 8.30 seconds
Current loss: 9.1218
Current accuracy: 0.0500
Current learning rate: 0.000020
Processed 30 batches. Time elapsed: 12.13 seconds
Current loss: 9.0154
Current accuracy: 0.0449
Current learning rate: 0.000020
Processed 40 batches. Time elapsed: 15.99 seconds
Current loss: 8.8491
Current accuracy: 0.0617
Current learning rate: 0.000020
Processed 50 batches. Time elapsed: 19.83 seconds
Current loss: 8.7317
Current accuracy: 0.0658
Current learning rate: 0.000020
Processed 60 batches. Time elapsed: 23.67 seconds
Current loss: 8.7525
Current accuracy: 0.0552
Current learning rate: 0.000020
Processed 70 batches. Time elapsed: 27.53 seconds
Current loss: 8.6353
Current accuracy: 0.0548
Current learning rate: 0.000020
Processed 80 batches. Time elapsed: 31.40 s



Saved model checkpoint for epoch 1
New best model found. Saving best model checkpoint...

Total training time: 9210.60 seconds (2.56 hours)

Performing final evaluation...


  model.load_state_dict(torch.load(best_model_path)['model_state_dict'])


Evaluation completed in 332.11 seconds
Final Test loss: 5.2892 | Test accuracy: 0.1987
Final model saved.


In [6]:
from nltk.translate.bleu_score import corpus_bleu


def load_model(model_path, model_params, device):
    model = TranslationTransformer(**model_params).to(device)
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    return model



def evaluate(model, data_loader, criterion, device, en_tokenizer, hi_tokenizer):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_tokens = 0
    all_predictions = []
    all_references = []

    with torch.no_grad():
        for batch in data_loader:
            src = batch['en_input'].to(device)
            tgt = batch['hi_input'].to(device)
            tgt_output = batch['hi_output'].to(device)

            output = model(src, tgt)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))

            total_loss += loss.item()

            output = output.transpose(0, 1)
            _, predicted = torch.max(output, dim=2)

            correct = (predicted == tgt_output) & (tgt_output != 0)
            total_correct += correct.sum().item()
            total_tokens += (tgt_output != 0).sum().item()

            # Decode token IDs to text
            pred_texts = [hi_tokenizer.decode(pred.tolist()) for pred in predicted]
            ref_texts = [hi_tokenizer.decode(ref.tolist()) for ref in tgt_output]

            all_predictions.extend(pred_texts)
            all_references.extend(ref_texts)

    accuracy = total_correct / total_tokens if total_tokens > 0 else 0
    avg_loss = total_loss / len(data_loader)

    # Calculate BLEU score using decoded text
    references = [[ref.split()] for ref in all_references]  # BLEU expects a list of tokens
    hypotheses = [hyp.split() for hyp in all_predictions]
    bleu_score = corpus_bleu(references, hypotheses)

    return avg_loss, accuracy, bleu_score


In [7]:
# Load the saved model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define your model parameters
model_params = {
    'src_vocab_size': 8000,  # Adjust based on your tokenizer
    'tgt_vocab_size': 8000,  # Adjust based on your tokenizer
    'd_model': 256,
    'nhead': 8,
    'num_encoder_layers': 3,
    'num_decoder_layers': 3,
    'dim_feedforward': 1024,
    'dropout': 0.1
}

model_path = 'model_checkpoints_old_2/best_model.pth'
model = load_model(model_path, model_params, device)

  checkpoint = torch.load(model_path, map_location=device)


In [11]:
test_loss, test_accuracy, test_bleu = evaluate(model, test_loader, criterion, device, en_tokenizer, hi_tokenizer)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, BLEU Score: {test_bleu:.4f}')

Test Loss: 8.9991, Test Accuracy: 0.1987, BLEU Score: 0.0000


In [8]:
import torch
import torch.nn.functional as F

def beam_search_translate(model, sentence, en_tokenizer, hi_tokenizer, device, beam_size=5, max_length=64):
    model.eval()
    tokens = en_tokenizer.encode(sentence)
    source = torch.LongTensor([[en_tokenizer.bos_id()] + tokens + [en_tokenizer.eos_id()]]).to(device)
    
    # Initialize the beam
    beam = [([hi_tokenizer.bos_id()], 0)]
    completed_translations = []

    for _ in range(max_length):
        candidates = []
        for seq, score in beam:
            if seq[-1] == hi_tokenizer.eos_id():
                completed_translations.append((seq, score))
                continue

            target = torch.LongTensor([seq]).to(device)
            with torch.no_grad():
                output = model(source, target)
                # Transpose output to (batch_size, sequence_length, vocab_size)
                output = output.transpose(0, 1)
                log_probs = F.log_softmax(output[:, -1], dim=-1)

            # Get top k candidates
            values, indices = log_probs.topk(beam_size)
            for i in range(beam_size):
                candidates.append((seq + [indices[0, i].item()], score + values[0, i].item()))

        # Select top beam_size candidates
        beam = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]

        # Stop if all beams have reached the end token
        if all(b[0][-1] == hi_tokenizer.eos_id() for b in beam):
            break

    # If no translations completed, force end token on top beam
    if not completed_translations:
        completed_translations = [(beam[0][0] + [hi_tokenizer.eos_id()], beam[0][1])]

    # Return the best translation
    best_translation = max(completed_translations, key=lambda x: x[1])
    translated = hi_tokenizer.decode(best_translation[0])
    return translated

# Test the translation
test_sentences = [
    "What is your name?",
    "The quick brown fox jumps over the lazy dog.",
    "I love eating pizza and pasta.",
    "Can you help me solve this math problem?",
    "The concert was amazing, but the venue was too crowded.",
    "How does photosynthesis work in plants?",
    "She sells seashells by the seashore.",
    "To be or not to be, that is the question.",
    "Climate change is affecting ecosystems worldwide.",
    "What's the capital city of France?",
    "The library is closed on Sundays and public holidays.",
    "I can't believe it's not butter!",
    "Do you prefer coffee or tea in the morning?",
    "The Eiffel Tower was built in 1889.",
    "My favorite color is blue, what's yours?",
    "Please remember to bring your umbrella, it might rain later.",
    "The password must contain at least one uppercase letter and one number.",
    "How many planets are there in our solar system?",
    "I'm learning to play the guitar, but it's quite challenging.",
    "Would you like to go to the movies this weekend?"
]

for sentence in test_sentences:
    translated = beam_search_translate(model, sentence, en_tokenizer, hi_tokenizer, device)
    print(f"English: {sentence}")
    print(f"Hindi: {translated}")
    print()



English: What is your name?
Hindi: आपका नाम आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम क्या है? आपका नाम

English: The quick brown fox jumps over the lazy dog.
Hindi: लाल काले काले के नीचे के नीचे के नीचे काली के नीचे के नीचे के नीचे गिरे के नीचे गिरे के नीचे गिरे के नीचे गिरे के नीचे गिरे को काट डालता है। ज्वरी के नीचे गिरी के नीचे गिरे के नीचे गिरे के नीचे गिरे हुए

English: I love eating pizza and pasta.
Hindi: मैं प्यार कर रहा हूँ प्यार मैं प्यार करता हूं और प्रेमी प्रेम और प्रेमी प्यार करते हैं। मैं प्यार करता हूँ मैं प्यार करता हूँ और प्यार करता हूँ। मैं प्यार करता हूँ। मैं प्यार करता हूँ और मैं प्यार करता हूँ। मैं प्यार करता हूँ मैं प्यार करता हूँ। मैं प्यार करता हूँ और प्रेम

English: Can you help me solve this math problem?
Hindi: क्या आप मुझे इस समस्या में मदद कर सकते हैं कि आप मुझे इस समस्या में मदद कर सकते हैं कि

In [5]:
def count_lines(file_path):
    with open(file_path, 'r') as file:
        line_count = sum(1 for line in file)
    return line_count

# Example usage
file_path = 'temp_hindi.txt'
num_lines = count_lines(file_path)
print(f"The file contains {num_lines} lines.")

The file contains 1561841 lines.
