In [17]:
!pip install streamlit




In [18]:
!pip uninstall -y torch torchtext

Found existing installation: torch 2.0.1
Uninstalling torch-2.0.1:
  Successfully uninstalled torch-2.0.1
Found existing installation: torchtext 0.15.2
Uninstalling torchtext-0.15.2:
  Successfully uninstalled torchtext-0.15.2


In [19]:
!pip install torch==2.0.1 torchtext==0.15.2

Collecting torch==2.0.1
  Using cached torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.2
  Using cached torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl.metadata (7.4 kB)
Using cached torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl (619.9 MB)
Using cached torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl (2.0 MB)
Installing collected packages: torch, torchtext
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.5.1+cu124 requires torch==2.5.1, but you have torch 2.0.1 which is incompatible.
torchvision 0.20.1+cu124 requires torch==2.5.1, but you have torch 2.0.1 which is incompatible.[0m[31m
[0mSuccessfully installed torch-2.0.1 torchtext-0.15.2


In [1]:
import streamlit as st
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
import pandas as pd
import time
import math
import os
import tarfile

In [2]:
# Select the compute device: use the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [3]:
# Load and process dictionary dataset
def load_dictionary(file_path):
    """Read a space-separated bilingual dictionary file into a DataFrame.

    Every line is stripped and split once on its first space: the text before
    the space becomes the English entry and everything after it becomes the
    Sinhala entry. Lines that do not produce exactly two parts (blank lines,
    single-word lines) are skipped.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A pandas DataFrame with columns "English" and "Sinhala", one row per
        accepted line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        split_lines = (raw_line.strip().split(" ", 1) for raw_line in handle)
        # Materialise inside the `with` so the file is still open while reading
        pairs = [tuple(fields) for fields in split_lines if len(fields) == 2]
    return pd.DataFrame(pairs, columns=["English", "Sinhala"])

# Define paths
# Location of the English-Sinhala dictionary file inside the notebook workspace
file_path = "/content/En-Si-dict-FastText-V2.txt"
# Parse the file into the working DataFrame (columns "English" and "Sinhala")
df = load_dictionary(file_path=file_path)

In [4]:
# Preview the first 10 parsed dictionary entries (English / Sinhala columns only)
df.head(10)

Unnamed: 0,English,Sinhala
0,2,දෙජාසං
1,A,ඒ
2,"a,s",පරිදි
3,a-class,පන්තිය
4,a-d,දැන්වීම
5,a-day,දින
6,a-day,දවසක්
7,a-go,පෙර
8,a-hole,සිදුරක්
9,a-l,අල්


In [5]:
# Source / target language keys; these double as the DataFrame column names
SRC_LANGUAGE, TRG_LANGUAGE = 'English', 'Sinhala'


In [6]:
# Tokenization
# One tokenizer per language: spaCy's small English pipeline for the source
# side and torchtext's "basic_english" tokenizer for the Sinhala side.
token_transform = {
    SRC_LANGUAGE: get_tokenizer("spacy", language="en_core_web_sm"),
    TRG_LANGUAGE: get_tokenizer("basic_english"),
}

# Tokenize dataset: each new column holds one list of tokens per row
english_tokenizer = token_transform[SRC_LANGUAGE]
sinhala_tokenizer = token_transform[TRG_LANGUAGE]
df["English Tokens"] = df[SRC_LANGUAGE].apply(english_tokenizer)
df["Sinhala Tokens"] = df[TRG_LANGUAGE].apply(sinhala_tokenizer)


In [7]:
# Preview the first 10 rows again, now including the two token-list columns
df.head(10)

Unnamed: 0,English,Sinhala,English Tokens,Sinhala Tokens
0,2,දෙජාසං,[2],[දෙජාසං]
1,A,ඒ,[A],[ඒ]
2,"a,s",පරිදි,"[a, ,, s]",[පරිදි]
3,a-class,පන්තිය,"[a, -, class]",[පන්තිය]
4,a-d,දැන්වීම,"[a, -, d]",[දැන්වීම]
5,a-day,දින,"[a, -, day]",[දින]
6,a-day,දවසක්,"[a, -, day]",[දවසක්]
7,a-go,පෙර,"[a, -, go]",[පෙර]
8,a-hole,සිදුරක්,"[a, -, hole]",[සිදුරක්]
9,a-l,අල්,"[a, -, l]",[අල්]


In [8]:
# Build Vocabulary
def yield_tokens(data_column):
    """Lazily yield every element of ``data_column`` in order.

    The notebook passes a DataFrame column of token lists, so each yielded
    item is the token list of one row.
    """
    yield from data_column

# One vocabulary per language, built from the token columns. The four special
# tokens are listed in the same order for both languages, so their indices
# match across the two vocabularies.
# NOTE(review): min_freq=2 drops every token that occurs only once; for a
# dictionary-style dataset that may map many entries to <unk> - confirm intended.
vocab_transform = {
    SRC_LANGUAGE: build_vocab_from_iterator(
        yield_tokens(df["English Tokens"]),
        min_freq=2,
        specials=["<unk>", "<pad>", "<sos>", "<eos>"],
    ),
    TRG_LANGUAGE: build_vocab_from_iterator(
        yield_tokens(df["Sinhala Tokens"]),
        min_freq=2,
        specials=["<unk>", "<pad>", "<sos>", "<eos>"],
    ),
}

# Unknown tokens resolve to the <unk> index instead of raising on lookup
for ln in (SRC_LANGUAGE, TRG_LANGUAGE):
    unk_index = vocab_transform[ln]["<unk>"]
    vocab_transform[ln].set_default_index(unk_index)


In [9]:
# Define constants
# Indices of the padding / start / end markers, read from the source vocabulary
PAD_IDX, SOS_IDX, EOS_IDX = (
    vocab_transform[SRC_LANGUAGE][special] for special in ("<pad>", "<sos>", "<eos>")
)

# Convert text into tensor
def tensor_transform(token_ids):
    """Wrap vocabulary indices with the <sos>/<eos> markers as a 1-D tensor.

    Args:
        token_ids: Sequence of integer vocabulary indices (may be empty).

    Returns:
        A 1-D ``torch.long`` tensor ``[SOS_IDX, *token_ids, EOS_IDX]``.
    """
    # Force an integer dtype: torch.tensor([]) defaults to float32, and
    # concatenating it with the integer markers would promote the whole result
    # to float, which nn.Embedding cannot index with.
    body = torch.tensor(token_ids, dtype=torch.long)
    return torch.cat((torch.tensor([SOS_IDX]), body, torch.tensor([EOS_IDX])))

# Sequential Transforms
# Per-language pipeline: raw text -> tokens -> vocabulary indices -> tensor
# wrapped with the <sos>/<eos> markers.
text_transform = {}
for ln in [SRC_LANGUAGE, TRG_LANGUAGE]:
    # Bind the current language as a default argument. A plain closure would
    # look `ln` up when the lambda is *called*, i.e. after the loop has ended,
    # so both entries would silently use the last language's tokenizer and
    # vocabulary.
    text_transform[ln] = lambda x, ln=ln: tensor_transform(vocab_transform[ln](token_transform[ln](x)))


In [10]:
# Define dataset and split
from torch.utils.data import random_split # Import the necessary function

# Pair every English entry with its Sinhala translation as raw strings
full_dataset = list(zip(df[SRC_LANGUAGE], df[TRG_LANGUAGE]))
# 80 % of the pairs for training; the remainder is held out for validation
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
# Seed a dedicated generator so the split is identical on every fresh run
# instead of depending on the global RNG state.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data = random_split(full_dataset, [train_size, val_size], generator=split_generator)


In [11]:
# Define Transformer-based Seq2Seq Model
class Seq2SeqTransformer(nn.Module):
    """Small encoder-decoder Transformer over token-index sequences.

    Source and target indices are embedded separately, passed through
    ``nn.Transformer`` (configured with ``batch_first=True``) and projected to
    target-vocabulary logits.

    Args:
        input_dim: Size of the source vocabulary (rows of the source embedding).
        output_dim: Size of the target vocabulary (rows of the target embedding
            and width of the output projection).
        hid_dim: Embedding / model width.
        n_layers: Number of encoder layers and of decoder layers.
        n_heads: Number of attention heads.
        pf_dim: Width of the position-wise feed-forward layers.
        dropout: Dropout probability inside the Transformer.

    NOTE(review): no positional encoding and no padding masks are applied;
    the notebook only ever feeds single, unpadded sequences.
    """

    def __init__(self, input_dim, output_dim, hid_dim=64, n_layers=1, n_heads=2, pf_dim=128, dropout=0.1):
        super().__init__()
        self.encoder = nn.Embedding(input_dim, hid_dim)
        self.decoder = nn.Embedding(output_dim, hid_dim)
        self.transformer = nn.Transformer(d_model=hid_dim, nhead=n_heads, num_encoder_layers=n_layers, num_decoder_layers=n_layers, dim_feedforward=pf_dim, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, src, trg):
        """Return target-vocabulary logits for every position of ``trg``.

        Args:
            src: Integer tensor of source indices, shape (batch, src_len).
            trg: Integer tensor of decoder-input indices, shape (batch, trg_len).

        Returns:
            Float tensor of shape (batch, trg_len, output_dim).
        """
        enc_src = self.encoder(src)
        dec_trg = self.decoder(trg)
        # Causal mask: position i may only attend to positions <= i. Without it
        # the decoder can read the very tokens it is trained to predict
        # (teacher forcing feeds trg[:, :-1] and scores against trg[:, 1:]).
        trg_len = trg.shape[1]
        causal_mask = torch.triu(
            torch.full((trg_len, trg_len), float("-inf"), dtype=dec_trg.dtype, device=dec_trg.device),
            diagonal=1,
        )
        transformer_output = self.transformer(enc_src, dec_trg, tgt_mask=causal_mask)
        return self.fc_out(transformer_output)


In [12]:
# Initialize model
# The vocabulary sizes fix the embedding tables and the output projection width
input_dim = len(vocab_transform[SRC_LANGUAGE])
output_dim = len(vocab_transform[TRG_LANGUAGE])
model = Seq2SeqTransformer(input_dim=input_dim, output_dim=output_dim)
model = model.to(device)
# Adam, with a StepLR schedule that halves the learning rate every 5 scheduler steps
optimizer = optim.Adam(params=model.parameters(), lr=5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)
# Positions whose target is the padding index do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [15]:
# Training function
def train(model, data, optimizer, criterion, scheduler):
    """Run one training epoch over ``data``, one (source, target) pair per step.

    Each raw text pair is converted with ``text_transform`` into index tensors
    of batch size 1. The decoder receives the target without its last token
    and is scored against the target without its first token.

    Args:
        model: Module called as ``model(src, trg_input)`` returning logits.
        data: Iterable of ``(source_text, target_text)`` pairs.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking (N, vocab) logits and (N,) target indices.
        scheduler: Learning-rate scheduler stepped once per epoch, or ``None``.

    Returns:
        Mean loss per pair as a float; ``0.0`` when ``data`` is empty.
    """
    model.train()
    epoch_loss = 0.0
    num_pairs = 0
    for src, trg in data:
        src = text_transform[SRC_LANGUAGE](src).unsqueeze(0).to(device)
        trg = text_transform[TRG_LANGUAGE](trg).unsqueeze(0).to(device)
        optimizer.zero_grad()
        # Teacher forcing: feed trg[:-1], predict trg[1:]
        output = model(src, trg[:, :-1])
        loss = criterion(output.view(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
        loss.backward()
        # Clip the global gradient norm to keep single-sample updates stable
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        epoch_loss += loss.item()
        num_pairs += 1
    if num_pairs == 0:
        # Nothing was trained on: avoid dividing by zero and leave the
        # scheduler untouched (no optimizer step happened this epoch).
        return 0.0
    # The scheduler was previously accepted but never advanced, so the
    # learning rate never decayed; step it once per completed epoch.
    if scheduler is not None:
        scheduler.step()
    return epoch_loss / num_pairs


In [None]:
# Train model
# Number of full passes over the training split; tune here rather than in the loop
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    train_loss = train(model, train_data, optimizer, criterion, scheduler)
    print(f"Epoch {epoch+1}: Training Loss = {train_loss:.3f}")


In [None]:
# Save model
# Only the parameters (state_dict) are written, not the pickled module, so
# loading requires an already constructed model to receive the weights.
torch.save(obj=model.state_dict(), f="best_translator.pth")

In [None]:
# Load model
def load_model(model_path):
    """Load saved weights into the notebook's global ``model`` and return it.

    The checkpoint at ``model_path`` is read as a state dict (tensors mapped
    onto ``device``), copied into the existing global ``model``, and the model
    is switched to evaluation mode.
    """
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Restore the weights written by the save cell
model = load_model("best_translator.pth")


In [None]:
def translate_text(input_text, max_len=50):
    """Translate English text to Sinhala with greedy autoregressive decoding.

    The source is tokenized, mapped to indices and wrapped with <sos>/<eos>.
    Decoding starts from <sos> alone; at every step the model is run on the
    tokens generated so far and the highest-scoring next token is appended,
    until <eos> is produced or ``max_len`` tokens have been generated.
    Previously the source tensor itself was passed as the decoder input, so
    source-vocabulary ids were looked up in the target embedding and nothing
    was actually decoded.

    Args:
        input_text: English text to translate.
        max_len: Upper bound on the number of generated tokens.

    Returns:
        The generated target tokens joined by single spaces (without the
        <sos>/<eos> markers); may be an empty string.
    """
    tokens = token_transform[SRC_LANGUAGE](input_text)
    token_ids = vocab_transform[SRC_LANGUAGE](tokens)
    tensor_input = tensor_transform(token_ids).unsqueeze(0).to(device)

    generated = [SOS_IDX]
    model.eval()  # disable dropout so decoding is deterministic
    with torch.no_grad():
        for _ in range(max_len):
            decoder_input = torch.tensor([generated], dtype=torch.long, device=device)
            output = model(tensor_input, decoder_input)
            # Only the logits at the last position predict the next token
            next_id = output[0, -1].argmax(dim=-1).item()
            if next_id == EOS_IDX:
                break
            generated.append(next_id)

    translated_text = vocab_transform[TRG_LANGUAGE].lookup_tokens(generated[1:])
    return " ".join(translated_text)

# Test model
sample_text = "hello"
print(f"Translation: {translate_text(sample_text)}")


In [None]:
import streamlit as st
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
import os

# Define the modified code as a string
# The Streamlit app source is kept in a triple-double-quoted string and written
# to disk below, so the embedded code must not contain that delimiter itself.
modified_code = """
import streamlit as st
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
import os

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load and process dictionary dataset
def load_dictionary(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(" ", 1)
            if len(parts) == 2:
                english, sinhala = parts
                data.append((english, sinhala))
    return pd.DataFrame(data, columns=["English", "Sinhala"])

# Define paths
# Defaults to the location the training notebook reads the dictionary from;
# set EN_SI_DICT_PATH to point somewhere else when deploying.
file_path = os.environ.get("EN_SI_DICT_PATH", "/content/En-Si-dict-FastText-V2.txt")
df = load_dictionary(file_path)

# Define source and target languages
SRC_LANGUAGE = 'English'
TRG_LANGUAGE = 'Sinhala'

# Load model
def load_model(model_path):
    # The notebook saves model.state_dict(), so torch.load returns a plain
    # dict of tensors rather than an nn.Module; calling .eval() on it raises
    # AttributeError. translate_text below only performs dictionary lookups,
    # so a missing checkpoint is tolerated instead of crashing at start-up.
    if not os.path.exists(model_path):
        return None
    return torch.load(model_path, map_location=torch.device('cpu'))

model_path = "best_translator.pth"
model = load_model(model_path)

def translate_text(input_text):
    # Strip surrounding whitespace/newlines from the text area before the lookup
    return df.set_index("English").to_dict()["Sinhala"].get(input_text.strip(), "Translation not found")

# Streamlit UI with Sri Lankan background
def main():
    st.set_page_config(page_title="English ↔ Sinhala Translator", layout="centered")

    # Background image
    background_url = "https://upload.wikimedia.org/wikipedia/commons/2/2d/Sri_Lanka_landscape.jpg"
    st.image(background_url, use_column_width=True)

    st.title("🌍 English ↔ Sinhala Translator 🇱🇰")
    st.write("Enter an English or Sinhala word to translate.")

    user_input = st.text_area("Enter your text:", "")
    if st.button("Translate"):
        if user_input.strip():
            translated_text = translate_text(user_input)
            st.success(f"**Translation:** {translated_text}")
        else:
            st.warning("Please enter a word to translate.")

    st.write("---")
    st.subheader("How It Works")
    st.write("This web app uses a Transformer-based model to translate between English and Sinhala.")

if __name__ == "__main__":
    main()
"""

# Write the modified code to app.py
# The source contains non-ASCII characters (arrows, emoji), so pin the encoding
# instead of relying on the platform default.
with open('/content/app.py', 'w', encoding='utf-8') as f:
    f.write(modified_code)

print("Streamlit app has been saved as app.py in /content directory.")