In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm

# Paths
train_path = "/kaggle/input/nlp-autosuggestion/preprocessed_train.csv"
val_path = "/kaggle/input/nlp-autosuggestion/prerocessed_validation.csv"

In [2]:
class HinglishDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_len=128, sample_size=10000):
        self.data = pd.read_csv(file_path).sample(sample_size)  # Take a subset of the data
        self.phrases = self.data['phrases'].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.phrases)

    def __getitem__(self, index):
        text = self.phrases[index]
        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return encoded.input_ids.squeeze(0), encoded.attention_mask.squeeze(0)

# Initialize Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Prepare DataLoaders
train_dataset = HinglishDataset(train_path, tokenizer, sample_size=2000)  # Reduced size
val_dataset = HinglishDataset(val_path, tokenizer, sample_size=500)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

In [3]:
# Define Model
class LSTMWithDistilBERT(nn.Module):
    def __init__(self, bert_model, hidden_size=128, num_layers=1, dropout=0.1):
        super(LSTMWithDistilBERT, self).__init__()
        self.bert = bert_model
        for param in self.bert.parameters():
            param.requires_grad = False  # Freeze BERT layers
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_size, self.bert.config.vocab_size)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        lstm_output, _ = self.lstm(bert_output)
        output = self.fc(lstm_output)
        return output

In [4]:
# Initialize Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
model = LSTMWithDistilBERT(bert_model).to(device)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]



In [5]:
# Training Setup
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=3e-5)

In [6]:
# Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [7]:
# Training Loop
def train_model(model, train_loader, val_loader, criterion, optimizer, scaler, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        loop = tqdm(train_loader, leave=True)
        for input_ids, attention_mask in loop:
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs.view(-1, outputs.size(-1)), input_ids.view(-1))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
            loop.set_description(f'Epoch {epoch + 1}')
            loop.set_postfix(loss=loss.item())

    # Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for input_ids, attention_mask in val_loader:
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            with torch.cuda.amp.autocast():
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs.view(-1, outputs.size(-1)), input_ids.view(-1))
            val_loss += loss.item()
        print(f'Validation Loss: {val_loss / len(val_loader):.4f}')

In [10]:
train_model(model, train_loader, val_loader, criterion, optimizer, scaler, epochs=20)

  with torch.cuda.amp.autocast():
Epoch 1: 100%|██████████| 63/63 [00:11<00:00,  5.70it/s, loss=3.84]
Epoch 2: 100%|██████████| 63/63 [00:11<00:00,  5.69it/s, loss=3.23]
Epoch 3: 100%|██████████| 63/63 [00:11<00:00,  5.70it/s, loss=2.65]
Epoch 4: 100%|██████████| 63/63 [00:11<00:00,  5.70it/s, loss=2.47]
Epoch 5: 100%|██████████| 63/63 [00:11<00:00,  5.69it/s, loss=2.02]
Epoch 6: 100%|██████████| 63/63 [00:11<00:00,  5.69it/s, loss=1.85]
Epoch 7: 100%|██████████| 63/63 [00:11<00:00,  5.69it/s, loss=1.52]
Epoch 8: 100%|██████████| 63/63 [00:11<00:00,  5.69it/s, loss=1.58]
Epoch 9: 100%|██████████| 63/63 [00:11<00:00,  5.69it/s, loss=1.28]
Epoch 10: 100%|██████████| 63/63 [00:11<00:00,  5.71it/s, loss=1.33]
Epoch 11: 100%|██████████| 63/63 [00:11<00:00,  5.70it/s, loss=1.19]
Epoch 12: 100%|██████████| 63/63 [00:11<00:00,  5.72it/s, loss=1.36]
Epoch 13: 100%|██████████| 63/63 [00:11<00:00,  5.70it/s, loss=1.16]
Epoch 14: 100%|██████████| 63/63 [00:11<00:00,  5.71it/s, loss=1.63]
Epoch 15:

Validation Loss: 1.7605


In [11]:
import torch

# Evaluation Function
def evaluate_model(model, val_loader, criterion):
    model.eval()
    val_loss = 0
    total_batches = 0
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(val_loader, desc="Evaluating"):
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            with torch.cuda.amp.autocast():
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs.view(-1, outputs.size(-1)), input_ids.view(-1))
            val_loss += loss.item()
            total_batches += 1
    avg_loss = val_loss / total_batches
    perplexity = torch.exp(torch.tensor(avg_loss))
    print(f"Validation Loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.4f}")
    return avg_loss, perplexity

# Save Model
def save_model(model, tokenizer, model_path="lstm_distilbert_model.pt", tokenizer_path="tokenizer"):
    torch.save(model.state_dict(), model_path)
    tokenizer.save_pretrained(tokenizer_path)
    print(f"Model saved to {model_path}")
    print(f"Tokenizer saved to {tokenizer_path}")

# Run Evaluation and Save Model
validation_loss, validation_perplexity = evaluate_model(model, val_loader, criterion)
save_model(model, tokenizer)


  with torch.cuda.amp.autocast():
Evaluating: 100%|██████████| 16/16 [00:01<00:00,  9.33it/s]


Validation Loss: 1.7605
Perplexity: 5.8154
Model saved to lstm_distilbert_model.pt
Tokenizer saved to tokenizer


In [12]:
# Load Tokenizer
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained("tokenizer")

# Load Model
model = LSTMWithDistilBERT(bert_model).to(device)
model.load_state_dict(torch.load("lstm_distilbert_model.pt"))
model.eval()

  model.load_state_dict(torch.load("lstm_distilbert_model.pt"))


LSTMWithDistilBERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

In [37]:
import torch.nn.functional as F

def predict_next_word_with_sampling(model, tokenizer, input_text, max_length=5, temperature=1.0):
    """
    Predict the next word(s) using sampling for variability.

    Parameters:
    - model: Trained model
    - tokenizer: Tokenizer for input/output
    - input_text: Starting text
    - max_length: Number of tokens to predict
    - temperature: Controls randomness in predictions; lower = deterministic

    Returns:
    - Generated text
    """
    model.eval()
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    attention_mask = torch.ones(input_ids.shape).to(device)

    predicted_text = input_text
    with torch.no_grad():
        for _ in range(max_length):
            with torch.amp.autocast(device_type="cuda"):  # Updated for warning
                outputs = model(input_ids, attention_mask)

            logits = outputs[:, -1, :] / temperature  # Apply temperature scaling
            probabilities = F.softmax(logits, dim=-1)

            # Use torch.multinomial for sampling
            predicted_token_id = torch.multinomial(probabilities, num_samples=1).squeeze(1)

            # Decode and append
            predicted_word = tokenizer.decode(predicted_token_id.item())  # Extract single token ID
            if predicted_word in tokenizer.all_special_tokens:
                break
            predicted_text += " " + predicted_word.strip()

            # Update input_ids for next prediction
            input_ids = torch.cat([input_ids, predicted_token_id.unsqueeze(0)], dim=1)
            attention_mask = torch.cat(
                [attention_mask, torch.ones((1, 1)).to(device)], dim=1
            )

    return predicted_text.strip()

# Example usage
input_text = "u"
predicted_text = predict_next_word_with_sampling(model, tokenizer, input_text, max_length=5, temperature=0.8)
print(f"Input Text: {input_text}")
print(f"Predicted Text: {predicted_text}")


Input Text: u
Predicted Text: u ky


In [41]:
input_text = "haa"
predicted_text = predict_next_word_with_sampling(model, tokenizer, input_text, max_length=5, temperature=0.8)
print(f"Input Text: {input_text}")
print(f"Predicted Text: {predicted_text}")

Input Text: haa
Predicted Text: haa хорошо


In [46]:
input_text = "par"
predicted_text = predict_next_word_with_sampling(model, tokenizer, input_text, max_length=5, temperature=0.8)
print(f"Input Text: {input_text}")
print(f"Predicted Text: {predicted_text}")

Input Text: par
Predicted Text: par ba


In [50]:
input_text = "ki"
predicted_text = predict_next_word_with_sampling(model, tokenizer, input_text, max_length=5, temperature=0.8)
print(f"Input Text: {input_text}")
print(f"Predicted Text: {predicted_text}")

Input Text: ki
Predicted Text: ki previous


In [61]:
input_text = "baje"
predicted_text = predict_next_word_with_sampling(model, tokenizer, input_text, max_length=5, temperature=0.8)
print(f"Input Text: {input_text}")
print(f"Predicted Text: {predicted_text}")

Input Text: baje
Predicted Text: baje районов


In [62]:
input_text = "raha"
predicted_text = predict_next_word_with_sampling(model, tokenizer, input_text, max_length=5, temperature=0.8)
print(f"Input Text: {input_text}")
print(f"Predicted Text: {predicted_text}")

Input Text: raha
Predicted Text: raha


In [63]:
input_text = "yaad"
predicted_text = predict_next_word_with_sampling(model, tokenizer, input_text, max_length=5, temperature=0.8)
print(f"Input Text: {input_text}")
print(f"Predicted Text: {predicted_text}")

Input Text: yaad
Predicted Text: yaad
