<a href="https://colab.research.google.com/github/harshithamadarapu/Team16_Hinglish-Auto-suggestions/blob/main/DistilBERT_GPU_Limit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the datasets
train_df = pd.read_csv('/content/preprocessed_train.csv')
val_df = pd.read_csv('/content/prerocessed_validation (1).csv')

# Extract the `phrases` column
train_sentences = train_df['phrases'].tolist()
val_sentences = val_df['phrases'].tolist()

In [2]:
from transformers import DistilBertTokenizer
import random

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

def mask_text(sentences, tokenizer, mask_probability=0.15):
    inputs, labels = [], []
    for sentence in sentences:
        # Tokenize the sentence
        tokenized = tokenizer(sentence, truncation=True, padding="max_length", max_length=128, return_tensors="pt")

        # Create labels and randomly mask tokens
        input_ids = tokenized["input_ids"][0]
        label_ids = input_ids.clone()

        for i in range(len(input_ids)):
            if random.random() < mask_probability and input_ids[i] != tokenizer.pad_token_id:
                input_ids[i] = tokenizer.mask_token_id  # Replace token with [MASK]
            else:
                label_ids[i] = -100  # Ignore token for loss calculation

        inputs.append(input_ids)
        labels.append(label_ids)

    return {"input_ids": inputs, "labels": labels}

# Prepare datasets
train_data = mask_text(train_sentences, tokenizer)
val_data = mask_text(val_sentences, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

class HinglishDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs[idx],
            "labels": self.labels[idx],
        }


train_dataset = HinglishDataset(train_data["input_ids"], train_data["labels"])
val_dataset = HinglishDataset(val_data["input_ids"], val_data["labels"])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [4]:
from transformers import DistilBertForMaskedLM, AdamW
from transformers import get_scheduler
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

model = DistilBertForMaskedLM.from_pretrained('distilbert-base-multilingual-cased')

optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]



DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=

In [None]:
epochs = 3
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())


  0%|          | 0/11783 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 0: 100%|██████████| 11783/11783 [1:33:09<00:00,  2.11it/s, loss=0.751]
Epoch 1:   0%|          | 0/11783 [00:00<?, ?it/s]

In [None]:
model.save_pretrained('./hinglish_next_word_model')
tokenizer.save_pretrained('./hinglish_next_word_model')

In [None]:
from transformers import pipeline

fill_mask = pipeline('fill-mask', model='./hinglish_next_word_model', tokenizer='./hinglish_next_word_model')

# example
text = "Mujhe lagta hai ki [MASK]"
predictions = fill_mask(text)

for pred in predictions:
    print(f"Suggested word: {pred['token_str']} with score: {pred['score']}")

In [None]:
from transformers import pipeline

fill_mask = pipeline('fill-mask', model='./hinglish_next_word_model', tokenizer='./hinglish_next_word_model')

def predict_next_word():
    print("Hinglish Next-Word Prediction")
    print("Type a word or sentence to predict the next word.")
    print("Type 'exit' to quit.")

    while True:
        user_input = input("Enter a word or sentence: ").strip()
        if user_input.lower() == 'exit':
            print("Exiting. Goodbye!")
            break

        input_with_mask = f"{user_input} [MASK]"

        try:
            predictions = fill_mask(input_with_mask)
            print("\nPredicted next words:")
            for i, pred in enumerate(predictions):
                print(f"{i+1}. {pred['token_str']} (score: {pred['score']:.4f})")
        except Exception as e:
            print("Error during prediction:", e)
        print()

predict_next_word()