In [None]:
#Question1
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
from collections import defaultdict


def read_tsv(path):
    """Load transliteration pairs from a tab-separated file.

    Every non-blank line must contain exactly three tab-separated fields,
    read here as ``dev``, ``lat`` and ``freq``. Each line contributes the
    pair ``(lat, dev)`` repeated ``int(freq)`` times, so more frequent pairs
    appear more often in the returned list.

    Args:
        path: Path of the UTF-8 encoded TSV file.

    Returns:
        A list of ``(lat, dev)`` tuples.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields
            or its third field is not an integer.
    """
    data = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. extra newlines at the end of the file)
                # carry no record; unpacking them would raise ValueError.
                continue
            dev, lat, freq = line.split('\t')
            data.extend([(lat, dev)] * int(freq))
    return data


def build_vocab(sequences):
    """Map every distinct symbol found in ``sequences`` to an integer id.

    Ids 0, 1 and 2 are reserved for ``<pad>``, ``<sos>`` and ``<eos>``;
    all other symbols are numbered from 3 in order of first appearance.

    Args:
        sequences: Iterable of iterables of symbols (e.g. strings).

    Returns:
        A dict from symbol to id.
    """
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
    for symbol in (ch for seq in sequences for ch in seq):
        # setdefault keeps the id of a known symbol and gives an unseen one
        # the next free id (the current vocabulary size).
        vocab.setdefault(symbol, len(vocab))
    return vocab


class TransliterationDataset(Dataset):
    """Dataset of (source, target) sequence pairs served as id tensors.

    ``data`` is an indexable collection of 2-tuples; the first element is
    encoded with ``input_vocab``, the second with ``target_vocab``.
    """

    def __init__(self, data, input_vocab, target_vocab):
        self.data = data
        self.input_vocab = input_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        """Return the number of pairs."""
        return len(self.data)

    def encode_seq(self, seq, vocab, add_sos_eos=False):
        """Turn ``seq`` into a 1-D ``torch.long`` tensor of vocabulary ids.

        With ``add_sos_eos`` the ids are wrapped in ``<sos>`` ... ``<eos>``.
        A symbol missing from ``vocab`` raises ``KeyError``.
        """
        token_ids = [vocab[symbol] for symbol in seq]
        if add_sos_eos:
            token_ids = [vocab['<sos>'], *token_ids, vocab['<eos>']]
        return torch.tensor(token_ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(source_ids, target_ids)``; only the target is wrapped."""
        source_text, target_text = self.data[idx]
        source_ids = self.encode_seq(source_text, self.input_vocab)
        target_ids = self.encode_seq(target_text, self.target_vocab, True)
        return source_ids, target_ids


class Encoder(nn.Module):
    """Embed a source id sequence and run it through a recurrent network.

    Args:
        input_dim: Number of embedding rows (source vocabulary size).
        emb_dim: Embedding width.
        hid_dim: Hidden size of the recurrent layers.
        n_layers: Number of stacked recurrent layers.
        rnn_type: One of ``'rnn'``, ``'lstm'`` or ``'gru'``; any other value
            raises ``KeyError``.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, rnn_type='gru'):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        make_rnn = recurrent_layers[rnn_type]
        self.rnn = make_rnn(emb_dim, hid_dim, n_layers, batch_first=True)
        self.rnn_type = rnn_type

    def forward(self, src):
        """Return the recurrent network's final state for ``src``.

        ``src`` holds ids laid out batch-first. The per-step outputs are
        discarded; for an LSTM the returned state is the ``(h, c)`` tuple.
        """
        _, final_state = self.rnn(self.embedding(src))
        return final_state


class Decoder(nn.Module):
    """Single-step decoder: embedding, recurrent layers, linear projection.

    Args:
        output_dim: Target vocabulary size (embedding rows and logit width).
        emb_dim: Embedding width.
        hid_dim: Hidden size of the recurrent layers.
        n_layers: Number of stacked recurrent layers.
        rnn_type: One of ``'rnn'``, ``'lstm'`` or ``'gru'``; any other value
            raises ``KeyError``.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, rnn_type='gru'):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        make_rnn = recurrent_layers[rnn_type]
        self.rnn = make_rnn(emb_dim, hid_dim, n_layers, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.rnn_type = rnn_type

    def forward(self, input, hidden):
        """Decode one time step.

        ``input`` holds one token id per batch element; a length-1 time axis
        is inserted for the batch-first RNN and removed again before the
        projection. Returns ``(logits, new_hidden)``.
        """
        step_embedding = self.embedding(input.unsqueeze(1))
        rnn_out, hidden = self.rnn(step_embedding, hidden)
        logits = self.fc_out(rnn_out.squeeze(1))
        return logits, hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Callable mapping a source batch to the initial decoder state.
        decoder: Callable ``(input, hidden) -> (logits, hidden)`` that exposes
            ``fc_out.out_features`` as the logit width.
        device: Device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Stored so training/evaluation helpers can move batches to it.
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return logits of shape ``(batch, trg_len, output_dim)``.

        Position 0 of the time axis is never written and stays zero; the
        first decoder input is ``trg[:, 0]``. At every later step the next
        input is the ground-truth token with probability
        ``teacher_forcing_ratio``, otherwise the decoder's own argmax.
        """
        batch_size, trg_len = trg.shape
        vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        hidden = self.encoder(src)

        decoder_input = trg[:, 0]
        for step in range(1, trg_len):
            logits, hidden = self.decoder(decoder_input, hidden)
            outputs[:, step] = logits
            # Exactly one random draw per step decides the next input.
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[:, step]
            else:
                decoder_input = logits.argmax(1)

        return outputs

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `encoder` and `decoder` are not defined anywhere above this
# line in this cell - presumably they are Encoder(...)/Decoder(...) instances
# created in another cell. Confirm they exist (and were built with the final
# vocabulary sizes) before this runs, otherwise this line raises NameError.
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE) # Pass device to the Seq2Seq constructor

def train(model, data_loader, optimizer, criterion, clip=1):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Callable ``model(src, trg)`` returning logits with the time
            axis second; must expose ``device``, ``train()`` and
            ``parameters()``.
        data_loader: Sized iterable of ``(src, trg)`` tensor batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss taking ``(flat_logits, flat_targets)``.
        clip: Maximum gradient norm applied before each optimizer step.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for src, trg in data_loader:
        src = src.to(model.device)
        trg = trg.to(model.device)

        optimizer.zero_grad()
        logits = model(src, trg)

        # The first time step is dropped from both logits and targets
        # before flattening them for the loss.
        flat_logits = logits[:, 1:].reshape(-1, logits.shape[-1])
        flat_targets = trg[:, 1:].reshape(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(data_loader)


def accuracy(model, data_loader, trg_vocab):
    """Return the fraction of sequences predicted exactly right.

    The model is run with a teacher-forcing ratio of 0. A sequence counts as
    correct when the argmax prediction equals the target at every position
    after the first (the ``<sos>`` slot). Positions where the target is the
    padding id are ignored, so sequences shorter than the batch width are
    not penalised for whatever the model emits after ``<eos>``.

    Args:
        model: Callable ``model(src, trg, teacher_forcing_ratio)`` returning
            logits with the time axis second; must expose ``device`` and
            ``eval()``.
        data_loader: Iterable of ``(src, trg)`` tensor batches.
        trg_vocab: Target vocabulary; its ``'<pad>'`` entry, when present,
            gives the padding id to ignore. ``None`` or a vocabulary without
            ``'<pad>'`` disables the masking.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when there are no
        sequences at all.
    """
    model.eval()
    pad_idx = trg_vocab.get('<pad>') if trg_vocab else None
    correct, total = 0, 0
    with torch.no_grad():
        for src, trg in data_loader:
            src, trg = src.to(model.device), trg.to(model.device)
            output = model(src, trg, 0)
            preds = output.argmax(-1)[:, 1:]  # ignore <sos>
            gold = trg[:, 1:]
            matches = preds == gold
            if pad_idx is not None:
                # Padded target positions carry no information; treat them
                # as matching regardless of the prediction.
                matches = matches | (gold == pad_idx)
            correct += int(matches.all(dim=1).sum().item())
            total += gold.size(0)
    # An empty loader would otherwise divide by zero.
    return correct / total if total else 0.0


def predict(model, src_seq, input_vocab, output_vocab, max_len=30):
    """Greedily decode one source sequence into a target string.

    The source is encoded once, then the decoder is fed its own argmax
    prediction step by step, starting from ``<sos>``, until it emits
    ``<eos>`` or ``max_len`` symbols have been produced. The whole pass runs
    under ``torch.no_grad()`` so no autograd graph is built during inference.

    Args:
        model: Object exposing ``encoder``, ``decoder``, ``device`` and
            ``eval()``.
        src_seq: Iterable of source symbols; each must be a key of
            ``input_vocab`` (``KeyError`` otherwise).
        input_vocab: Mapping from source symbol to id.
        output_vocab: Mapping from target symbol to id; must contain
            ``'<sos>'``.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted symbols joined into one string (without ``<eos>``).
    """
    model.eval()
    inv_vocab = {v: k for k, v in output_vocab.items()}
    src_tensor = torch.tensor([input_vocab[c] for c in src_seq],
                              dtype=torch.long).unsqueeze(0).to(model.device)
    decoded = []
    with torch.no_grad():
        hidden = model.encoder(src_tensor)
        step_input = torch.tensor([output_vocab['<sos>']], device=model.device)
        for _ in range(max_len):
            out, hidden = model.decoder(step_input, hidden)
            step_input = out.argmax(1)  # greedy choice feeds the next step
            char = inv_vocab[step_input.item()]
            if char == '<eos>':
                break
            decoded.append(char)
    return ''.join(decoded)


# Assuming 'dev_path' is the path to your validation data
dev_path = "/content/hi.translit.sampled.dev.tsv"
val_data = read_tsv(dev_path)

# Build vocabularies
# NOTE(review): these vocabularies come from the validation pairs only;
# presumably the model is trained with vocabularies built from the training
# split - confirm both use the same mappings, otherwise ids will not line up.
input_vocab = build_vocab([d[0] for d in val_data])
target_vocab = build_vocab([d[1] for d in val_data])

# Create the validation dataset
val_dataset = TransliterationDataset(val_data, input_vocab, target_vocab)

# ... (Rest of your code: Model creation, training, etc.) ...

# Show a few predictions
# read_tsv repeats each pair by its frequency, so the first rows are often the
# same pair several times. dict.fromkeys keeps the first occurrence of every
# pair in order; slicing copes with fewer than five distinct pairs.
for src_sample, tgt_sample in list(dict.fromkeys(val_data))[:5]:
    pred = predict(model, src_sample, input_vocab, target_vocab)
    print(f"Input: {src_sample} | Target: {tgt_sample} | Predicted: {pred}")

Input: ankan | Target: अंकन | Predicted: <sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos>
Input: ankan | Target: अंकन | Predicted: <sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos>
Input: ankan | Target: अंकन | Predicted: <sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos>
Input: angkor | Target: अंगकोर | Predicted: <sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos>
Input: angkor | Target: अंगकोर | Predicted: <sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos><sos>


In [None]:
#Question2
import pandas as pd
import re
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, pipeline

# Read the two per-artist CSV files
khalid_df, gaga_df = [
    pd.read_csv(csv_path)
    for csv_path in ('/content/Khalid_chintu.csv', '/content/LadyGaga_chintu.csv')
]

# Stack both frames into a single lyrics table (original row labels are kept)
lyrics_df = pd.concat((khalid_df, gaga_df))

# Clean the lyrics
def clean_lyrics(lyric):
    """Normalise one raw lyric value to stripped, ASCII-only text.

    Missing values (``None``/NaN) become ``""``. Any other value is converted
    to ``str``, a leading run of ``#`` is removed, every run of curly single
    or double quotes is replaced by one ``'``, all remaining non-ASCII
    characters are dropped and surrounding whitespace is stripped.

    Args:
        lyric: Raw cell value from the lyrics table.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isna(lyric):
        return ""
    lyric = str(lyric)
    lyric = re.sub(r'^#+', '', lyric)  # remove leading hashes
    # No UTF-8 encode/decode round trip here: it is a no-op for valid text
    # and raises UnicodeEncodeError on lone surrogates. The non-ASCII filter
    # below already removes every unusual character.
    lyric = re.sub(r'[\u2018\u2019\u201c\u201d]+', "'", lyric)  # smart quotes to '
    lyric = re.sub(r'[^\x00-\x7F]+', '', lyric)  # remove non-ascii
    return lyric.strip()

# Apply cleaning and extract as list
lyrics_texts = lyrics_df['Lyric'].dropna().apply(clean_lyrics).tolist()

# Save cleaned lyrics to a text file
# One lyric per line with no blank separator lines: the line-based "text"
# loader used below yields one example per line, so blank lines (and lyrics
# that are empty after cleaning) would become empty, padding-only examples.
with open("lyrics_dataset.txt", "w", encoding="utf-8") as f:
    f.writelines(lyric + "\n" for lyric in lyrics_texts if lyric)

# Load dataset from text
dataset = load_dataset("text", data_files={"train": "lyrics_dataset.txt"})

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token; reuse EOS so padding is possible.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize data
def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(example["text"], **tokenizer_options)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Drop examples with fewer than two real tokens: after the causal shift they
# have no target left, and a batch made only of them has an undefined loss.
tokenized_dataset = tokenized_dataset.filter(
    lambda example: sum(example["attention_mask"]) >= 2
)


def _add_lm_labels(examples):
    """Copy input_ids to labels, setting padded positions to -100."""
    return {
        "labels": [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(examples["input_ids"], examples["attention_mask"])
        ]
    }


# The labels are the same as the input_ids in language modeling, except for
# padding: the pad token is EOS here and sequences are padded to a fixed
# length, so unmasked labels would make the loss mostly "predict padding".
# The model's loss ignores label positions set to -100.
tokenized_dataset = tokenized_dataset.map(_add_lm_labels, batched=True)

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define training args
training_args = TrainingArguments(
    output_dir="./gpt2-lyrics",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
    prediction_loss_only=True,
    report_to="none",  # 🚫 Disable W&B
    fp16=False
)


# Setup Trainer (the dataset already carries its labels at this point)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Generate text from trained model
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("I remember those nights when", max_length=100, num_return_sequences=1)[0]["generated_text"])

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(


Map:   0%|          | 0/918 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.0379
200,0.9006
300,0.8144
400,0.7617
500,0.7881
600,0.753
700,0.6766
800,0.6895
900,0.7616
1000,0.6397


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


I remember those nights when he cried me just you and me would dance and when we kissed our lips underneath the rainbow light 'cause we kissed and we mumbled we were still together i had a baby before you but i don't ever want another baby  pre and the love that i've worked so hard for in my life i won't ever get enough of it i'm on the road to a new life no other life is as good as a love life now   where does it go when
