In [1]:
!pip install torchtext==0.15.1



Defaulting to user installation because normal site-packages is not writeable
Collecting torchtext==0.15.1
  Downloading torchtext-0.15.1-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting torch==2.0.0 (from torchtext==0.15.1)
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchdata==0.6.0 (from torchtext==0.15.1)
  Downloading torchdata-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (919 bytes)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.0->torchtext==0.15.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.0->torchtext==0.15.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.0->torchtext==0.15.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.met

In [1]:
import torch
print(torch.__version__)


2.0.0+cu117


In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/home/jayesh-_-/Documents/questions.csv')

# Filter only the paraphrase pairs
paraphrases = df[df['is_duplicate'] == 1]

# Randomly sample 100,000 rows
paraphrases_sampled = paraphrases.sample(n=50000, random_state=42)

# Save the reduced dataset
paraphrases_sampled.to_csv("paraphrase_100k.csv", index=False)

print("Done");


Done


In [2]:
# Re-import necessary libraries
from transformers import AutoTokenizer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch

# Re-initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Re-load the dataset and filter for paraphrases
df = pd.read_csv('/home/jayesh-_-/Documents/questions.csv')
paraphrases = df[df['is_duplicate'] == 1]
paraphrases_sampled = paraphrases.sample(n=100000, random_state=42)
questions = list(zip(paraphrases_sampled['question1'], paraphrases_sampled['question2']))

# Define the Dataset class
class ParaphraseDataset(Dataset):
    def __init__(self, questions, tokenizer, max_len=32):
        self.questions = questions
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        q1, q2 = self.questions[idx]
        src = self.tokenizer.encode(q1, truncation=True, padding='max_length', max_length=self.max_len)
        tgt = self.tokenizer.encode(q2, truncation=True, padding='max_length', max_length=self.max_len)
        src_tensor = torch.tensor(src)
        tgt_tensor = torch.tensor(tgt)

        src_mask = (src_tensor != self.tokenizer.pad_token_id).unsqueeze(1)
        tgt_mask = (tgt_tensor != self.tokenizer.pad_token_id).unsqueeze(1)

        return {
            'src': src_tensor,
            'tgt': tgt_tensor,
            'src_mask': src_mask,
            'tgt_mask': tgt_mask
        }

# Create the dataset and dataloader
train_dataset = ParaphraseDataset(questions, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)


print("Done");



Done


In [3]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.d_v = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.fc_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # Linear transformations and split into num_heads
        Q = self.q_linear(query).view(batch_size, -1, self.num_heads, self.d_k)
        K = self.k_linear(key).view(batch_size, -1, self.num_heads, self.d_k)
        V = self.v_linear(value).view(batch_size, -1, self.num_heads, self.d_v)

        # Transpose to get dimensions batch_size x num_heads x seq_len x d_k
        Q = Q.transpose(1, 2)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        # Attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Softmax and apply dropout
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        # Weighted sum
        output = torch.matmul(attn, V)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v)

        # Final output linear transformation
        output = self.fc_out(output)
        return output


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, ff_dim, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, ff_dim)
        self.fc2 = nn.Linear(ff_dim, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, ff_dim, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFeedForward(d_model, ff_dim, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Multi-Head Attention block
        attn_output = self.attention(x, x, x, mask)
        x = self.layer_norm1(x + self.dropout1(attn_output))

        # Feed-Forward block
        ff_output = self.ffn(x)
        x = self.layer_norm2(x + self.dropout2(ff_output))

        return x


class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, ff_dim, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.attention1 = MultiHeadAttention(d_model, num_heads, dropout)
        self.attention2 = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFeedForward(d_model, ff_dim, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, encoder_output, tgt_mask, src_mask):
        # First Attention block (Self-attention for the decoder)
        attn_output1 = self.attention1(x, x, x, tgt_mask)
        x = self.layer_norm1(x + self.dropout1(attn_output1))

        # Second Attention block (Encoder-decoder attention)
        attn_output2 = self.attention2(x, encoder_output, encoder_output, src_mask)
        x = self.layer_norm2(x + self.dropout2(attn_output2))

        # Feed-Forward block
        ff_output = self.ffn(x)
        x = self.layer_norm3(x + self.dropout3(ff_output))

        return x


class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, ff_dim, num_layers, dropout=0.1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)
        self.position_encoding = self._get_position_encoding(d_model)

    def _get_position_encoding(self, d_model, max_len=512):
        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)

    def forward(self, x, mask):
        x = self.embedding(x) + self.position_encoding[:, :x.size(1)]
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, mask)

        return x


class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, ff_dim, num_layers, dropout=0.1):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.position_encoding = self._get_position_encoding(d_model)

    def _get_position_encoding(self, d_model, max_len=512):
        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)

    def forward(self, x, encoder_output, tgt_mask, src_mask):
        x = self.embedding(x) + self.position_encoding[:, :x.size(1)]
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, encoder_output, tgt_mask, src_mask)

        x = self.fc_out(x)
        return x


class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, ff_dim, num_layers, dropout=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(vocab_size, d_model, num_heads, ff_dim, num_layers, dropout)
        self.decoder = Decoder(vocab_size, d_model, num_heads, ff_dim, num_layers, dropout)

    def forward(self, src, tgt, src_mask, tgt_mask):
        encoder_output = self.encoder(src, src_mask)
        decoder_output = self.decoder(tgt, encoder_output, tgt_mask, src_mask)
        return decoder_output

print("Done");

Done


In [4]:
# Load your data
paraphrases_sampled = pd.read_csv('paraphrase_100k.csv')

# Create a list of question pairs
questions = list(zip(paraphrases_sampled['question1'], paraphrases_sampled['question2']))

# Create the dataset and dataloader
train_dataset = ParaphraseDataset(questions, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

print("Done");


Done


In [5]:
vocab_size = len(tokenizer)  # this will be the size of your vocabulary based on tokenizer
d_model = 512
num_heads = 8
ff_dim = 2048
num_layers = 6

# Initialize the Transformer model
model = Transformer(vocab_size, d_model, num_heads, ff_dim, num_layers)

print("Done");


Done


In [6]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)


In [7]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [8]:
import torch
from tqdm import tqdm  # Progress bar library

# Training loop
num_epochs = 5
batch_size = 32
patience = 3  # Number of epochs to wait for improvement in loss before stopping
best_loss = float('inf')  # Initial high value to compare with
epochs_no_improve = 0  # Counter for the number of epochs without improvement

# Set up tqdm progress bar for batches
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    # Create a tqdm progress bar for each epoch
    with tqdm(range(0, len(train_dataset), batch_size), desc=f"Epoch {epoch+1}/{num_epochs}") as pbar:
        for i in pbar:
            # Get the current batch
            batch = [train_dataset[j] for j in range(i, min(i + batch_size, len(train_dataset)))]

            # Stack the inputs
            src_batch = torch.stack([item['src'] for item in batch])
            tgt_batch = torch.stack([item['tgt'] for item in batch])

            # Create masks
            src_mask = (src_batch != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
            tgt_mask = (tgt_batch != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)

            # Forward pass
            optimizer.zero_grad()
            output = model(src_batch, tgt_batch, src_mask, tgt_mask)

            # Compute the loss (only compare to non-padding tokens)
            loss = criterion(output.view(-1, vocab_size), tgt_batch.view(-1))

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Update progress bar description with current loss
            pbar.set_postfix(loss=total_loss/(i+batch_size))

    avg_loss = total_loss / len(train_dataset)

    # Print average loss for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {avg_loss}")

    # Check for early stopping: If the loss hasn't improved for 'patience' epochs, stop training
    if avg_loss < best_loss:
        best_loss = avg_loss
        epochs_no_improve = 0  # Reset counter for no improvement
    else:
        epochs_no_improve += 1

    # Early stopping check
    if epochs_no_improve >= patience:
        print(f"Early stopping after {epoch+1} epochs due to no improvement in loss.")
        break


Epoch 1/5: 100%|███████████████| 1563/1563 [45:37<00:00,  1.75s/it, loss=0.0384]


Epoch 1/5, Avg Loss: 0.038373061949908735


Epoch 2/5: 100%|██████████████| 1563/1563 [46:43<00:00,  1.79s/it, loss=0.00836]


Epoch 2/5, Avg Loss: 0.0083619914598763


Epoch 3/5: 100%|██████████████| 1563/1563 [46:42<00:00,  1.79s/it, loss=0.00347]


Epoch 3/5, Avg Loss: 0.0034694616346247495


Epoch 4/5: 100%|██████████████| 1563/1563 [46:44<00:00,  1.79s/it, loss=0.00159]


Epoch 4/5, Avg Loss: 0.0015909034139662982


Epoch 5/5: 100%|█████████████| 1563/1563 [46:52<00:00,  1.80s/it, loss=0.000723]

Epoch 5/5, Avg Loss: 0.0007232237377832643





In [9]:
torch.save(model, 'TransformParaphraseFull.pt')
print("Full model saved successfully.")


Full model saved successfully.


In [14]:
!pip install sentencepiece


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[0mInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [4]:
import torch
import torch.nn as nn
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Your custom Transformer class
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.model = T5ForConditionalGeneration.from_pretrained('t5-small')

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

# Instantiate tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the model
model = torch.load('TransformParaphraseFull.pt', map_location=device)
model.to(device)
model.eval()


Transformer(
  (encoder): Encoder(
    (embedding): Embedding(30522, 512)
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (attention): MultiHeadAttention(
          (q_linear): Linear(in_features=512, out_features=512, bias=True)
          (k_linear): Linear(in_features=512, out_features=512, bias=True)
          (v_linear): Linear(in_features=512, out_features=512, bias=True)
          (fc_out): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): PositionwiseFeedForward(
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (layer_norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dr

In [10]:
# Define generation function
START_TOKEN = tokenizer.convert_tokens_to_ids("[CLS]") or 101
END_TOKEN = tokenizer.convert_tokens_to_ids("[SEP]") or 102

def generate_paraphrase(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    decoder_input_ids = torch.tensor([[START_TOKEN]]).to(device)  # Start token

    max_len = 50
    output_ids = []

    with torch.no_grad():
        for _ in range(max_len):
            output = model(input_ids, decoder_input_ids)
            logits = output[:, -1, :]
            next_token_id = torch.argmax(logits, dim=-1).item()

            if next_token_id == END_TOKEN:
                break

            output_ids.append(next_token_id)
            decoder_input_ids = torch.cat(
                [decoder_input_ids, torch.tensor([[next_token_id]]).to(device)], dim=1
            )

    return tokenizer.decode(output_ids, skip_special_tokens=True)


# Take user input
user_input = input("Enter a sentence to paraphrase: ")
paraphrased_output = generate_paraphrase(user_input)
print(f"\nParaphrased Output:\n{paraphrased_output}")


Enter a sentence to paraphrase:  what can make physics exam easy to learn ?


AttributeError: 'Transformer' object has no attribute 'model'

In [10]:
# Step 1: Convert your filtered DataFrame into (q1, q2) pairs
# Assumes columns are: 'question1', 'question2'

#questions = list(zip(df['question1'].fillna(""), df['question2'].fillna("")))


In [18]:
'''from torch.utils.data import DataLoader

# Step 2: Create dataset and dataloader
max_len = 32
batch_size = 32

dataset = ParaphraseDataset(questions, tokenizer, max_len=max_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)'''


In [19]:
'''def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        src = batch['src'].to(device)
        tgt = batch['tgt'].to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        src_mask = batch['src_mask'].to(device)
        tgt_mask = batch['tgt_mask'][:, :-1].to(device)

        optimizer.zero_grad()
        logits = model(src, tgt_input, src_mask, tgt_mask)

        loss = criterion(logits.view(-1, logits.size(-1)), tgt_output.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)'''

In [21]:
'''from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score

def evaluate(model, dataloader, tokenizer, device):
    model.eval()
    total_bleu = 0
    total_meteor = 0
    n = 0

    with torch.no_grad():
        for batch in dataloader:
            src = batch['src'].to(device)
            src_mask = batch['src_mask'].to(device)

            for i in range(src.size(0)):
                input_ids = src[i].unsqueeze(0)
                input_mask = src_mask[i].unsqueeze(0)

                generated_ids = generate_paraphrase(model, input_ids, input_mask, tokenizer, device)
                generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

                ref_text = tokenizer.decode(batch['tgt'][i], skip_special_tokens=True)

                total_bleu += sentence_bleu([ref_text.split()], generated_text.split())
                total_meteor += meteor_score([ref_text], generated_text)
                n += 1

    return total_bleu / n, total_meteor / n'''


In [22]:
'''def generate_paraphrase(model, src, src_mask, tokenizer, device, max_len=32):
    model.eval()
    tgt = torch.tensor([[tokenizer.cls_token_id]], device=device)  # Start with [CLS] or custom token

    for _ in range(max_len):
        tgt_mask = (tgt != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
        output = model(src, tgt, src_mask, tgt_mask)
        next_token = output[:, -1, :].argmax(-1).unsqueeze(1)
        tgt = torch.cat([tgt, next_token], dim=1)
        if next_token.item() == tokenizer.sep_token_id:
            break

    return tgt.squeeze(0)'''


In [23]:
'''import torch
import torch.optim as optim
import time

# Hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 5
learning_rate = 1e-4
batch_size = 32
max_len = 32

# Initialize model, tokenizer, and optimizer
vocab_size = len(tokenizer)
model = Transformer(vocab_size, d_model=512, num_heads=8, ff_dim=2048, num_layers=6, dropout=0.1).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

# Prepare the dataset and dataloaders
dataset = ParaphraseDataset(questions, tokenizer, max_len=max_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training Loop
best_bleu = 0
for epoch in range(epochs):
    start_time = time.time()

    # Train for one epoch
    train_loss = train_one_epoch(model, dataloader, optimizer, criterion, device)

    # Evaluate after training for the epoch
    bleu_score, meteor_score_avg = evaluate(model, dataloader, tokenizer, device)

    # Print epoch results
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Training Loss: {train_loss:.4f}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print(f"METEOR Score: {meteor_score_avg:.4f}")
    print(f"Time for epoch: {time.time() - start_time:.2f} seconds")

    # Save the best model based on BLEU score
    if bleu_score > best_bleu:
        best_bleu = bleu_score
        torch.save(model.state_dict(), 'best_model.pth')
        print("Model saved with better BLEU score!")

# Final model saved
print("Training completed!")
'''

RuntimeError: The size of tensor a (32) must match the size of tensor b (31) at non-singleton dimension 2