In [1]:
# Colab-only: mount Google Drive at /content/drive, the root of the folder paths
# configured in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Corpus selector; interpolated into csv_file_path below.
dataset = 'manual' # 'mrpc','manual'
# Name of the CSV column that is read as the paraphrase target.
sentiment = 'positive' # 'positive', 'negative'

# Google Drive folders (these paths live under the /content/drive mount point).
folder_input_path = '/content/drive/My Drive/Colab Notebooks/5_Corpora/corpora/'
# NOTE(review): not referenced by any cell visible here — presumably meant for
# saving/loading model weights; confirm or remove.
folder_pretrained_path = '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/'
# File name of the corpus; appended to folder_input_path when the CSV is read.
csv_file_path = f'{dataset}-triplet-corpus.csv'

# Load and Preprocess the Data

In [4]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Load the corpus CSV selected in the configuration cell.
df = pd.read_csv(folder_input_path + csv_file_path)

# Simple preprocessing: pull out the two text columns as Python lists.
# The file is expected to have an 'original' column plus a column named by the
# `sentiment` variable; row i of each forms one (source, paraphrase) pair.
# NOTE(review): `positive_sentences` holds the 'negative' column when
# sentiment == 'negative' — the name reflects the default only.
# NOTE(review): empty CSV cells would come through as NaN (float) and break the
# later `.split()` calls — assumes the corpus has no missing values; confirm.
original_sentences = df['original'].tolist()
positive_sentences = df[sentiment].tolist()

# Source and target sentences share one vocabulary, so combine them for building it.
all_sentences = original_sentences + positive_sentences

# Build Vocabulary

In [11]:
def build_vocab(sentences, min_freq=1):
    """Build a whitespace-token vocabulary and its word-to-index mapping.

    Args:
        sentences: Iterable of strings; each one is tokenized with ``str.split()``.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A ``(vocab, word2idx)`` tuple. ``vocab`` is ``['<pad>', '<unk>']``
        followed by the kept tokens in sorted order, with no duplicates.
        ``word2idx`` maps every entry of ``vocab`` to its position, so
        ``'<pad>'`` is always 0 and ``'<unk>'`` is always 1.
    """
    special_tokens = ['<pad>', '<unk>']
    reserved = set(special_tokens)

    # Count word frequencies over all whitespace-separated tokens.
    word_freq = Counter(word for sentence in sentences for word in sentence.split())

    # Filter by frequency and sort. A literal '<pad>'/'<unk>' appearing in the
    # corpus is skipped here: adding it again would duplicate the entry in
    # `vocab` and move the special token away from index 0/1 in `word2idx`.
    kept_words = sorted(
        word
        for word, freq in word_freq.items()
        if freq >= min_freq and word not in reserved
    )

    vocab = special_tokens + kept_words
    word2idx = {word: idx for idx, word in enumerate(vocab)}

    return vocab, word2idx

# Build one vocabulary over source and target sentences together.
vocab, word2idx = build_vocab(all_sentences)
vocab_size = len(vocab)
# Bare expression so the notebook displays the vocabulary size as the cell output.
vocab_size

1040

# Create Dataset and DataLoader

In [12]:
class ParaphraseDataset(Dataset):
    """Index-encoded (source, target) sentence pairs padded to one common length.

    Both sentence lists are encoded once at construction time. Every item is
    right-padded with the ``'<pad>'`` index to the length of the longest
    encoded sentence across both lists, so all items have the same shape and
    can be stacked into batches by the default ``DataLoader`` collation.

    Args:
        original_sentences: Source sentences (strings, whitespace-tokenized).
        positive_sentences: Target sentences; item ``i`` is paired with
            ``original_sentences[i]``. ``len()`` of the dataset follows
            ``original_sentences``, so the two lists are expected to be
            parallel.
        word2idx: Token-to-index mapping containing ``'<pad>'`` and ``'<unk>'``.
    """

    def __init__(self, original_sentences, positive_sentences, word2idx):
        self.original_sentences = [self.encode(sentence, word2idx) for sentence in original_sentences]
        self.positive_sentences = [self.encode(sentence, word2idx) for sentence in positive_sentences]
        # Remember the pad index from the mapping that was passed in, so padding
        # does not depend on a module-level `word2idx` being defined.
        self.pad_idx = word2idx['<pad>']
        # Computed once here instead of rescanning the whole corpus on every
        # __getitem__ call. `default=0` keeps an empty dataset constructible.
        self.max_len = max(
            (len(seq) for seq in self.original_sentences + self.positive_sentences),
            default=0,
        )

    def encode(self, sentence, word2idx):
        """Map each whitespace token of ``sentence`` to its index (``'<unk>'`` if unseen)."""
        return [word2idx.get(word, word2idx['<unk>']) for word in sentence.split()]

    def pad_sequence(self, sequence):
        """Right-pad every index list in ``sequence`` to ``self.max_len``.

        Lists already at or beyond ``self.max_len`` are returned unchanged.
        """
        return [seq + [self.pad_idx] * (self.max_len - len(seq)) for seq in sequence]

    def __getitem__(self, index):
        """Return the padded ``(source, target)`` pair at ``index`` as long tensors."""
        source, target = self.pad_sequence(
            [self.original_sentences[index], self.positive_sentences[index]]
        )
        # Explicit dtype: an all-empty corpus would otherwise yield float tensors.
        return (
            torch.tensor(source, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

    def __len__(self):
        """Number of pairs, taken from the source sentence list."""
        return len(self.original_sentences)

# NOTE(review): this rebinds `dataset`, which the configuration cell set to the
# corpus-name string used to build csv_file_path. Re-running cells out of order
# can therefore leave `dataset` as either a str or a ParaphraseDataset.
dataset = ParaphraseDataset(original_sentences, positive_sentences, word2idx)
# Shuffled batches of up to 32 (source, target) tensor pairs.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [17]:
class Seq2Seq(nn.Module):
    """Embedding -> LSTM -> linear projection, applied position by position.

    The LSTM output at every time step is projected to vocabulary logits, so
    the model emits exactly one prediction per input position (the output
    sequence length always equals the input sequence length).

    Args:
        input_dim: Size of the input vocabulary (number of embedding rows).
        output_dim: Size of the output vocabulary (number of logits per position).
        emb_dim: Embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when
            ``n_layers > 1``, between LSTM layers.
    """

    def __init__(self, input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout=0.1):
        super().__init__()
        self.encoder = nn.Embedding(input_dim, emb_dim)
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=rnn_dropout, batch_first=True)
        self.decoder = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Return logits of shape ``[batch, seq_len, output_dim]`` for index tensor ``src``.

        ``src`` is a ``[batch, seq_len]`` tensor of token indices (the LSTM is
        built with ``batch_first=True``).
        """
        embedded = self.dropout(self.encoder(src))
        # The final (hidden, cell) state is not needed: every time step is projected.
        outputs, _ = self.rnn(embedded)
        return self.decoder(outputs)

# Example usage
# Source and target sentences share one vocabulary, so both sizes are vocab_size.
input_dim = vocab_size  # Size of the input vocabulary
output_dim = vocab_size # Size of the output vocabulary
emb_dim = 256      # Size of the embeddings
hid_dim = 512      # Size of the hidden layers
n_layers = 2       # Number of layers in the RNN

# dropout is left at the class default (0.1).
model = Seq2Seq(input_dim, output_dim, emb_dim, hid_dim, n_layers)

# Training

In [20]:
# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())
# Target positions holding the <pad> index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<pad>'])

num_epochs = 50

for epoch in range(num_epochs):
    model.train()  # enable dropout (generate_paraphrase switches the model to eval mode)
    total_loss = 0
    for original, positive in dataloader:
        optimizer.zero_grad()
        output = model(original)  # Forward pass: [batch_size, seq_len, output_dim]

        # CrossEntropyLoss takes [N, num_classes] logits and [N] class targets,
        # so fold the batch and time axes together. The class count is read
        # from the tensor itself rather than rebinding the global `output_dim`.
        output = output.reshape(-1, output.shape[-1])  # [batch_size * seq_len, output_dim]
        positive = positive.reshape(-1)  # [batch_size * seq_len]

        loss = criterion(output, positive)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')

Epoch 1, Loss: 0.42290156015328
Epoch 2, Loss: 0.3658991924354008
Epoch 3, Loss: 0.2777726650238037
Epoch 4, Loss: 0.23327209055423737
Epoch 5, Loss: 0.20547428088528769
Epoch 6, Loss: 0.1854569081749235
Epoch 7, Loss: 0.1674469666821616
Epoch 8, Loss: 0.1586049965449742
Epoch 9, Loss: 0.14113516360521317
Epoch 10, Loss: 0.13711594258035933
Epoch 11, Loss: 0.12984411524874823
Epoch 12, Loss: 0.1260456389614514
Epoch 13, Loss: 0.12606431863137654
Epoch 14, Loss: 0.1192490713936942
Epoch 15, Loss: 0.11300882803542274
Epoch 16, Loss: 0.11184119752475194
Epoch 17, Loss: 0.11227015405893326
Epoch 18, Loss: 0.11348794187818255
Epoch 19, Loss: 0.1080552414059639
Epoch 20, Loss: 0.11902696426425662
Epoch 21, Loss: 0.12472308001347951
Epoch 22, Loss: 0.12079328830753054
Epoch 23, Loss: 0.10941466582672936
Epoch 24, Loss: 0.1070938626570361
Epoch 25, Loss: 0.1043108164199761
Epoch 26, Loss: 0.09549398081643241
Epoch 27, Loss: 0.09559263927595955
Epoch 28, Loss: 0.09538963969264712
Epoch 29, Loss

In [21]:
def generate_paraphrase(sentence, word2idx, idx2word, model, max_length=50):
    """Run ``model`` on ``sentence`` and decode its per-position predictions.

    The sentence is split on whitespace, mapped to indices (unknown words
    become ``'<unk>'``), passed through the model as a batch of one, and the
    highest-scoring index at each position is mapped back to a word.
    Positions predicted as ``'<pad>'`` are dropped from the result.

    Args:
        sentence: Input text.
        word2idx: Token-to-index mapping containing ``'<pad>'`` and ``'<unk>'``.
        idx2word: Inverse mapping from index to token.
        model: Callable module returning ``[batch, seq_len, vocab]`` scores;
            it is switched to eval mode and left there.
        max_length: Accepted for interface compatibility but not applied —
            the output has at most one word per input token.

    Returns:
        The decoded words joined by single spaces; ``''`` when ``sentence``
        contains no tokens.
    """
    model.eval()
    tokens = [word2idx.get(word, word2idx['<unk>']) for word in sentence.split()]
    if not tokens:
        # An empty list would become an empty *float* tensor, which an
        # embedding lookup rejects; there is nothing to paraphrase anyway.
        return ''

    tokens_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        output = model(tokens_tensor)

    # Highest-scoring vocabulary index at each position of the single batch item.
    predicted_indices = output.argmax(2)[0].tolist()
    pad_idx = word2idx['<pad>']
    return ' '.join(idx2word[idx] for idx in predicted_indices if idx != pad_idx)

# Example usage
# Tokenization is a plain whitespace split, so "paraphrase." (with the period)
# is looked up as a single token.
sentence = "This is an example sentence to paraphrase."
idx2word = {idx: word for word, idx in word2idx.items()}  # Create inverse mapping
# max_length is left at its default.
paraphrased_sentence = generate_paraphrase(sentence, word2idx, idx2word, model)
print(paraphrased_sentence)


This is a a a level of
