
# Word2Vec and Seq2Seq Project

This notebook is designed to use a Word2Vec model to process textual data, followed by a sequence-to-sequence (Seq2Seq) neural network architecture.
The Seq2Seq model will be trained to predict article titles from abstracts, transforming each input sequence of word embeddings into a target sequence.


In [1]:
# Load the necessary libraries
import pandas as pd

# Load data (assuming a similar CSV file with 'abstract' and 'title' columns)
articles = pd.read_csv('../final_project/NYT/NYT_Dataset_Preprocessed.csv')

articles = articles[articles['preprocessed_abstract'] != '']
articles = articles[articles['preprocessed_title'] != '']
articles = articles[articles['preprocessed_keywords'] != '']

articles = articles[(articles['preprocessed_abstract'].apply(type) == str) &
                    (articles['preprocessed_title'].apply(type) == str)]

abstracts = articles['preprocessed_abstract']
titles = articles['preprocessed_title']
keywords = articles['preprocessed_keywords']

print("Data loaded successfully.")
print(f"Sample abstract: {abstracts.iloc[0]}")
print(f"Sample title: {titles.iloc[0]}")
print(f"Sample keywords: {keywords.iloc[0]}")

Data loaded successfully.
Sample abstract: pakistan ambassador u said government would endorse separate inquiry modeled one carried u n assassination rafik hariri lebanon 2005
Sample title: reversal pakistan welcome outside help inquiry bhutto
Sample keywords: ['assassination attempted assassination', 'pakistan', 'bhutto benazir', 'federal bureau investigation', 'united nation']


In [2]:
# Import necessary libraries for Word2Vec
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Tokenize abstracts for Word2Vec training
abstracts_tokenized = [word_tokenize(abstract.lower()) for abstract in abstracts]

# "Train" Word2Vec model
word2vec_model = Word2Vec(sentences=abstracts_tokenized, vector_size=100, window=5, min_count=2, workers=4)

In [3]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        # Removed embedding layer as we are already working with precomputed embeddings
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

    def forward(self, src):
        # No need for embedding, just pass the input (already embeddings) to LSTM
        outputs, (hidden, cell) = self.lstm(src)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, trg, hidden, cell):
        # trg is expected to already be embeddings, so skip nn.Embedding
        print(f"Input to LSTM (trg): {trg.shape}")  # Debug shape
        outputs, (hidden, cell) = self.lstm(trg, (hidden, cell))
        predictions = self.fc(outputs)
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        print(f"in foward: src shape = {src.shape}, trg shape = {trg.shape}, trg.dim = {trg.dim()}, src.dim = {src.dim()}")
        hidden, cell = self.encoder(src)
        outputs, _, _ = self.decoder(trg, hidden, cell)
        return outputs

In [4]:
import numpy as np
from collections import defaultdict

max_len = 50
word2vec_model = Word2Vec(vector_size=100, window=5, min_count=1)

In [5]:
def embed_text(text, word2vec, max_len):
    pad_vector = np.zeros(word2vec.vector_size)
    tokens = word_tokenize(text.lower())
    embeddings = [word2vec.wv[token] if token in word2vec.wv else pad_vector for token in tokens]
    if len(embeddings) < max_len:
        embeddings += [pad_vector] * (max_len - len(embeddings))  # Pad to max_len
    return np.array(embeddings[:max_len])

In [6]:
embedded_abstracts = [embed_text(abstract, word2vec_model, max_len) for abstract in abstracts]
embedded_keywords = [embed_text(keyword, word2vec_model, max_len) for keyword in keywords]
embedded_titles = [embed_text(title, word2vec_model, max_len) for title in titles]

In [7]:
import torch

embedded_titles = torch.tensor(embedded_titles, dtype=torch.long)
embedded_abstracts = torch.tensor(embedded_abstracts, dtype=torch.float32)
embedded_keywords = torch.tensor(embedded_keywords, dtype=torch.float32)

# Print shapes for verification
print("Embedded titles shape:", embedded_titles.shape)  # (num_samples, max_len)
print("Embedded abstracts shape:", embedded_abstracts.shape)  # (num_samples, max_len, embedding_dim)
print("Embedded keywords shape:", embedded_keywords.shape)  # (num_samples, max_len, embedding_dim)

Embedded titles shape: torch.Size([105883, 50, 100])
Embedded abstracts shape: torch.Size([105883, 50, 100])
Embedded keywords shape: torch.Size([105883, 50, 100])


  embedded_titles = torch.tensor(embedded_titles, dtype=torch.long)
  embedded_titles = torch.tensor(embedded_titles, dtype=torch.long)


In [8]:
# Look at this
def build_vocab(texts, dictionary):
    for text in texts:
        for word in word_tokenize(text.lower()):
            dictionary[word] += 1
    return dictionary
    # return {word: idx + 1 for idx, (word, _) in enumerate(dictionary.items())}

In [None]:
word_freq = defaultdict(int)

word_freq = build_vocab(abstracts, word_freq)
word_freq = build_vocab(titles, word_freq)
word_freq = build_vocab(keywords, word_freq)
word2idx = {word: idx + 1 for idx, (word, _) in enumerate(word_freq.items())}

# Define the sizes (replace these with your specific values)
input_size = 200  # Size of the input vocabulary
output_size = len(word2idx) + 1  # Size of the output vocabulary
hidden_size = 512  # Size of the hidden layer (common choice)

# Initialize the Encoder and Decoder
encoder = Encoder(input_size, hidden_size)
decoder = Decoder(output_size, hidden_size)

# Initialize the Seq2Seq model
seq2seq_model = Seq2Seq(encoder, decoder)

In [None]:
from sklearn.model_selection import train_test_split
import torch

# Concatenate embedded abstracts and keywords to form input data
input_data = torch.cat((embedded_abstracts, embedded_keywords), dim=-1)
indices = torch.arange(input_data.size(0))

# Split the data into train and test sets (80% train, 20% test)
train_inputs, test_inputs, train_titles, test_titles, train_indices, test_indices = train_test_split(input_data, embedded_titles, indices, test_size=0.2)

In [None]:
# Model parameters
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(seq2seq_model.parameters(), lr=0.001)

epochs = 2
batch_size = 32

# Calculate the number of batches based on batch size
num_batches = len(train_inputs) // batch_size

for epoch in range(epochs):
    epoch_loss = 0  # Track epoch loss
    
    for i in range(num_batches):
        # Get batch of data
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(train_inputs))
        
        # Select batch from tensors
        src = train_inputs[start_idx:end_idx, :, :]  # Abstract and Keyword embeddings
        print(train_titles.shape)
        trg = train_titles[start_idx:end_idx, :, :]  # Title embedding

        # Debugging: Print shapes before reshaping
        print(f"Before permute: src shape = {src.shape}, trg shape = {trg.shape}, trg.dim = {trg.dim()}, src.dim = {src.dim()}")

        # Reshape target for decoder
        trg = trg.permute(1, 0, 2)  # Now (seq_len, batch_size, embedding_dim)

        # Debugging: Print shapes after reshaping
        print(f"After permute: src shape = {src.shape}, trg shape = {trg.shape}, trg.dim = {trg.dim()}, src.dim = {src.dim()}")
        
        # Forward pass
        optimizer.zero_grad()
        output = seq2seq_model(src, trg)
        
        # Compute loss and backpropagation
        loss = criterion(output.view(-1, output_size), trg.view(-1))
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    # Print average loss for each epoch
    avg_epoch_loss = epoch_loss / num_batches
    print(f"Epoch {epoch+1}, Average Loss: {avg_epoch_loss:.8f}")

print("Training completed.")

In [None]:
def decode_predictions(predicted_vectors, word2vec_model):
    words = []
    for vector in predicted_vectors:
        # Find the closest word in the embedding space
        closest_word = word2vec_model.wv.similar_by_vector(vector, topn=1)[0][0]
        words.append(closest_word)
    return words

def decode_ground_truth(trg_embeddings, word2vec_model):
    words = []
    for vector in trg_embeddings:
        closest_word = word2vec_model.wv.similar_by_vector(vector, topn=1)[0][0]
        words.append(closest_word)
    return words

In [None]:
# Set model to evaluation mode
seq2seq_model.eval()

batch_size = 32

# Calculate the number of batches based on batch size
num_batches = len(test_inputs) // batch_size

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(seq2seq_model.parameters(), lr=0.001)

# Initialize variables for tracking test loss and predictions
test_loss = 0
predicted_titles = []
ground_truth_titles = []

with torch.no_grad():
    num_test_batches = len(test_inputs) // batch_size
    
    for i in range(num_test_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(test_inputs))

        src = test_inputs[start_idx:end_idx]
        trg = test_titles[start_idx:end_idx]

        # Forward pass
        output = seq2seq_model(src, trg)

        # Compute loss
        loss = criterion(output.view(-1, output_size), trg.view(-1))
        test_loss += loss.item()

        # Decode predicted title vectors to words
        for j in range(batch_size):
            predicted_vectors = output[j].cpu().detach().numpy()
            predicted_title = decode_predictions(predicted_vectors, word2vec_model)
            predicted_titles.append(predicted_title)

            # Decode ground truth titles to words
            ground_truth_vectors = trg[j].cpu().detach().numpy()
            ground_truth_title = decode_ground_truth(ground_truth_vectors, word2vec_model)
            ground_truth_titles.append(ground_truth_title)

# Calculate average test loss
avg_test_loss = test_loss / num_test_batches
print(f"Average Test Loss: {avg_test_loss:.8f}")

In [None]:
for i in range(50):
    index = test_indices[i].item()

    print(f"Sample {i+1}")
    print("Ground Truth Title:", " ".join(ground_truth_titles[i]))
    print("Predicted Title:  ", " ".join(predicted_titles[i]))
    print("Abstract:  ", "".join(abstracts.iloc[index]))
    print("Keywords:  ", "".join(keywords.iloc[index]))
    print("-" * 50)


In [None]:
import torch

# Specify the path where you want to save the model
model_path = 'seq2seq_model.pth'

# Save the model's state_dict
torch.save({
    'model_state_dict': seq2seq_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),  # If you want to save the optimizer as well
}, model_path)

print(f"Model saved to {model_path}")

In [None]:
# Specify the path where the model was saved
model_path = 'seq2seq_model.pth'

# Initialize the model architecture
seq2seq_model = Seq2Seq(encoder, decoder)
optimizer = torch.optim.Adam(seq2seq_model.parameters())  # Initialize optimizer if saved

# Load the saved state_dict into the model and optimizer
checkpoint = torch.load(model_path)
seq2seq_model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Set the model to evaluation mode if you plan on evaluating
seq2seq_model.eval()
print("Model loaded and ready for evaluation.")