
# Word2Vec and Seq2Seq Project

This notebook is designed to use a Word2Vec model to process textual data, followed by a sequence-to-sequence (Seq2Seq) neural network architecture.
The Seq2Seq model will be trained to predict article titles from abstracts, transforming each input sequence of word embeddings into a target sequence.


In [17]:
# Load the necessary libraries
import pandas as pd

# Load data (assuming a similar CSV file with 'abstract' and 'title' columns)
articles = pd.read_csv('../final_project/NYT/NYT_Dataset_Preprocessed.csv')

articles = articles[articles['preprocessed_abstract'] != '']
articles = articles[articles['preprocessed_title'] != '']
articles = articles[articles['preprocessed_keywords'] != '']

articles = articles[(articles['preprocessed_abstract'].apply(type) == str) &
                    (articles['preprocessed_title'].apply(type) == str)]

abstracts = articles['preprocessed_abstract']
titles = articles['preprocessed_title']
keywords = articles['preprocessed_keywords']

print("Data loaded successfully.")
print(f"Sample abstract: {abstracts.iloc[0]}")
print(f"Sample title: {titles.iloc[0]}")
print(f"Sample keywords: {keywords.iloc[0]}")

Data loaded successfully.
Sample abstract: pakistan ambassador u said government would endorse separate inquiry modeled one carried u n assassination rafik hariri lebanon 2005
Sample title: reversal pakistan welcome outside help inquiry bhutto
Sample keywords: ['assassination attempted assassination', 'pakistan', 'bhutto benazir', 'federal bureau investigation', 'united nation']


In [18]:
# Import necessary libraries for Word2Vec
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Tokenize abstracts for Word2Vec training
abstracts_tokenized = [word_tokenize(abstract.lower()) for abstract in abstracts]

# "Train" Word2Vec model
word2vec_model = Word2Vec(sentences=abstracts_tokenized, vector_size=100, window=5, min_count=2, workers=4)

# Example of retrieving word vectors
sample_word = "example"
if sample_word in word2vec_model.wv:
    print(f"Vector for '{sample_word}': {word2vec_model.wv[sample_word]}")
else:
    print(f"'{sample_word}' not in vocabulary.")

Vector for 'example': [-0.05913113  0.27169517  0.0590232  -0.13798061 -0.2831471  -0.6246296
 -0.27181587  0.74478906 -0.26388487  0.2583592  -0.06662034  0.11616062
 -0.6762518   0.19939935 -0.09089735 -0.15078533  0.2705024  -0.37845543
 -0.05315786 -0.54115     0.15202066  0.01698012  0.49734187  0.06917489
 -0.11187842 -0.03796555 -0.16971691 -0.50890833 -0.1737636   0.268011
  0.11165025 -0.11699247  0.17932595 -0.47495317  0.56669956  0.144165
  0.23882341  0.02605644  0.4227661  -0.3308594  -0.29068613  0.03540949
 -0.15051241  0.03239902  0.17510875 -0.32569832 -0.5801519  -0.25071952
 -0.29022154  0.0925482   0.01482133 -0.03597991 -0.2805173   0.2923417
  0.29262242  0.3938807   0.2625209   0.30393863 -0.09936269  0.04643556
  0.11571067  0.21628615 -0.7251391  -0.23639937 -0.13608964  0.36544597
  0.52631587  0.4848516  -0.14277932  0.31991404 -0.2088836  -0.25399607
 -0.31199712  0.05175032  0.16974089  0.12479095 -0.23159651  0.28848618
 -0.24661824 -0.00135885 -0.2442137

In [None]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        # Removed embedding layer as we are already working with precomputed embeddings
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

    def forward(self, src):
        # No need for embedding, just pass the input (already embeddings) to LSTM
        outputs, (hidden, cell) = self.lstm(src)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, trg, hidden, cell):
        embedded = self.embedding(trg)
        outputs, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        predictions = self.fc(outputs)
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        hidden, cell = self.encoder(src)
        outputs, _, _ = self.decoder(trg, hidden, cell)
        return outputs

In [20]:
import numpy as np
from collections import defaultdict

max_len = 50
word2vec_model = Word2Vec(vector_size=100, window=5, min_count=1)

In [None]:
# Look at this
def build_vocab(titles):
    word_freq = defaultdict(int)
    for title in titles:
        for word in word_tokenize(title.lower()):
            word_freq[word] += 1
    return {word: idx + 1 for idx, (word, _) in enumerate(word_freq.items())}

def encode_title(title, word2idx, max_len):
    tokens = [word2idx.get(word, 0) for word in word_tokenize(title.lower())]  # 0 for unknown words
    return tokens[:max_len] + [0] * (max_len - len(tokens))

def embed_text(text, word2vec, max_len):
    pad_vector = np.zeros(word2vec.vector_size)
    tokens = word_tokenize(text.lower())
    embeddings = [word2vec.wv[token] if token in word2vec.wv else pad_vector for token in tokens]
    if len(embeddings) < max_len:
        embeddings += [pad_vector] * (max_len - len(embeddings))  # Pad to max_len
    return np.array(embeddings[:max_len])

In [22]:
word2idx = build_vocab(titles)

embedded_abstracts = [embed_text(abstract, word2vec_model, max_len) for abstract in abstracts]
embedded_keywords = [embed_text(keyword, word2vec_model, max_len) for keyword in keywords]
encoded_titles = [encode_title(title, word2idx, max_len) for title in titles]

In [23]:
import torch

encoded_titles = torch.tensor(encoded_titles, dtype=torch.long)
embedded_abstracts = torch.tensor(embedded_abstracts, dtype=torch.float32)
embedded_keywords = torch.tensor(embedded_keywords, dtype=torch.float32)

# Print shapes for verification
print("Encoded titles shape:", encoded_titles.shape)  # (num_samples, max_len)
print("Embedded abstracts shape:", embedded_abstracts.shape)  # (num_samples, max_len, embedding_dim)
print("Embedded keywords shape:", embedded_keywords.shape)  # (num_samples, max_len, embedding_dim)

Encoded titles shape: torch.Size([105883, 50])
Embedded abstracts shape: torch.Size([105883, 50, 100])
Embedded keywords shape: torch.Size([105883, 50, 100])


In [24]:
# Define the sizes (replace these with your specific values)
input_size = 200  # Size of the input vocabulary
output_size = len(word2idx) + 1  # Size of the output vocabulary
hidden_size = 512  # Size of the hidden layer (common choice)

# Initialize the Encoder and Decoder
encoder = Encoder(input_size, hidden_size)
decoder = Decoder(output_size, hidden_size)

# Initialize the Seq2Seq model
seq2seq_model = Seq2Seq(encoder, decoder)

In [25]:
from sklearn.model_selection import train_test_split
import torch

# Concatenate embedded abstracts and keywords to form input data
input_data = torch.cat((embedded_abstracts, embedded_keywords), dim=-1)
indices = torch.arange(input_data.size(0))

# Split the data into train and test sets (80% train, 20% test)
train_inputs, test_inputs, train_titles, test_titles, train_indices, test_indices = train_test_split(input_data, encoded_titles, indices, test_size=0.2)

In [26]:
# Create idx_to_word dictionary for mapping indices back to words
idx_to_word = {idx: word for word, idx in word2idx.items()}

In [100]:
# Model parameters
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(seq2seq_model.parameters(), lr=0.001)

epochs = 2
batch_size = 32

# Calculate the number of batches based on batch size
num_batches = len(train_inputs) // batch_size

for epoch in range(epochs):
    epoch_loss = 0  # Track epoch loss
    
    for i in range(num_batches):
        # Get batch of data
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(train_inputs))
        
        # Select batch from tensors
        src = train_inputs[start_idx:end_idx]  # Abstract and Keyword embeddings
        trg = train_titles[start_idx:end_idx]  # Title encodings
        
        # Forward pass
        optimizer.zero_grad()
        output = seq2seq_model(src, trg)
        
        # Compute loss and backpropagation
        loss = criterion(output.view(-1, output_size), trg.view(-1))
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    # Print average loss for each epoch
    avg_epoch_loss = epoch_loss / num_batches
    print(f"Epoch {epoch+1}, Average Loss: {avg_epoch_loss:.8f}")

print("Training completed.")


Epoch 1, Average Loss: 0.15598492
Epoch 2, Average Loss: 0.01072013
Training completed.


In [35]:
# Set model to evaluation mode
seq2seq_model.eval()

batch_size = 32

# Calculate the number of batches based on batch size
num_batches = len(test_inputs) // batch_size

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(seq2seq_model.parameters(), lr=0.001)

# Initialize variables for tracking test loss and predictions
test_loss = 0
predicted_titles = []
ground_truth_titles = []

with torch.no_grad():
    num_test_batches = len(test_inputs) // batch_size
    
    for i in range(num_test_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(test_inputs))

        src = test_inputs[start_idx:end_idx]
        trg = test_titles[start_idx:end_idx]

        # Forward pass
        output = seq2seq_model(src, trg)

        # Compute loss
        loss = criterion(output.view(-1, output_size), trg.view(-1))
        test_loss += loss.item()

        # Decode predicted title tokens to words
        _, predicted_indices = output.max(dim=2)
        
        for j in range(batch_size):
            predicted_title = [idx_to_word[idx.item()] for idx in predicted_indices[j] if idx.item() != 0]
            predicted_titles.append(predicted_title)

            ground_truth_title = [idx_to_word[idx.item()] for idx in trg[j] if idx.item() != 0]
            ground_truth_titles.append(ground_truth_title)

# Calculate average test loss
avg_test_loss = test_loss / num_test_batches
print(f"Average Test Loss: {avg_test_loss:.8f}")

Average Test Loss: 0.00638555


In [48]:
for i in range(50):
    index = test_indices[i].item()

    print(f"Sample {i+1}")
    print("Ground Truth Title:", " ".join(ground_truth_titles[i]))
    print("Predicted Title:  ", " ".join(predicted_titles[i]))
    print("Abstract:  ", "".join(abstracts.iloc[index]))
    print("Keywords:  ", "".join(keywords.iloc[index]))
    print("-" * 50)


Sample 1
Ground Truth Title: myanmar election panel say aung san suu kyi party majority
Predicted Title:   myanmar election panel say aung san suu kyi party majority
Abstract:   news national league democracy taken 348 seat parliament reinforces rout military backed ruling party landmark election
Keywords:   ['myanmar', 'election', 'aung san suu kyi', 'thein sein', 'defense military force']
--------------------------------------------------
Sample 2
Ground Truth Title: never bernie voter threw biden changed primary
Predicted Title:   never bernie voter threw biden changed primary
Abstract:   bernie sander new hampshire nevada campaign hit roadblock wide range democrat would anything stop joe biden became vehicle
Keywords:   ['biden joseph r jr', 'sander bernard', 'democratic party', 'presidential election 2020', 'primary caucus', 'voting voter', 'poll public opinion', 'buttigieg pete 1982', 'warren elizabeth', 'united state politics government']
----------------------------------------

In [112]:
import torch

# Specify the path where you want to save the model
model_path = 'seq2seq_model.pth'

# Save the model's state_dict
torch.save({
    'model_state_dict': seq2seq_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),  # If you want to save the optimizer as well
}, model_path)

print(f"Model saved to {model_path}")

Model saved to seq2seq_model.pth


In [None]:
# Specify the path where the model was saved
model_path = 'seq2seq_model.pth'

# Initialize the model architecture
seq2seq_model = Seq2Seq(encoder, decoder)
optimizer = torch.optim.Adam(seq2seq_model.parameters())  # Initialize optimizer if saved

# Load the saved state_dict into the model and optimizer
checkpoint = torch.load(model_path)
seq2seq_model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Set the model to evaluation mode if you plan on evaluating
seq2seq_model.eval()
print("Model loaded and ready for evaluation.")

  checkpoint = torch.load(model_path)


Model loaded and ready for evaluation.
