In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import ast
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load the preprocessed NYT articles. The columns used downstream are
# 'preprocessed_abstract', 'preprocessed_title' and 'preprocessed_keywords'.
# NOTE(review): absolute, machine-specific path -- point this at your local copy.
articles = pd.read_csv('E:/CS554 - NLP/final_project/NYT/NYT_Dataset_Preprocessed.csv')

# Keep only rows where all three text columns hold a non-empty string.
# Empty CSV fields are read back as NaN (a float), which a plain `!= ''` test lets
# through; a NaN keyword cell would then crash ast.literal_eval during keyword
# preprocessing, so the keyword column is type-checked as well.
text_columns = ['preprocessed_abstract', 'preprocessed_title', 'preprocessed_keywords']
has_text = articles[text_columns].apply(
    lambda column: column.map(lambda value: isinstance(value, str) and value != '')
).all(axis=1)

# .copy() detaches the filtered rows from the frame returned by read_csv, so adding
# columns later does not trigger SettingWithCopyWarning.
articles = articles[has_text].copy()

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jackh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jackh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# English stop-word list from NLTK, held as a set; read by preprocess_keywords_row
# when it filters tokens.
stop_words = set(stopwords.words('english'))

def preprocess_keywords_row(keyword_string):
    """Normalise one row of keywords into a single space-separated string.

    Each keyword is lower-cased, stripped of ASCII punctuation, tokenised with
    NLTK's ``word_tokenize``, filtered against the module-level ``stop_words``
    set and de-duplicated (first occurrence wins, order preserved). The cleaned
    keywords are then joined with single spaces.

    Args:
        keyword_string: String representation of a Python list of keywords,
            e.g. ``"['Politics', 'New York']"``. An already-parsed list/tuple
            is accepted too. Any other value (None, NaN, ...) and blank strings
            are treated as "no keywords".

    Returns:
        str: The processed keywords joined by spaces; ``''`` when there are none.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            non-blank string is not a valid Python literal.
    """
    if isinstance(keyword_string, str):
        if not keyword_string.strip():
            return ''
        # Convert string representation of list to actual list
        keywords = ast.literal_eval(keyword_string)
    elif isinstance(keyword_string, (list, tuple)):
        keywords = keyword_string
    else:
        # Missing cells (e.g. NaN read from an empty CSV field) carry no keywords.
        return ''

    # A bare string literal would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = [keywords]

    # Build the punctuation-stripping table once instead of once per keyword.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    processed_keywords = []
    for keyword in keywords:
        # Lowercase and remove punctuation before tokenising
        words = word_tokenize(str(keyword).lower().translate(strip_punctuation))

        # Remove stop words
        filtered_words = [word for word in words if word not in stop_words]

        # Remove duplicates within this keyword while maintaining order
        filtered_words = list(dict.fromkeys(filtered_words))

        processed_keywords.append(" ".join(filtered_words))

    return " ".join(processed_keywords)

In [3]:
# Turn each row's keyword list into plain, cleaned text
articles['processed_keywords'] = articles['preprocessed_keywords'].apply(preprocess_keywords_row)

# Combine abstracts and keywords: the model input is the abstract followed by its keywords
articles['input_text'] = articles['preprocessed_abstract'] + ' ' + articles['processed_keywords']

# Use SentenceTransformer to generate embeddings
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')  # A lightweight model for sentence embeddings

# Generate embeddings with one batched encode() call per column. Passing the whole
# list lets the library batch the texts internally instead of running one forward
# pass per row. Each result is a 2-D array with one row per article.
X = np.asarray(sentence_transformer.encode(articles['input_text'].tolist()))
y = np.asarray(sentence_transformer.encode(articles['preprocessed_title'].tolist()))

# Keep the per-row embedding columns on the frame (one 1-D vector per article)
articles['input_embeddings'] = list(X)
articles['title_embeddings'] = list(y)

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

class EmbeddingDataset(Dataset):
    """Paired (input, target) vectors exposed as a PyTorch ``Dataset``.

    Args:
        inputs: Array-like of input vectors, one row per sample.
        targets: Array-like of target vectors, one row per sample, in the same
            order as ``inputs``.

    Raises:
        ValueError: If ``inputs`` and ``targets`` contain a different number of rows.
    """

    def __init__(self, inputs, targets):
        # Both sides are copied into float32 tensors once, up front.
        self.inputs = torch.tensor(inputs, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)

        # __len__ only looks at the inputs, so a mismatch would otherwise either
        # silently drop targets or fail with an IndexError in the middle of an epoch.
        if len(self.inputs) != len(self.targets):
            raise ValueError(
                f"inputs and targets must have the same number of rows, "
                f"got {len(self.inputs)} and {len(self.targets)}"
            )

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.inputs[idx], self.targets[idx]

# Create PyTorch dataset from the input (X) and title (y) embedding matrices
dataset = EmbeddingDataset(X, y)
# Serve the pairs in mini-batches of 16, with shuffling enabled
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

class Seq2SeqModel(nn.Module):
    """Encoder LSTM, decoder LSTM and a linear head mapping one vector to another.

    Every sample is treated as a sequence of length 1: the encoder's final
    hidden state is fed to the decoder as its single input step, and the
    decoder output is projected to ``output_size``.

    NOTE(review): the decoder starts from its default (zero) initial state; the
    encoder's hidden/cell states are not passed as the decoder's initial state.

    Args:
        input_size: Dimensionality of each input vector.
        hidden_size: Hidden size shared by the encoder and decoder LSTMs.
        output_size: Dimensionality of each predicted vector.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.encoder = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of input vectors to a batch of output vectors.

        Args:
            x: Tensor of shape ``(batch_size, input_size)``.

        Returns:
            Tensor of shape ``(batch_size, output_size)``.
        """
        # Add sequence length dimension: (batch_size, seq_len=1, input_size)
        x = x.unsqueeze(1)

        # Only the final hidden state is used: (num_layers=1, batch_size, hidden_size)
        _, (hidden, _) = self.encoder(x)

        # Both LSTMs are batch_first, so swap the layer and batch axes to get
        # (batch_size, seq_len=1, hidden_size) as the decoder input.
        decoder_input = hidden.transpose(0, 1)

        # Decoder outputs: (batch_size, seq_len=1, hidden_size)
        decoder_outputs, _ = self.decoder(decoder_input)

        # Drop the seq_len axis and project: (batch_size, output_size)
        return self.fc(decoder_outputs.squeeze(1))

# Initialize model
# Seed PyTorch's global RNG first so the random weight initialisation (and anything
# else that later draws from the global RNG) is repeatable across fresh-kernel runs.
SEED = 42
torch.manual_seed(SEED)

input_size = X.shape[1]   # width of the input embeddings
hidden_size = 512
output_size = y.shape[1]  # width of the title embeddings
model = Seq2SeqModel(input_size, hidden_size, output_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Regress predicted embeddings onto title embeddings with mean squared error
criterion = nn.MSELoss()

In [5]:
# Training loop
epochs = 25
model.train()

num_samples = len(train_loader.dataset)

for epoch in range(epochs):
    total_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # criterion returns the mean over the batch; weight it by the batch size so
        # the (usually shorter) final batch is not over-represented in the epoch average.
        total_loss += loss.item() * inputs.size(0)

    # Per-sample mean loss over the whole epoch
    avg_loss = total_loss / num_samples
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

Epoch 1/25, Loss: 0.0019
Epoch 2/25, Loss: 0.0017
Epoch 3/25, Loss: 0.0016
Epoch 4/25, Loss: 0.0016
Epoch 5/25, Loss: 0.0016
Epoch 6/25, Loss: 0.0016
Epoch 7/25, Loss: 0.0016
Epoch 8/25, Loss: 0.0016
Epoch 9/25, Loss: 0.0016
Epoch 10/25, Loss: 0.0016
Epoch 11/25, Loss: 0.0016
Epoch 12/25, Loss: 0.0016
Epoch 13/25, Loss: 0.0016
Epoch 14/25, Loss: 0.0015
Epoch 15/25, Loss: 0.0015
Epoch 16/25, Loss: 0.0015
Epoch 17/25, Loss: 0.0015
Epoch 18/25, Loss: 0.0015
Epoch 19/25, Loss: 0.0015
Epoch 20/25, Loss: 0.0015
Epoch 21/25, Loss: 0.0015
Epoch 22/25, Loss: 0.0015
Epoch 23/25, Loss: 0.0015
Epoch 24/25, Loss: 0.0015
Epoch 25/25, Loss: 0.0015


In [6]:
# Generate predictions
# eval() switches the module to evaluation mode; no_grad() disables gradient
# tracking for the forward pass below.
model.eval()
with torch.no_grad():
    # One forward pass over the full input matrix X (all rows at once)
    predictions = model(torch.tensor(X, dtype=torch.float32))

In [7]:
from sentence_transformers.util import cos_sim

# Function to decode embeddings into text
def decode_embedding(embedding, reference_embeddings, original_titles):
    """
    Decodes an embedding by returning the title whose reference embedding is
    most similar to it (cosine similarity).

    Args:
        embedding: The embedding vector to decode.
        reference_embeddings: Embeddings of the candidate titles, one row per
            title, in the same order as ``original_titles``.
        original_titles: List of candidate titles to choose from.

    Returns:
        The entry of ``original_titles`` at the position of the most similar
        reference embedding.
    """
    # Compute cosine similarity between the embedding and reference embeddings
    similarities = cos_sim(embedding, reference_embeddings)

    # Position of the most similar reference. argmax() yields a 0-dim tensor, so
    # convert it to a plain int before using it as a list index.
    most_similar_idx = int(similarities.argmax())

    return original_titles[most_similar_idx]

In [8]:
original_titles = articles['preprocessed_title'].tolist()

# The titles were already embedded as `y` (same column, same row order, same model)
# when the training targets were built, so reuse that matrix instead of encoding
# every title a second time.
reference_embeddings = y  # Shape: (num_titles, embedding_dim)
assert len(reference_embeddings) == len(original_titles), "title embeddings out of sync with titles"

In [9]:
# Decode all predicted embeddings
# Predictions are scored against every reference title in chunks rather than with
# one cos_sim call per row: the reference matrix is converted to a tensor once, and
# each chunk is a single matrix product. The chunk size bounds the similarity
# matrix held in memory at (DECODE_CHUNK_SIZE, num_titles).
DECODE_CHUNK_SIZE = 256
reference_tensor = torch.as_tensor(reference_embeddings)

predicted_titles = []
for start in range(0, len(predictions), DECODE_CHUNK_SIZE):
    chunk_similarities = cos_sim(predictions[start:start + DECODE_CHUNK_SIZE], reference_tensor)
    # Row-wise argmax = index of the closest reference title for each prediction
    nearest_indices = chunk_similarities.argmax(dim=1).tolist()
    predicted_titles.extend(original_titles[idx] for idx in nearest_indices)

# Print sample predictions (at most five; slicing avoids an IndexError on small datasets)
for predicted_title, ground_truth_title in zip(predicted_titles[:5], original_titles[:5]):
    print(f"Predicted Title: {predicted_title}")
    print(f"Ground Truth Title: {ground_truth_title}")

Predicted Title: adroit envoy state case pakistan
Ground Truth Title: reversal pakistan welcome outside help inquiry bhutto
Predicted Title: election violence kenya
Ground Truth Title: fighting intensifies election kenya
Predicted Title: west bank 99 7 public land grant israel go settler
Ground Truth Title: israel olmert curb settlement
Predicted Title: gay marriage backer celebrate germany need hide
Ground Truth Title: gay muslim pack dance floor
Predicted Title: election night g iraq
Ground Truth Title: iraqi reveler embrace new year


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
from Levenshtein import distance as levenshtein_distance
from textstat import flesch_kincaid_grade

# Cosine similarity
def compute_cosine_similarity(gt_titles, pred_titles, sentence_transformer):
    """Cosine similarity between the embeddings of each (ground-truth, predicted) title pair.

    Titles are paired positionally; if the two sequences differ in length the
    extra items of the longer one are ignored.

    Args:
        gt_titles: Iterable of ground-truth title strings.
        pred_titles: Iterable of predicted title strings.
        sentence_transformer: Object with an ``encode(list_of_str)`` method that
            returns one embedding row per string.

    Returns:
        list[float]: One similarity per pair; ``[]`` when there are no pairs.
        A pair involving an all-zero embedding scores 0.0.
    """
    gt_list = list(gt_titles)
    pred_list = list(pred_titles)
    pair_count = min(len(gt_list), len(pred_list))
    if pair_count == 0:
        return []

    # One batched encode() per side instead of one model call per title.
    gt_embeddings = np.asarray(
        sentence_transformer.encode(gt_list[:pair_count]), dtype=np.float64
    )
    pred_embeddings = np.asarray(
        sentence_transformer.encode(pred_list[:pair_count]), dtype=np.float64
    )

    # Row-wise dot products and norms give every pair's cosine in one shot.
    dot_products = np.einsum('ij,ij->i', gt_embeddings, pred_embeddings)
    norm_products = np.linalg.norm(gt_embeddings, axis=1) * np.linalg.norm(pred_embeddings, axis=1)
    # Guard against division by zero: a zero vector has dot product 0, so the result is 0.
    norm_products[norm_products == 0] = 1.0

    return (dot_products / norm_products).tolist()

# Jaccard similarity
def compute_jaccard_similarity(gt_titles, pred_titles):
    """Word-level Jaccard similarity for each (ground-truth, predicted) title pair.

    Titles are split on whitespace into sets of unique words; the score is
    ``|intersection| / |union|``. A pair in which both titles have no words
    scores 0. Pairs are formed positionally.

    Returns:
        list: One score per pair.
    """
    def _pair_score(reference, candidate):
        reference_words = set(reference.split())
        candidate_words = set(candidate.split())
        all_words = reference_words | candidate_words
        if not all_words:
            return 0
        return len(reference_words & candidate_words) / len(all_words)

    return [_pair_score(gt, pred) for gt, pred in zip(gt_titles, pred_titles)]

# Levenshtein distance
def compute_levenshtein_distances(gt_titles, pred_titles):
    """Levenshtein distance for each (ground-truth, predicted) title pair.

    Pairs are formed positionally.

    Returns:
        list: One distance per pair.
    """
    distances = []
    for reference, candidate in zip(gt_titles, pred_titles):
        distances.append(levenshtein_distance(reference, candidate))
    return distances

# Flesch-Kincaid readability
def compute_flesch_kincaid_readability(titles):
    """Flesch-Kincaid grade (via ``flesch_kincaid_grade``) for each title.

    Returns:
        list: One grade per title, in input order.
    """
    grades = []
    for title in titles:
        grades.append(flesch_kincaid_grade(title))
    return grades

# Calculate metrics for every (ground-truth, predicted) title pair.
# `sentence_transformer` (the all-MiniLM-L6-v2 model) and `pd` are already in scope
# from the earlier cells, so the model is not imported and loaded a second time here.
cosine_similarities = compute_cosine_similarity(original_titles, predicted_titles, sentence_transformer)
jaccard_similarities = compute_jaccard_similarity(original_titles, predicted_titles)
levenshtein_distances = compute_levenshtein_distances(original_titles, predicted_titles)
flesch_kincaid_ground = compute_flesch_kincaid_readability(original_titles)
flesch_kincaid_predicted = compute_flesch_kincaid_readability(predicted_titles)

# Compile results: one row per title pair, one column per metric
results = {
    "Cosine Similarity": cosine_similarities,
    "Jaccard Similarity": jaccard_similarities,
    "Levenshtein Distance": levenshtein_distances,
    "Flesch-Kincaid Grade (Ground Truth)": flesch_kincaid_ground,
    "Flesch-Kincaid Grade (Predicted)": flesch_kincaid_predicted,
}

results_df = pd.DataFrame(results)
print(results_df)

        Cosine Similarity  Jaccard Similarity  Levenshtein Distance  \
0                0.546575            0.090909                    40   
1                0.875486            0.400000                    23   
2                0.563120            0.076923                    38   
3                0.320188            0.090909                    33   
4                0.416743            0.000000                    25   
...                   ...                 ...                   ...   
105878           0.315784            0.066667                    49   
105879           0.495784            0.142857                    28   
105880           0.383617            0.000000                    27   
105881           0.307145            0.000000                    43   
105882           0.348425            0.062500                    63   

        Flesch-Kincaid Grade (Ground Truth)  Flesch-Kincaid Grade (Predicted)  
0                                      10.7                        

In [13]:
# Preview the first rows of the per-title metrics table
results_df.head()

Unnamed: 0,Cosine Similarity,Jaccard Similarity,Levenshtein Distance,Flesch-Kincaid Grade (Ground Truth),Flesch-Kincaid Grade (Predicted)
0,0.546575,0.090909,40,10.7,2.9
1,0.875486,0.4,23,13.1,9.2
2,0.56312,0.076923,38,7.2,3.7
3,0.320188,0.090909,33,0.5,7.2
4,0.416743,0.0,25,5.2,1.3


In [12]:
# Mean of every metric column, unpacked in the order the columns are listed
(
    average_cosine,
    average_jaccard,
    average_levenshtein,
    average_readability_ground,
    average_readability_predicted,
) = (
    results_df[column].mean()
    for column in (
        "Cosine Similarity",
        "Jaccard Similarity",
        "Levenshtein Distance",
        "Flesch-Kincaid Grade (Ground Truth)",
        "Flesch-Kincaid Grade (Predicted)",
    )
)

# Report the averages, one line per metric
summary_lines = [
    "General Assessment of Model:",
    f"Average Cosine Similarity: {average_cosine:.4f}",
    f"Average Jaccard Similarity: {average_jaccard:.4f}",
    f"Average Levenshtein Distance: {average_levenshtein:.2f}",
    f"Average Readability (Ground Truth): {average_readability_ground:.2f}",
    f"Average Readability (Predicted): {average_readability_predicted:.2f}",
]
for line in summary_lines:
    print(line)

# Absolute difference between the two average readability grades
readability_gap = abs(average_readability_ground - average_readability_predicted)
print(f"Readability Gap: {readability_gap:.2f} (Lower is better)")

General Assessment of Model:
Average Cosine Similarity: 0.6049
Average Jaccard Similarity: 0.2923
Average Levenshtein Distance: 28.03
Average Readability (Ground Truth): 7.22
Average Readability (Predicted): 7.29
Readability Gap: 0.08 (Lower is better)
