In [118]:
import numpy as np
import sys
import json
import random
# from sklearn.preprocessing import StandardScaler
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from collections import Counter
from nltk.tree import Tree
from collections import defaultdict
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import math 
from rouge_metric import PyRouge
import tqdm
    
class RougeEvaluator:

    def __init__(self) -> None:
        self.rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=False, rouge_s=False, rouge_su=False)

    def batch_score(self, gen_summaries, reference_summaries):
        score = self.rouge.evaluate(gen_summaries, [[x] for x in reference_summaries])
        return score
    
    def score(self, gen_summary, reference_summary):
        score = self.rouge.evaluate([gen_summary], [[reference_summary]])
        return score

def evaluating_validation_set():
    evaluator = RougeEvaluator()
    
    with open(f"../data/validation.json", 'r') as f:
        eval_data = json.load(f)

    with open("val_preds.json", 'r') as f:
        pred_data = json.load(f)

    assert len(eval_data) == len(pred_data)

    pred_sums = []
    eval_sums = []
    for eval, pred in tqdm.tqdm(zip(eval_data, pred_data), total=len(eval_data)):
        pred_sums.append(pred['summary'])
        eval_sums.append(eval['summary'])

    scores = evaluator.batch_score(pred_sums, eval_sums)
    return scores['rouge-1']["f"]

In [119]:
# Word2Vec model
model = api.load("word2vec-google-news-300")

def preprocess(X):
    """
    X: list of list of sentences (i.e., comprising an article)
    """
    split_articles = [[s.strip() for s in x.split('.')] for i, x in enumerate(X)]
    return split_articles

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def shuffle_dataframe(df):
    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1).reset_index(drop=True)
    
    # Split the DataFrame back into X and y
    X_shuffled = df_shuffled.drop('target', axis=1)
    y_shuffled = df_shuffled['target']
    
    return X_shuffled, y_shuffled




### See feature_engineering for the generation of data
data_train = pd.read_csv("train_processed.csv")
# X_train, y_train = train_data.iloc[:,:-1], train_data.iloc[:,-1]
X_val = pd.read_csv("validation_processed.csv")
X_test= pd.read_csv("test_processed.csv")

In [138]:
X_val

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_191.1,feature_192.1,feature_193.1,feature_194.1,feature_195.1,feature_196.1,feature_197.1,feature_198.1,feature_199.1,feature_200.1
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35910,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
35911,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
35912,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
35913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
class MLP:
    def __init__(self, hidden_dim1=64, hidden_dim2=64, output_dim=1):
        self.W1 = self.b1 = self.W2 = self.b2 = self.W3 = self.b3 = None
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.output_dim = output_dim

        self.mW1 = self.vW1 = self.mW2 = self.vW2 = self.mW3 = self.vW3 = None
        self.mb1 = self.vb1 = self.mb2 = self.vb2 = self.mb3 = self.vb3 = None

    def forward(self, X):
        if self.W1 is None:
            self.W1 = np.random.randn(X.shape[1], self.hidden_dim1) / np.sqrt(X.shape[1])
            self.b1 = np.zeros((self.hidden_dim1, 1))
            self.W2 = np.random.randn(self.hidden_dim1, self.hidden_dim2) / np.sqrt(self.hidden_dim1)
            self.b2 = np.zeros((self.hidden_dim2, 1))
            self.W3 = np.random.randn(self.hidden_dim2, self.output_dim) / np.sqrt(self.hidden_dim2)
            self.b3 = np.zeros((self.output_dim, 1))

            self.mW1 = np.zeros_like(self.W1)
            self.vW1 = np.zeros_like(self.W1)
            self.mb1 = np.zeros_like(self.b1)
            self.vb1 = np.zeros_like(self.b1)

            self.mW2 = np.zeros_like(self.W2)
            self.vW2 = np.zeros_like(self.W2)
            self.mb2 = np.zeros_like(self.b2)
            self.vb2 = np.zeros_like(self.b2)

            self.mW3 = np.zeros_like(self.W3)
            self.vW3 = np.zeros_like(self.W3)
            self.mb3 = np.zeros_like(self.b3)
            self.vb3 = np.zeros_like(self.b3)

        self.z1 = X.dot(self.W1) + self.b1.T
        self.a1 = np.maximum(0, self.z1)
        self.z2 = self.a1.dot(self.W2) + self.b2.T
        self.a2 = np.maximum(0, self.z2)
        self.z3 = self.a2.dot(self.W3) + self.b3.T
        self.output = sigmoid(self.z3)
        return self.output

    def train(self, data_train, X_val=None, epochs=20, initial_lr=0.001, batch_size=64, beta1=0.9, beta2=0.9, epsilon=1e-8, patience=3, lambda_l2=0.001):
        learning_rate = initial_lr
        best_val_loss = float('inf')
        epochs_without_improvement = 0
        losses = []
        val_losses = [0]

        t = 0
        for epoch in range(epochs):
            X_train, y_train = shuffle_dataframe(data_train)
            X_shuffled, y_shuffled = X_train, y_train
            loss_counter = 0
            for i in range(0, len(data_train)//64):
                t += 1
                """
                IMPORTANT: it's considered that batch_size == #sentences
                """
                try:
                    X_batch = X_shuffled.iloc[i*64:i*64+64]  
                    y_batch = y_shuffled.iloc[i*64:i*64+64]

                except:
                    X_batch = X_shuffled.iloc[i*64:]  
                    y_batch = y_shuffled.iloc[i*64:]
          

                """
                NOW: Do feature engineering for each article's sentences (batch) and then do forward & backprop
                """ 
                X_batch = np.array(X_batch)
                y_batch = np.array(y_batch).reshape(-1, 1)  # Reshape to make it a column vector
    
                output = self.forward(X_batch)    
                error = output - y_batch
                loss_counter += np.mean(np.square(error))
                # Weight the error by instance
                instance_weights = np.where(y_batch == 1, 17, 1) # Determine instance weights based on class labels
                weighted_error = error * instance_weights
    
                # Adjust the loss computation to use the weighted error
                mse_loss = np.mean(np.square(weighted_error))
                l2_loss = lambda_l2 * (np.sum(np.square(self.W1)) + np.sum(np.square(self.W2)) + np.sum(np.square(self.W3)))
                total_loss = mse_loss + l2_loss

                if i%2000==0:
                    print("epoch=", epoch, "loss=",loss_counter/i)

                losses.append(total_loss)

                # Adjust backpropagation to use the weighted error
                sigmoid_derivative = output * (1 - output)
                weighted_error *= sigmoid_derivative
                
                # Now continue with backpropagation as usual, but use 'weighted_error' instead of 'error'
                dW3 = (self.a2.T).dot(2 * weighted_error)
                db3 = np.sum(2 * weighted_error, axis=0, keepdims=True).T
                da2 = (2 * weighted_error).dot(self.W3.T)
                dz2 = da2 * (self.a2 > 0)
                dW2 = (self.a1.T).dot(dz2)
                db2 = np.sum(dz2, axis=0, keepdims=True).T
                da1 = dz2.dot(self.W2.T)
                dz1 = da1 * (self.a1 > 0)
                dW1 = np.dot(X_batch.T, dz1)
                db1 = np.sum(dz1, axis=0, keepdims=True).T

                # Adding the regularisation term to the gradients
                dW3 += 2 * lambda_l2 * self.W3
                dW2 += 2 * lambda_l2 * self.W2
                dW1 += 2 * lambda_l2 * self.W1

                self.mW1 = beta1 * self.mW1 + (1 - beta1) * dW1
                self.vW1 = beta2 * self.vW1 + (1 - beta2) * np.square(dW1)
                mW1_corr = self.mW1 / (1 - beta1 ** t)
                vW1_corr = self.vW1 / (1 - beta2 ** t)

                self.mW2 = beta1 * self.mW2 + (1 - beta1) * dW2
                self.vW2 = beta2 * self.vW2 + (1 - beta2) * np.square(dW2)
                mW2_corr = self.mW2 / (1 - beta1 ** t)
                vW2_corr = self.vW2 / (1 - beta2 ** t)

                self.mW3 = beta1 * self.mW3 + (1 - beta1) * dW3
                self.vW3 = beta2 * self.vW3 + (1 - beta2) * np.square(dW3)
                mW3_corr = self.mW3 / (1 - beta1 ** t)
                vW3_corr = self.vW3 / (1 - beta2 ** t)

                self.mb1 = beta1 * self.mb1 + (1 - beta1) * db1
                self.vb1 = beta2 * self.vb1 + (1 - beta2) * np.square(db1)
                mb1_corr = self.mb1 / (1 - beta1 ** t)
                vb1_corr = self.vb1 / (1 - beta2 ** t)

                self.mb2 = beta1 * self.mb2 + (1 - beta1) * db2
                self.vb2 = beta2 * self.vb2 + (1 - beta2) * np.square(db2)
                mb2_corr = self.mb2 / (1 - beta1 ** t)
                vb2_corr = self.vb2 / (1 - beta2 ** t)

                self.mb3 = beta1 * self.mb3 + (1 - beta1) * db3
                self.vb3 = beta2 * self.vb3 + (1 - beta2) * np.square(db3)
                mb3_corr = self.mb3 / (1 - beta1 ** t)
                vb3_corr = self.vb3 / (1 - beta2 ** t)

                self.W1 -= learning_rate * (mW1_corr / (np.sqrt(vW1_corr) + epsilon) + 2 * lambda_l2 * self.W1)
                self.W2 -= learning_rate * (mW2_corr / (np.sqrt(vW2_corr) + epsilon) + 2 * lambda_l2 * self.W2)
                self.W3 -= learning_rate * (mW3_corr / (np.sqrt(vW3_corr) + epsilon) + 2 * lambda_l2 * self.W3)

                self.b1 -= learning_rate * (mb1_corr / (np.sqrt(vb1_corr) + epsilon))
                self.b2 -= learning_rate * (mb2_corr / (np.sqrt(vb2_corr) + epsilon))
                self.b3 -= learning_rate * (mb3_corr / (np.sqrt(vb3_corr) + epsilon))
                

            """
            REMEMBER TO CHANGE/COMPLETE THIS to the real VAL set FOR THE FINAL VERSION
            """  
            if True:
                X_val_batch = X_val
                X_val_batch = np.array(X_val_batch)
                val_preds = self.forward(X_val_batch)

                with open("../data/validation.json", 'r') as f:
                    eval_data = json.load(f)

                eval_articles = [article['article'] for article in eval_data]
                preprocessed_val_articles = [[s.strip() for s in x.split('.')] for i, x in enumerate(eval_articles)]

                summaries = summary_extraction(preprocessed_val_articles, val_preds)
                pred_data = [{'article': article, 'summary': summary} for article, summary in zip(eval_articles, summaries)]

                with open("val_preds.json", 'w') as f:
                    json.dump(pred_data, f, indent=4)  # indent parameter is optional, it makes the output more readable
    
                val_rogue_f1 = evaluating_validation_set()
                best_val_rogue_f1 = max(val_losses)
                
                if val_rogue_f1 > best_val_rogue_f1:
                    best_val_rogue_f1 = val_rogue_f1
                    epochs_without_improvement = 0
                else:
                    epochs_without_improvement += 1
                    print("epochs num: ", epochs_without_improvement,
                          "\nval_rogue_f1: ", val_rogue_f1)
                    learning_rate *= 0.9
                    
                val_losses.append(val_rogue_f1)

                print(f'\nEpoch {epoch+1}/{epochs}\n')
                print(f'Val ROGUE-1 F1: {val_rogue_f1:.3f}, Learning Rate: {learning_rate:.6f}\n')

                if epochs_without_improvement >= patience:
                    print("Early stopping triggered.")
                    break
                
            
    def test(self, X_test):
        X_test_batch = X_test  

        X_test_batch = np.array(X_test_batch)

        test_output = self.forward(X_test_batch)

        return test_output

    

In [121]:
# Center and scale the training data
X_train = data_train.iloc[:,:-1]
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)
X_train_scaled = (X_train - train_mean) / train_std
data_train_scaled = data_train
data_train_scaled.iloc[:,:-1] = X_train_scaled
data_train_scaled.fillna(0)

# scale the validation and test data using the training mean and std:
X_val_scaled = (X_val - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std
X_val_scaled.fillna(0)
X_test_scaled.fillna(0)

# Identify columns that contain NaN values in the training set and remove these columns on training, validation and test sets
nan_columns = data_train_scaled.columns[data_train_scaled.isna().any()].tolist()
data_train_scaled = data_train_scaled.drop(nan_columns, axis=1)
X_val_scaled = X_val_scaled.drop(nan_columns, axis=1)
X_test_scaled = X_test_scaled.drop(nan_columns, axis=1)

In [122]:
# Initialize and train MLP
mlp = MLP()
mlp.train(data_train_scaled, X_val=X_val_scaled, epochs=30, initial_lr=0.001, 
          batch_size=64, beta1=0.9, beta2=0.9, epsilon=1e-8, lambda_l2=0.001)

# Evaluate the model on test set
# test_output = mlp.forward(feature_engineering(X_test))
# test_predictions = (test_output >= 0.5).astype(int)

  print("epoch=", epoch, "loss=",loss_counter/i)


epoch= 0 loss= inf
epoch= 0 loss= 0.2171515148697838
epoch= 0 loss= 0.21450771458020232
epoch= 0 loss= 0.21342162233264034


100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<?, ?it/s]



Epoch 1/30

Val ROGUE-1 F1: 0.343, Learning Rate: 0.001000



  print("epoch=", epoch, "loss=",loss_counter/i)


epoch= 1 loss= inf
epoch= 1 loss= 0.21020990593301028
epoch= 1 loss= 0.21005891486197165
epoch= 1 loss= 0.20904689182793212


100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 998406.09it/s]



Epoch 2/30

Val ROGUE-1 F1: 0.345, Learning Rate: 0.001000



  print("epoch=", epoch, "loss=",loss_counter/i)


epoch= 2 loss= inf
epoch= 2 loss= 0.20982777472249733
epoch= 2 loss= 0.2099803824307542
epoch= 2 loss= 0.20826197184953293


100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 982042.61it/s]


epochs num:  1 
val_rogue_f1:  0.3449009538204685

Epoch 3/30

Val ROGUE-1 F1: 0.345, Learning Rate: 0.000900



  print("epoch=", epoch, "loss=",loss_counter/i)


epoch= 3 loss= inf
epoch= 3 loss= 0.20720309611207738
epoch= 3 loss= 0.20749972779823508
epoch= 3 loss= 0.20794888954486965


100%|█████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1000788.36it/s]



Epoch 4/30

Val ROGUE-1 F1: 0.346, Learning Rate: 0.000900



  print("epoch=", epoch, "loss=",loss_counter/i)


epoch= 4 loss= inf
epoch= 4 loss= 0.20642852416550234
epoch= 4 loss= 0.20793284167383305
epoch= 4 loss= 0.20798112894631235


100%|█████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1000549.62it/s]



Epoch 5/30

Val ROGUE-1 F1: 0.348, Learning Rate: 0.000900



  print("epoch=", epoch, "loss=",loss_counter/i)


epoch= 5 loss= inf
epoch= 5 loss= 0.20812544467377256
epoch= 5 loss= 0.20814373799287028
epoch= 5 loss= 0.20742042324493817


100%|██████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<?, ?it/s]


epochs num:  1 
val_rogue_f1:  0.34474699314217416

Epoch 6/30

Val ROGUE-1 F1: 0.345, Learning Rate: 0.000810



  print("epoch=", epoch, "loss=",loss_counter/i)


epoch= 6 loss= inf
epoch= 6 loss= 0.205960531422735
epoch= 6 loss= 0.20761906661350446
epoch= 6 loss= 0.2072713690250376


100%|█████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1001266.17it/s]


epochs num:  2 
val_rogue_f1:  0.3468641153209727

Epoch 7/30

Val ROGUE-1 F1: 0.347, Learning Rate: 0.000729



  print("epoch=", epoch, "loss=",loss_counter/i)


epoch= 7 loss= inf
epoch= 7 loss= 0.20372580244856564
epoch= 7 loss= 0.20549382068355243
epoch= 7 loss= 0.20832705985340252


100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 499797.90it/s]


epochs num:  3 
val_rogue_f1:  0.34588436379011667

Epoch 8/30

Val ROGUE-1 F1: 0.346, Learning Rate: 0.000656

Early stopping triggered.


In [123]:
train_preds_greedy = data_train["target"]
train_preds = mlp.test(data_train_scaled.iloc[:,:-1])
val_preds = mlp.test(X_val_scaled)
test_preds = mlp.test(X_test_scaled)

In [134]:
import math 

def top_m_indicator(a_preds, m):
    # Step 1: Identify the top m highest probabilities and their indices
    a_preds = [float(p) for p in a_preds]

    top_m_indices = np.argsort(a_preds)[-m:]
    
    # Step 2: Create a new list of zeros
    indicator_list = np.zeros(len(a_preds), dtype=int)
    
    # Step 3: Set the elements corresponding to the top m probabilities to 1
    indicator_list[top_m_indices] = 1
    
    return indicator_list.tolist()


def summary_extraction(prepro_articles, preds):
    i = 0
    summaries = []
    for a in prepro_articles:
        a_preds = preds[i:i+len(a)]
        m = 3
        summary_index = top_m_indicator(a_preds, m)
        summary = [s for i, s in enumerate(a) if summary_index[i] == 1]
        summary = '\n'.join(summary)
        summaries.append(summary)
        i += len(a)
        
    return summaries

with open("../data/validation.json", 'r') as f:
    eval_data = json.load(f)

eval_articles = [article['article'] for article in eval_data]
preprocessed_val_articles = preprocess(eval_articles) 

summaries = summary_extraction(preprocessed_val_articles, val_preds)
pred_data = [{'article': article, 'summary': summary} for article, summary in zip(eval_articles, summaries)]

with open("validation_pred_data.json", 'w') as f:
    json.dump(pred_data, f, indent=4)  # indent parameter is optional, it makes the output more readable

In [135]:
with open("../data/test.json", 'r') as f:
    eval_data = json.load(f)

eval_articles = [article['article'] for article in eval_data]
preprocessed_val_articles = preprocess(eval_articles) 

summaries = summary_extraction(preprocessed_val_articles, test_preds)
pred_data = [{'article': article, 'summary': summary} for article, summary in zip(eval_articles, summaries)]

with open("test_pred_data.json", 'w') as f:
    json.dump(pred_data, f, indent=4)  # indent parameter is optional, it makes the output more readable

In [136]:
with open("../data/train.json", 'r') as f:
    eval_data = json.load(f)

eval_articles = [article['article'] for article in eval_data]
preprocessed_val_articles = preprocess(eval_articles) 

summaries = summary_extraction(preprocessed_val_articles, train_preds)
pred_data = [{'article': article, 'summary': summary} for article, summary in zip(eval_articles, summaries)]

with open("train_pred_data.json", 'w') as f:
    json.dump(pred_data, f, indent=4)  # indent parameter is optional, it makes the output more readable

In [130]:
"""
Best ROGUE scores that we can achieve
"""

def summary_extraction_train(prepro_articles, preds):
    i = 0
    summaries = []
    for a in prepro_articles:
        a_preds = preds[i:i+len(a)]
        summary = [s for i, s in enumerate(a) if a_preds.iloc[i] == 1]
        summary = '\n'.join(summary)
        summaries.append(summary)
        i += len(a)
        
    return summaries

with open("../data/train.json", 'r') as f:
    eval_data = json.load(f)

eval_articles = [article['article'] for article in eval_data]
preprocessed_val_articles = preprocess(eval_articles) 

summaries = summary_extraction_train(preprocessed_val_articles, train_preds_greedy)
pred_data = [{'article': article, 'summary': summary} for article, summary in zip(eval_articles, summaries)]

with open("train_greedy_pred_data.json", 'w') as f:
    json.dump(pred_data, f, indent=4)  # indent parameter is optional, it makes the output more readable

# Compute the baselines

In [112]:
def random_summary_extraction(prepro_articles, preds):
    i = 0
    summaries = []
    for a in prepro_articles:
        n = len(a)
        a_preds = preds[i:i+n]
        m = 3
        
        ones_list = [1] * m
        zeros_list = [0] * (n-m)
        summary_index = ones_list + zeros_list
        random.shuffle(summary_index)
        
        summary = [s for i, s in enumerate(a) if summary_index[i] == 1]
        summary = '\n'.join(summary)
        summaries.append(summary)
        i += len(a)
        
    return summaries

with open("C:/Users/17245/MPhil ACS/L90-Summarization-main/data/validation.json", 'r') as f:
    eval_data = json.load(f)

eval_articles = [article['article'] for article in eval_data]
preprocessed_val_articles = preprocess(eval_articles) 

summaries = random_summary_extraction(preprocessed_val_articles, val_preds)
pred_data = [{'article': article, 'summary': summary} for article, summary in zip(eval_articles, summaries)]

with open("val_data_baseline.json", 'w') as f:
    json.dump(pred_data, f, indent=4)  # indent parameter is optional, it makes the output more readable
    
    
with open("C:/Users/17245/MPhil ACS/L90-Summarization-main/data/test.json", 'r') as f:
    eval_data = json.load(f)

eval_articles = [article['article'] for article in eval_data]
preprocessed_val_articles = preprocess(eval_articles) 

summaries = random_summary_extraction(preprocessed_val_articles, val_preds)
pred_data = [{'article': article, 'summary': summary} for article, summary in zip(eval_articles, summaries)]
    
with open("test_data_baseline.json", 'w') as f:
    json.dump(pred_data, f, indent=4)  # indent parameter is optional, it makes the output more readable