<a href="https://colab.research.google.com/github/iguerrasevillano/TFM/blob/main/Extractive_summaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libraries

In [2]:
!pip install --upgrade tensorflow

Collecting tensorflow
  Downloading tensorflow-2.15.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow)
  Downloading tensorboard-2.15.1-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.16,>=2.15.0 (from tensorflow)
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
Collecting google-auth-oa

In [3]:
# Libraries
import os
import pandas as pd
import numpy as np
import json

# Visualization
import matplotlib.pyplot as plt

# Keras
import tensorflow as tf

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Graph
import networkx as nx
import re

# Time
import time

# AST
import ast

# Current directory
os.getcwd()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


'/content'

In [1]:
import tensorflow as tf
print(tf.__version__)

2.12.0


### Load and clean raw data

In [4]:
# Connect w/ Google Drive
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
BASE_PATH = "/content/drive/MyDrive/VIU/TFM/Desarrollo/Data/"

documents = os.listdir(BASE_PATH)

In [6]:
# AUXILIAR FUNCTIONS

def load_data(data):

  jsonl_file_path = BASE_PATH + 'TLDR/' + data + '.jsonl'

  # Read the JSON Lines file into a list of dictionaries
  data_list = []
  with open(jsonl_file_path, 'r') as jsonl_file:
      for line in jsonl_file:
          data_dict = json.loads(line)
          data_list.append(data_dict)

  return data_list



# Join all the sentences of target
def join_words(df, column):
  df[column] = df[column].apply(lambda x : ' '.join(x))
  return df


# Count number of words of target
def count_words(df, column):
  return df[column].apply(lambda x : len(x.split()))



def similarity(sentence1, sentence2, stopwords=None):
  if stopwords is None:
    stopwords = []
  sentence1 = [w.lower() for w in sentence1]
  sentence2 = [w.lower() for w in sentence2]

  all_words = list(set(sentence1 + sentence2))

  vector1 = [0] * len(all_words)
  vector2 = [0] * len(all_words)

  #build the vector for the first sentence
  for word in sentence1:
    if not word in stopwords:
      vector1[all_words.index(word)]+=1

  #build the vector for the second sentence
  for word in sentence2:
    if not word in stopwords:
      vector2[all_words.index(word)]+=1

  norm_vector1 = np.sqrt(np.dot(vector1, vector1))
  norm_vector2 = np.sqrt(np.dot(vector2, vector2))

  if norm_vector1 == 0 or norm_vector2 == 0:
    return 0

  return 1-cosine_distance(vector1, vector2)



def similarity_matrix(sentences, stop_words):
  similarity_matrix = np.zeros((len(sentences), len(sentences)))

  for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
      if idx1 != idx2:
        similarity_matrix[idx1][idx2] = similarity(sentences[idx1], sentences[idx2], stop_words)

  return similarity_matrix



def generate_extractive_summary(sentences, top_n, stop_words):

  summarize_text = []

  # Step1: generate similarity matrix
  sentence_similarity_matrix = similarity_matrix(sentences, stop_words)

  # Step2: Rank sentences in similarity matrix
  sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
  scores = nx.pagerank(sentence_similarity_graph)

  # Step3: sort the rank and place top sentences
  ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

  # Step4: get the top n number of sentences based on rank
  for i in range(top_n):
    if len(ranked_sentences) > i:
      summarize_text.append(ranked_sentences[i][1])
    else:
      break

  # Step5: reorder the sentences as in the original text
  real_indexes = []
  for sentence in summarize_text:
    if sentence in sentences:
      real_indexes.append(sentences.index(sentence))
    else:
      continue

  sorted_real_indexes = sorted(real_indexes)

  ordered_summarized_text = []
  for i in sorted_real_indexes:
    ordered_summarized_text.append(sentences[i])

  # Step6 : output the summarized version
  return ' '.join(ordered_summarized_text)

In [32]:
data_list = load_data('train')
data_list.extend(load_data('dev'))
data_list.extend(load_data('test'))

# Convert the list of dictionaries to a Pandas DataFrame
data = pd.DataFrame(data_list)

# Display the DataFrame
display(data.head())
print(data['source'][0])

Unnamed: 0,source,source_labels,rouge_scores,paper_id,target,title
0,[Due to the success of deep learning to solvin...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.3018867874688502, 0.37209301838831804, 0.60...",SysEexbRb,[We provide necessary and sufficient analytica...,
1,[The backpropagation (BP) algorithm is often t...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.13043477920604923, 0.142857139229...",SygvZ209F7,"[Biologically plausible learning algorithms, p...",
2,"[We introduce the 2-simplicial Transformer, an...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.33333332839506175, 0.8888888839111112, 0.11...",rkecJ6VFvr,[We introduce the 2-simplicial Transformer and...,
3,"[We present Tensor-Train RNN (TT-RNN), a novel...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.06666666222222252, 0.06451612466181092, 0.0...",HJJ0w--0W,[Accurate forecasting over very long time hori...,
4,[Recent efforts on combining deep models with ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.2777777727932099, 0.5714285666581633, 0.095...",HyH9lbZAW,[We propose a variational message-passing algo...,


['Due to the success of deep learning to solving a variety of challenging machine learning tasks, there is a rising interest in understanding loss functions for training neural networks from a theoretical aspect.', 'Particularly, the properties of critical points and the landscape around them are of importance to determine the convergence performance of optimization algorithms.', 'In this paper, we provide a necessary and sufficient characterization of the analytical forms for the critical points (as well as global minimizers) of the square loss functions for linear neural networks.', 'We show that the analytical forms of the critical points characterize the values of the corresponding loss functions as well as the necessary and sufficient conditions to achieve global minimum.', 'Furthermore, we exploit the analytical forms of the critical points to characterize the landscape properties for the loss functions of linear neural networks and shallow ReLU networks.', 'One particular conclu

In [33]:
# Drop columns
no_need_columns = ['source_labels', 'rouge_scores']
data = data.drop(columns=no_need_columns)

data = join_words(data, 'target')

data['number_words_target'] = count_words(data, 'target')

print(data['number_words_target'].describe())

count    3229.000000
mean       34.522453
std        24.459065
min         3.000000
25%        16.000000
50%        24.000000
75%        51.000000
max       149.000000
Name: number_words_target, dtype: float64


In [34]:
# Summaries with less than 25 words are eliminated from the data
clean_data = data[data['number_words_target']>=30]
clean_data = clean_data.reset_index(drop=True)

clean_data['number_words_target'].describe()

count    1312.000000
mean       58.935213
std        20.402388
min        30.000000
25%        41.000000
50%        58.000000
75%        73.000000
max       149.000000
Name: number_words_target, dtype: float64

In [44]:
clean_data['extractive_summary'] = np.nan
clean_data.head()

Unnamed: 0,source,paper_id,target,title,number_words_target,extractive_summary
0,[Due to the success of deep learning to solvin...,SysEexbRb,We provide necessary and sufficient analytical...,,38,
1,[Generative Adversarial Networks (GANs) have a...,ryj38zWRb,Are GANs successful because of adversarial tra...,,36,
2,[Dialogue systems require a great deal of diff...,BJepraEFPr,"In this paper, we propose to learn a dialogue ...",,30,
3,[Backdoor attacks aim to manipulate a subset o...,rkgyS0VFvr,We proposed a novel distributed backdoor attac...,,35,
4,[The integration of a Knowledge Base (KB) int...,SJl7tREFvr,Conventional memory networks generate many red...,,32,


In [7]:
#RESET DATA
import ast

clean_data = pd.read_csv(BASE_PATH+'extractive_summaries+good.csv')

# Assuming 'df' is your DataFrame and 'list_column' is the column with lists
# For example, if your DataFrame looks like this:
# df = pd.DataFrame({'list_column': [[1, 2, 3], [4, 5], [6, 7, 8]]})

# Define a function to safely convert strings to lists
def convert_to_list(cell):
    try:
        return ast.literal_eval(cell)
    except (SyntaxError, ValueError):
        return cell

# Apply the function to the column with lists
clean_data['source'] = clean_data['source'].apply(convert_to_list)

# Now, 'list_column' contains lists
# Access the first list in the first row, for example
first_list = clean_data.at[0, 'source']
print(first_list)

['Due to the success of deep learning to solving a variety of challenging machine learning tasks, there is a rising interest in understanding loss functions for training neural networks from a theoretical aspect.', 'Particularly, the properties of critical points and the landscape around them are of importance to determine the convergence performance of optimization algorithms.', 'In this paper, we provide a necessary and sufficient characterization of the analytical forms for the critical points (as well as global minimizers) of the square loss functions for linear neural networks.', 'We show that the analytical forms of the critical points characterize the values of the corresponding loss functions as well as the necessary and sufficient conditions to achieve global minimum.', 'Furthermore, we exploit the analytical forms of the critical points to characterize the landscape properties for the loss functions of linear neural networks and shallow ReLU networks.', 'One particular conclu

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')
top_n = 16
total = clean_data.shape[0]
prev_index = 0

reset_from = 990

for index in range(reset_from, total):
  if index % 10 == 0:
    if (prev_index+10) == index:
      end_time = time.time()
      print(f'{int(end_time - start_time)}s')
    print(f'[INFO]: generating summary {index}/{total}...')
    prev_index = index
    start_time = time.time()
    clean_data.to_csv(BASE_PATH+'extractive_summaries_good.csv', index=False)

  clean_data['extractive_summary'][index] = generate_extractive_summary(clean_data['source'][index],
                                                                        top_n,
                                                                        stop_words)

clean_data.to_csv(BASE_PATH+'extractive_summaries+good.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[INFO]: generating summary 990/1312...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['extractive_summary'][index] = generate_extractive_summary(clean_data['source'][index],


294s
[INFO]: generating summary 1000/1312...
290s
[INFO]: generating summary 1010/1312...
344s
[INFO]: generating summary 1020/1312...
396s
[INFO]: generating summary 1030/1312...
285s
[INFO]: generating summary 1040/1312...
351s
[INFO]: generating summary 1050/1312...
306s
[INFO]: generating summary 1060/1312...
438s
[INFO]: generating summary 1070/1312...
256s
[INFO]: generating summary 1080/1312...
367s
[INFO]: generating summary 1090/1312...
468s
[INFO]: generating summary 1100/1312...
463s
[INFO]: generating summary 1110/1312...
355s
[INFO]: generating summary 1120/1312...
329s
[INFO]: generating summary 1130/1312...
300s
[INFO]: generating summary 1140/1312...
329s
[INFO]: generating summary 1150/1312...
433s
[INFO]: generating summary 1160/1312...
443s
[INFO]: generating summary 1170/1312...
254s
[INFO]: generating summary 1180/1312...


In [10]:
  clean_data['extractive_summary'][1311]


'Our approach transfers the structure of a visual representation space to the textual space by using two complementary sources of information: (1) the cluster information: the implicit knowledge that two sentences associated with the same visual content describe the same underlying reality and (2) the perceptual information contained within the structure of the visual space. Moreover, encoding semantics of sentences is paramount because sentences describe relationships between objects and thus convey complex and high-level knowledge better than individual words, which mostly refer to a single concept BID38 .Relying only on text can lead to biased representations and unrealistic predictions (e.g., text-based models could predict that "the sky is green" BID1 ). While there exist numerous approaches to learning sentence representations from text corpora only, and to learning multimodal word embeddings, the problem of the visual grounding of sentences is quite new to the research community

In [12]:
clean_data['sentences_extractive_summary'] = clean_data['extractive_summary'].apply(sent_tokenize)

In [17]:
print(clean_data['sentences_extractive_summary'][1])
print(clean_data['source'][1])

['The experimental results presented in this work suggest that, in the image domain, we can recover many of the properties of GAN models by using convnets trained with simple reconstruction losses.', 'Therefore, it is reasonable to hypothesize that the reasons for the success of GANs in modeling natural images come from two complementary sources: (A1) Leveraging the powerful inductive bias of deep convnets.', '(A2) The adversarial training protocol.', 'While this does not invalidate the promise of GANs as generic models of uncertainty or as methods for building generative models, our results suggest that, in order to more fully test the adversarial construction, research needs to move beyond images and convnets.', 'The most common choices of the representation space for GANs are either the uniform distribution on the hypercube DISPLAYFORM1 In previous literature, Gaussian distributions lead to more stable GAN training BID36 , we will take this choice to design our representation space.

In [23]:
a = clean_data['source'][1]
for i in range(len(a)):
  if '(A1)' in a[i]:
    print(i)
    print(a[i])

32
Therefore, it is reasonable to hypothesize that the reasons for the success of GANs in modeling natural images come from two complementary sources: (A1) Leveraging the powerful inductive bias of deep convnets. (A2) The adversarial training protocol.
33
This work attempts to disentangle the factors of success (A1) and (A2) in GAN models.
34
Specifically, we propose and study one algorithm that relies on (A1) and avoids (A2), but still obtains competitive results when compared to a GAN.


In [21]:
"Therefore, it is reasonable to hypothesize that the reasons for the success of GANs in modeling natural images come from two complementary sources: (A1) Leveraging the powerful inductive bias of deep convnets." in clean_data['source'][1]

False

In [None]:
clean_data['number_words_extractive'] = count_words(clean_data, 'extractive_summary')

clean_data.describe()

Unnamed: 0,number_words_target,number_words_extractive
count,1312.0,1312.0
mean,58.935213,621.176067
std,20.402388,143.951058
min,30.0,126.0
25%,41.0,534.0
50%,58.0,611.0
75%,73.0,693.0
max,149.0,1199.0


In [None]:
i = 1
print(clean_data['source'][i])
print(clean_data['target'][i])
print(clean_data['extractive_summary'][i])
print(clean_data['number_words_extractive'][i])
print(clean_data['number_words_extractive'][i])

['Generative Adversarial Networks (GANs) have achieved remarkable results in the task of generating realistic natural images.', 'In most applications, GAN models share two aspects in common.', 'On the one hand, GANs training involves solving a challenging saddle point optimization problem, interpreted as an adversarial game between a generator and a discriminator functions.', 'On the other hand, the generator and the discriminator are parametrized in terms of deep convolutional neural networks.', 'The goal of this paper is to disentangle the contribution of these two factors to the success of GANs.', 'In particular, we introduce Generative Latent Optimization (GLO), a framework to train deep convolutional generators without using discriminators, thus avoiding the instability of adversarial optimization problems.', 'Throughout a variety of experiments, we show that GLO enjoys many of the desirable properties of GANs: learning from large data, synthesizing visually-appealing samples, int

In [9]:
# Assuming 'summarize_text' is a string
summarize_text = "This is the first sentence. This is the second sentence. And this is the third sentence."

# Split the string into a list of sentences
list_of_sentences = sent_tokenize(summarize_text)

# Print the result
print(list_of_sentences)

['This is the first sentence.', 'This is the second sentence.', 'And this is the third sentence.']
