<a href="https://colab.research.google.com/github/kabilan942/Natural-Language-Processing/blob/main/Summarization-DailyMail/dailymail_capsule.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Modules Required

In [None]:
!pip install rouge-score
!pip install rouge

In [None]:
!pip install transformers
!pip install tensorflow

In [None]:
!pip install datasets

In [None]:
import sys
sys.path.insert(0, '../')
import torch
import nltk
nltk.download('punkt')
from rouge_score import rouge_scorer
from sklearn.preprocessing import MinMaxScaler
from rouge import Rouge
import math
import pandas as pd
import numpy as np
from tqdm import tqdm

#from transformers import BertTokenizer, TFBertModel
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from keras.layers import Layer, Input, Dense, MaxPooling2D,concatenate,Lambda, AveragePooling2D, Dropout
from keras.models import Model
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv2D
from keras import regularizers
from keras.optimizers import Adadelta
from keras.optimizers import Adam
from tensorflow.keras.constraints import MaxNorm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Getting the data

In [None]:
train=pd.read_csv("/content/training.csv")
valid=pd.read_csv("/content/validation.csv")
test=pd.read_csv("/content/test.csv")

In [None]:
train.dropna(inplace=True)
valid.dropna(inplace=True)
test.dropna(inplace=True)

In [None]:
train_source = train['article'].values.tolist()
train_summary = train['highlights'].values.tolist()
print("train file read successfully")
valid_source = valid['article'].values.tolist()
valid_summary = valid['highlights'].values.tolist()
print("valid file read successfully")
data_source = train_source+valid_source
data_summary = train_summary+valid_summary

train file read successfully
valid file read successfully


## Calculating Saliency score for each sentence with its summary

In [None]:
def saliency_score(doc, summary, alpha=0.5):
  # takes in two inputs and returns the saliency score
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
  scores = scorer.score(doc, summary)
  R1 = scores['rouge1'][2]
  R2 = scores['rouge2'][2]

  return alpha * R1 + (1 - alpha) * R2

def split_to_sentences(para):
    sents = nltk.sent_tokenize(para)   # returns list of sentences from para
    return sents

In [None]:
input_data = [[],[],[],[],[],[]]  # [docid, sentid, sentences, length of sentence, length of summary, saliency scores]
total_sentences_doc = []
total_sentences_summary = []
for i in tqdm(range(len(data_source))):
  splitted_doc = split_to_sentences(data_source[i])
  total_sentences_doc.append(len(splitted_doc))
  doc_sents = []
  for ind in range(len(splitted_doc)):
    if len(splitted_doc[ind].split())>5:
      doc_sents.append(splitted_doc[ind])
  summary = data_summary[i]
  total_sentences_summary.append(len(split_to_sentences(summary)))
  saliency_list = []
  for sent in doc_sents:
    saliency_list.append(saliency_score(sent, summary))
  saliency_list = np.array(saliency_list)
  mms = MinMaxScaler()
  if len(saliency_list)>0:  # to avoide this error: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required by MinMaxScaler.
    scaled_saliency_list = mms.fit_transform(saliency_list.reshape(-1,1))
    for j in range(len(doc_sents)):
      len_sent = len(doc_sents[j].split())
      input_data[0].append(i)
      input_data[1].append(j)
      input_data[2].append(doc_sents[j])
      input_data[3].append(len_sent)
      input_data[4].append(len(split_to_sentences(summary)))
      input_data[5].append(scaled_saliency_list[j][0])

100%|██████████| 29/29 [00:01<00:00, 22.32it/s]


In [None]:
avg_sent_per_doc = sum(total_sentences_doc)/len(data_source)
avg_sent_per_sum = sum(total_sentences_summary)/len(data_summary)
longest_sent = max(total_sentences_doc)

print('Average sentences per doc: ',avg_sent_per_doc)
print('Average sentences per summary: ',avg_sent_per_sum)
print('Longest sentence in doc:', longest_sent)

Average sentences per doc:  36.827586206896555
Average sentences per summary:  3.9310344827586206
Longest sentence in doc: 83


In [None]:
df = pd.DataFrame({'doc_id':input_data[0],'sent_id':input_data[1],'Sentence':input_data[2],'Sentence Length':input_data[3], 'Summary Length':input_data[4], 'Saliency Score':input_data[5]})

df.head()

Unnamed: 0,doc_id,sent_id,Sentence,Sentence Length,Summary Length,Saliency Score
0,0,0,"14:11 EST, 25 October 2013 .",6,3,0.0
1,0,1,"15:36 EST, 25 October 2013 .",6,3,0.0
2,0,2,The bishop of the Fargo Catholic Diocese in No...,34,3,0.787966
3,0,3,The state Health Department has issued an advi...,19,3,0.173988
4,0,4,Bishop John Folda (pictured) of the Fargo Cath...,30,3,1.0


In [None]:
df.dtypes

doc_id               int64
sent_id              int64
Sentence            object
Sentence Length      int64
Summary Length       int64
Saliency Score     float64
dtype: object

In [None]:
df.shape

(937, 6)

In [None]:
df.to_csv('saliency_sentences.csv')

## Generate word embeddings for each sentence in doc

In [None]:
# Load Roberta tokenizer and model
from transformers import RobertaTokenizer, TFRobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

### Removing sentences with more than 76 tokens and storing in df - saliency_sentences_76

In [None]:
saliency_sentences = pd.read_csv('/content/saliency_sentences.csv',  usecols=['Sentence','Saliency Score'])

saliency_sentences.head()

Unnamed: 0,Sentence,Saliency Score
0,"14:11 EST, 25 October 2013 .",0.0
1,"15:36 EST, 25 October 2013 .",0.0
2,The bishop of the Fargo Catholic Diocese in No...,0.787966
3,The state Health Department has issued an advi...,0.173988
4,Bishop John Folda (pictured) of the Fargo Cath...,1.0


In [None]:
input_data = saliency_sentences.to_numpy().transpose()

sentences = input_data[0]

In [None]:
input_data.shape

(2, 937)

In [None]:
import numpy as np
max_len = 0
count76=0
summ=0
out_range=[]
for i in tqdm(range(len(sentences))):
  encoded_text = tokenizer(sentences[i], return_tensors='tf')
  l = len(np.array(encoded_text['input_ids'])[0])
  if l>50:
    count76+=1
    out_range.append(i)
    continue
  summ+=l
  if l>max_len:
    max_len=l

avg_token_size = summ/len(sentences)
print(max_len, avg_token_size, count76)

100%|██████████| 937/937 [00:00<00:00, 1576.00it/s]

50 24.82497331910352 66





In [None]:
input_data_76 = np.delete(input_data,out_range, axis=1)   # add in prev loop with an if statement that if l>100 -> delete that row

input_data_76.shape

(2, 871)

In [None]:
saliency_sentences_76 = pd.DataFrame(data = input_data_76.transpose(), columns = ['Sentence', 'Saliency Score'])

saliency_sentences_76.head()

Unnamed: 0,Sentence,Saliency Score
0,"14:11 EST, 25 October 2013 .",0.0
1,"15:36 EST, 25 October 2013 .",0.0
2,The bishop of the Fargo Catholic Diocese in No...,0.787966
3,The state Health Department has issued an advi...,0.173988
4,Bishop John Folda (pictured) of the Fargo Cath...,1.0


In [None]:
saliency_sentences_76.to_excel('saliency_sentences_limit_train.xlsx')

### Generating embedding batchwise

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
saliency_sentences_limit50_train = pd.read_excel('saliency_sentences_limit_train.xlsx', usecols=['Sentence','Saliency Score'])
saliency_sentences_limit50_valid = pd.read_excel('saliency_sentences_limit_train.xlsx', nrows = 100, usecols=['Sentence','Saliency Score'])

In [None]:
dataset = DatasetDict()
dataset['train'] = Dataset.from_pandas(saliency_sentences_limit50_train)
dataset['valid'] = Dataset.from_pandas(saliency_sentences_limit50_valid)
dataset

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'Saliency Score'],
        num_rows: 871
    })
    valid: Dataset({
        features: ['Sentence', 'Saliency Score'],
        num_rows: 100
    })
})

In [None]:
# Load Roberta tokenizer and model
from transformers import RobertaTokenizer, TFRobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

In [None]:
max_length = 50

def preprocess_function(examples):
  encoded_sentences = tokenizer(examples['Sentence'], return_tensors='tf', padding='max_length', max_length=max_length, truncation=True)
  examples['input_ids'] = encoded_sentences['input_ids']
  examples['attention_mask'] = encoded_sentences['attention_mask']
  return examples

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size=20)  # batch_size=1000 by default

Map:   0%|          | 0/871 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'Saliency Score', 'input_ids', 'attention_mask'],
        num_rows: 871
    })
    valid: Dataset({
        features: ['Sentence', 'Saliency Score', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [None]:
tokenized_dataset = tokenized_dataset.with_format("np")

In [None]:
tokenized_dataset.save_to_disk('dm_roberta_embeddings')

Saving the dataset (0/1 shards):   0%|          | 0/871 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from datasets import load_from_disk
reloaded_encoded_dataset = load_from_disk("dm_roberta_embeddings")
# reloaded_encoded_dataset = load_from_disk("/scratch/ravindra.cse.nitt/extractive/embedding_legal_extractive")

In [None]:
train_input_ids = np.array(reloaded_encoded_dataset['valid']['input_ids'])
train_attention_mask = np.array(reloaded_encoded_dataset['valid']['attention_mask'])
train_labels = np.array(reloaded_encoded_dataset['valid']['Saliency Score'])

valid_input_ids = np.array(reloaded_encoded_dataset['valid']['input_ids'])
valid_attention_mask = np.array(reloaded_encoded_dataset['valid']['attention_mask'])
valid_labels = np.array(reloaded_encoded_dataset['valid']['Saliency Score'])

In [None]:
train_input_ids = train_input_ids.astype(np.int32)
train_attention_mask = train_attention_mask.astype(np.int32)

valid_input_ids = valid_input_ids.astype(np.int32)
valid_attention_mask = valid_attention_mask.astype(np.int32)

## Modelling

In [None]:
"""
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc_model = TFBertModel.from_pretrained("bert-base-uncased")
"""
# Load Roberta tokenizer and model
from transformers import RobertaTokenizer, TFRobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [None]:
!git clone https://github.com/yasithdev/capsnet.git

fatal: destination path 'capsnet' already exists and is not an empty directory.


In [None]:
cd capsnet

/content/capsnet


In [None]:
from capsnet import nn, layers
from capsnet.layers import ConvCaps2D, DenseCaps

In [None]:
class BERTEmbedding(Layer):
  def _init_(self, **kwargs):
    super(BERTEmbedding, self)._init_(**kwargs)
  def call(self, inputs, **kwargs):
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate BERT embeddings
    bert_outputs = roberta_model(input_ids, attention_mask=attention_mask)
    hidden_state = bert_outputs['last_hidden_state']
    embeddings = tf.expand_dims(hidden_state, axis=-1)
    return embeddings

In [None]:
# Open a strategy scope.
#with strategy.scope():
    # Everything that creates variables should be under the strategy scope.
    # In general this is only model construction & `compile()`.
input_ids = Input(shape=(50,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(50,), dtype=tf.int32, name='attention_mask')

# Apply BERTEmbedding layer
bert_embedding = BERTEmbedding()({'input_ids': input_ids, 'attention_mask': attention_mask})

conv2d_layer_1 = Conv2D(256, (2,768), padding='valid', activation='relu', strides=1)(bert_embedding)
conv2d_layer_2 = Conv2D(256, (3,1), padding='valid', activation='relu', strides=1)(conv2d_layer_1)
conv2d_layer_3 = Conv2D(256, (4,1), padding='valid', activation='relu', strides=1)(conv2d_layer_2)

# convert to capsule domain
conv_caps_2d = ConvCaps2D(filters=32, filter_dims=8, kernel_size=(5,1), strides=(1, 1), name='conv_caps_2d')(conv2d_layer_3)
conv_caps_2d = Lambda(nn.squash)(conv_caps_2d)

# dense layer for dynamic routing
dense_caps = DenseCaps(caps=2, caps_dims=16, routing_iter=3, name='dense_caps')(conv_caps_2d)
dense_caps = Lambda(nn.squash)(dense_caps)

#merged = concatenate([tower1, tower2, tower3, tower4], axis=2)
flat = Flatten()(dense_caps)

drop = Dropout(0.2)(flat)
output = Dense(1, activation='sigmoid')(drop)

model = Model(inputs=[input_ids, attention_mask], outputs=output)

model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['mean_absolute_error','accuracy'])

print(model.summary())



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 50)]         0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 50)]         0           []                               
                                                                                                  
 bert_embedding (BERTEmbedding)  (None, 50, 768, 1)  0           ['attention_mask[0][0]',         
                                                                  'input_ids[0][0]']              
                                                                                                  
 conv2d (Conv2D)                (None, 49, 1, 256)   393472      ['bert_embedding[0][0]']     

In [None]:
nepochs = 5
batch_size = 32
val_train_ratio = 0.1

In [None]:
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2, restore_best_weights=True)

In [None]:
model.fit([train_input_ids, train_attention_mask], train_labels, epochs=nepochs, batch_size=batch_size,
          validation_data=([valid_input_ids, valid_attention_mask], valid_labels), callbacks=[early_stop])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7879dc439ab0>

In [None]:
model.save_weights('dm_deepcnncapsule.h5')

## Testing

In [None]:
test=pd.read_csv("/content/test.csv")

In [None]:
test_data_source = test['article'].values.tolist()
test_data_summary = test['highlights'].values.tolist()

In [None]:
#k = math.ceil(avg_sent_per_sum)
k = 4
print('Value of k:',k)

Value of k: 4


In [None]:
result_rouge_list = []

for i in tqdm(range(len(test_data_source))):
  doc = test_data_source[i]
  summary = test_data_summary[i]
  sentences = split_to_sentences(doc)

  # sentences are encoded (padded and truncated to input size of model) and embedded using BERT
  encoded_sentences = tokenizer(sentences, return_tensors='tf', padding='max_length', truncation=True, max_length=50)
  test_input_ids = encoded_sentences['input_ids']
  test_attention_mask = encoded_sentences['attention_mask']

  y_pred = model.predict([test_input_ids,test_attention_mask])
  y_pred=y_pred.reshape(y_pred.shape[0],)

  # choosing the top k (k- average of number of sentences per summary during training)
  temp = sorted(y_pred)[-k:]
  sal_index = []
  for ele in temp:
    sal_index.append(list(y_pred).index(ele))
  print(sal_index)
  sal_index_sorted = sorted(sal_index)
  print(sal_index_sorted)

  # pred_summary - list of sentences having the top k saliency scores as predicted by the model
  pred_summary = []
  for ind in sal_index_sorted:
    pred_summary.append(sentences[ind])

  # the sentences in pred_summary are joined
  final_pred_summary = ' '.join(pred_summary)
  #print(len(final_pred_summary.split(' ')))
  final_pred_summary_75 = final_pred_summary.split(' ')[:75]
  final_pred_summary = ' '.join(final_pred_summary_75)
  #print(len(final_pred_summary.split(' ')))

  # Rouge score is calculates between the predicted summary (final_pred_summary) and the reference summary
  rouge = Rouge()
  result_rouge_list.append(rouge.get_scores(summary, final_pred_summary, avg=True))

  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [00:03<00:14,  3.53s/it]

[1, 0, 9, 7]
[0, 1, 7, 9]


 40%|████      | 2/5 [00:03<00:05,  1.68s/it]

[17, 9, 3, 7]
[3, 7, 9, 17]


 60%|██████    | 3/5 [00:05<00:03,  1.54s/it]

[29, 3, 71, 10]
[3, 10, 29, 71]


 80%|████████  | 4/5 [00:06<00:01,  1.23s/it]

[47, 49, 3, 50]
[3, 47, 49, 50]


100%|██████████| 5/5 [00:06<00:00,  1.33s/it]

[23, 19, 22, 21]
[19, 21, 22, 23]





In [None]:
agg_rouge_score = {'rouge-1':{'r':0,'p':0,'f':0},'rouge-2':{'r':0,'p':0,'f':0},
                   'rouge-l':{'r':0,'p':0,'f':0}}

for i in range(len(result_rouge_list)):
  for rouge_type in ['rouge-1','rouge-2','rouge-l']:
    for j in ['r','p','f']:
      agg_rouge_score[rouge_type][j]+=(result_rouge_list[i][rouge_type][j]/len(result_rouge_list))

print(agg_rouge_score)

{'rouge-1': {'r': 0.16407103825136612, 'p': 0.2832755881081892, 'f': 0.19707379504003636}, 'rouge-2': {'r': 0.027064050351721585, 'p': 0.04314285714285714, 'f': 0.03299265234009121}, 'rouge-l': {'r': 0.13870466964729258, 'p': 0.23305466044444986, 'f': 0.1652840801373595}}
