In [None]:
# Google Colab connection: mount the user's Google Drive inside the runtime
# at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Clone the MemSum repository into the current working directory.
!git clone https://github.com/nianlonggu/MemSum.git
# Source: https://github.com/nianlonggu/MemSum

In [None]:
import os
# Make the cloned checkout the working directory; the relative paths used by
# later cells (e.g. "model/glove", requirements.txt) are resolved against it.
# NOTE(review): the path is relative, so re-running this cell from inside
# MemSum/ will look for MemSum/MemSum.
os.chdir("MemSum")

In [None]:
# Install the packages listed in the checkout's requirements.txt (quiet mode).
!pip install -r requirements.txt -q

In [None]:
!pip install nltk
import nltk
from nltk import tokenize
nltk.download('punkt')

In [None]:
#CSV 
import csv 
import re

import pickle
import os
import sys
csv.field_size_limit(sys.maxsize)

from src.data_preprocessing.MemSum.utils import greedy_extract
import json
from tqdm import tqdm

In [None]:
from summarizers import MemSum
from tqdm import tqdm
from rouge_score import rouge_scorer
import json
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
import deepl
# DeepL client used by add_translation (https://github.com/DeepLcom/deepl-python).
# The key is taken from the DEEPL_AUTH_KEY environment variable so that it does
# not have to be stored in the notebook; the "TODO" placeholder is kept as the
# fallback and can still be replaced by hand.
translator = deepl.Translator(os.environ.get("DEEPL_AUTH_KEY", "TODO"))

In [None]:
#two methods to read and save CSV files 
def readCSVFile(Path, encoding, incl_header = False, delimiter = ";"):
  """Read a delimiter-separated file and return its rows.

  Args:
    Path: path of the file to read.
    encoding: text encoding used to open the file.
    incl_header: when False (default) the first row is skipped; when True it
      is returned as the first element of the result.
    delimiter: field separator, ';' by default.

  Returns:
    A list of rows, each row being a list of strings. An empty file yields
    an empty list.
  """
  # newline='' is what the csv module expects: it lets the reader handle line
  # endings itself, so newlines embedded in quoted fields are kept intact.
  with open(Path, 'r', encoding=encoding, newline='') as file:
    reader = csv.reader(file, delimiter=delimiter)
    if not incl_header:
      # The None default makes an empty file return [] instead of raising
      # StopIteration.
      next(reader, None)
    return list(reader)

def saveCSVFile(filename, array):
  with open(filename, 'w', encoding='utf-8-sig') as file: 
      write = csv.writer(file, delimiter=';') 
      write.writerows(array) 

In [None]:
def preprocessing_datasets(dataset_train_path, test_size = 0.1, random_state = None):
  """Load a training CSV and split it into train and validation records.

  The file is read with readCSVFile (UTF-8, header skipped). Column 1 of each
  row is treated as the document text and column 2 as its summary; both are
  split into sentences with NLTK's sent_tokenize.

  Args:
    dataset_train_path: path of the CSV file to load.
    test_size: share of the rows put into the validation split, forwarded to
      train_test_split. Defaults to 0.1.
    random_state: seed forwarded to train_test_split. The default None keeps
      the split random on every call; pass an int for a reproducible split.

  Returns:
    A tuple (train_data, val_data) of lists of dicts with the keys "text" and
    "summary", each holding a list of sentences.
  """
  def to_record(row):
    # One record per CSV row: sentence lists for the document and its summary.
    return {
      "text": tokenize.sent_tokenize(row[1]),
      "summary": tokenize.sent_tokenize(row[2]),
    }

  rows = readCSVFile(dataset_train_path, 'utf-8')
  #split train - validation
  train_rows, val_rows = train_test_split(rows, test_size=test_size, random_state=random_state)

  train_data = [to_record(row) for row in train_rows]
  val_data = [to_record(row) for row in val_rows]
  return train_data, val_data

def preprocessing_test_datasets(dataset_test_path):
  """Load a test CSV and turn every row into a sentence-split record.

  The file is read with readCSVFile (UTF-8, header skipped). Column 1 is
  used as the document text and column 2 as the summary.

  Returns:
    A list of dicts with the keys "text" and "summary", each a list of
    sentences produced by NLTK's sent_tokenize, in file order.
  """
  return [
    {
      "text": tokenize.sent_tokenize(record[1]),
      "summary": tokenize.sent_tokenize(record[2]),
    }
    for record in readCSVFile(dataset_test_path, 'utf-8')
  ]

In [None]:
def generate_custom_labelled_json(train_data, train_val = "train"):
  """Label every record with greedy_extract results and dump them as JSONL.

  For each record, greedy_extract (from src.data_preprocessing.MemSum.utils)
  is called with the record's "text" and "summary" and beamsearch_size=2. The
  (indices, score) pairs it yields are stored on the record under "indices"
  and "score". The records are modified in place.

  The result is written, one JSON object per line, to
  MemSum_path + "data/custom_data/<train_val>_CUSTOM_labelled.jsonl", where
  MemSum_path is a module-level variable.

  Args:
    train_data: list of dicts with the keys "text" and "summary".
    train_val: prefix of the output file name, "train" by default.
  """
  for data in train_data:
    indices_list = []
    score_list = []
    for indices, score in greedy_extract(data["text"], data["summary"], beamsearch_size = 2):
      indices_list.append(indices)
      score_list.append(score)

    data["indices"] = indices_list
    data["score"] = score_list

  output_dir = MemSum_path+"data/custom_data/"
  # Create the target folder on demand; open() would otherwise fail with
  # FileNotFoundError after all records have already been labelled.
  os.makedirs(output_dir, exist_ok=True)
  # json.dumps emits ASCII-only text by default, so the explicit encoding only
  # removes the dependency on the platform default.
  with open(output_dir+train_val+"_CUSTOM_labelled.jsonl", "w", encoding="utf-8") as f:
    for data in train_data:
      f.write(json.dumps(data) + "\n")

In [None]:
def add_translation(source_path, res_path, column_to_translate, target_language):
  """Copy a ';'-separated CSV file and append a translated column.

  The first row is treated as the header and receives the extra column name
  'generated_summary_translated'. For every other row, the cell at
  column_to_translate is translated with the module-level DeepL `translator`
  and the result is appended to the row.

  Args:
    source_path: CSV file to read.
    res_path: CSV file to create or overwrite.
    column_to_translate: zero-based index of the cell to translate.
    target_language: language code passed to DeepL as target_lang.
  """
  # 'utf-8-sig' matches what saveCSVFile writes: the BOM is stripped on read
  # instead of ending up in the first header cell, and the copy gets the same
  # encoding regardless of the platform default. newline='' leaves line-ending
  # handling to the csv module.
  with open(source_path, 'r', encoding='utf-8-sig', newline='') as input_file, \
       open(res_path, 'w', encoding='utf-8-sig', newline='') as output_file:
    reader = csv.reader(input_file, delimiter=';')
    writer = csv.writer(output_file, lineterminator='\n', delimiter=';')

    header = next(reader, None)
    if header is None:
      # Empty input: nothing to translate, leave an empty output file.
      return
    writer.writerow(header + ['generated_summary_translated'])

    # Rows are written as soon as they are translated, so an API error halfway
    # through does not discard the translations already obtained.
    for row in reader:
      text = row[column_to_translate]
      if text.strip():
        # str() yields the translated text of the returned result object,
        # which is what the csv writer stored implicitly before.
        translated_summary = str(translator.translate_text(text, target_lang=target_language))
      else:
        # Nothing to translate; skip the API call (presumably the client
        # rejects empty text - TODO confirm for the installed version).
        translated_summary = ''
      writer.writerow(row + [translated_summary])

In [None]:
def generate_summary(memsum_custom_data, max_sentences, text, p_stop_thres = 0.6):
  """Extract a summary of `text` and return it in document order.

  Args:
    memsum_custom_data: model object exposing
      extract(documents, p_stop_thres=..., max_extracted_sentences_per_document=...).
    max_sentences: upper bound on the number of extracted sentences; an int
      or a numeric string (the parameter cell stores it as '12').
    text: raw document text; it is split into sentences with sent_tokenize.
    p_stop_thres: stop threshold forwarded to extract, 0.6 by default.

  Returns:
    The extracted sentences joined with newlines, ordered by their position
    in the document. An empty string when the text contains no sentence.
  """
  sentences = tokenize.sent_tokenize(text)
  if not sentences:
    # Nothing to summarise; do not hand an empty document to the model.
    return ""

  extracted_summary_sentences = memsum_custom_data.extract(
      [sentences],
      p_stop_thres = p_stop_thres,
      max_extracted_sentences_per_document = int(max_sentences))[0]

  # Order by sentence position instead of text.index(): a substring search
  # misplaces a sentence whose wording also occurs inside an earlier, longer
  # sentence and raises ValueError if a returned sentence is not a literal
  # substring of the text.
  first_position = {}
  for position, sentence in enumerate(sentences):
    first_position.setdefault(sentence, position)
  ordered_sentences = sorted(
      extracted_summary_sentences,
      key=lambda sentence: first_position.get(sentence, len(sentences)))
  return "\n".join(ordered_sentences)
  
def evaluate(model, corpus, p_stop, max_extracted_sentences, rouge_cal ):
    """Average ROUGE F-measures of the model's extractions over a corpus.

    Each corpus entry is a dict with a "text" and a "summary" list of
    sentences. The model extracts from one document at a time; extraction and
    reference are each joined with newlines and scored with
    rouge_cal.score(reference, extraction).

    Returns:
        The column-wise mean, as a numpy array, of the rouge1, rouge2 and
        rougeLsum F-measures (in that order).
    """
    metric_names = ("rouge1", "rouge2", "rougeLsum")
    per_document = []
    for example in tqdm(corpus):
        reference = example["summary"]
        prediction = model.extract(
            [example["text"]],
            p_stop_thres = p_stop,
            max_extracted_sentences_per_document = max_extracted_sentences,
        )[0]
        rouge = rouge_cal.score("\n".join(reference), "\n".join(prediction))
        per_document.append([rouge[name].fmeasure for name in metric_names])
    return np.asarray(per_document).mean(axis = 0)

In [None]:
#used to download the glove embedding (200dim) used in MemSum, with three addition token embeddings for bos eos pad, e
#https://github.com/nianlonggu/MemSum
!pip install gdown -q
try:
    os.system("rm -r model")
    os.makedirs("model/")
except:
    pass
!cd model/; gdown --folder https://drive.google.com/drive/folders/1lrwYrrM3h0-9fwWCOmpRkydvmF6hmvmW


if not os.path.exists("model/glove"):
    try:
        os.makedirs("model/glove")
        os.system("mv model/*.pkl model/glove/")
    except:
        pass

In [None]:
## parameters
# Prefix of every MemSum repository path built in this notebook. It is used by
# the labelling, training and generation cells; "./" points at the current
# working directory, i.e. the checkout the notebook has changed into.
MemSum_path = "./"
# Prefix of the dataset CSV files. It is concatenated directly with the file
# name (no separator is inserted), so a folder path must end with "/".
dataset_path = "English" #path need to be changed
# The training options are kept as strings because they are concatenated into
# the train.py command line.
max_doc_len = '4096'
max_seq_len = '512'
num_of_epochs = '10'
save_every = '1000'
n_device = '2'
batch_size_per_device = '1'
max_extracted_sentences_per_document = '12'
moving_average_decay = '0.999'
p_stop_thres  = '0.6'

In [None]:
#training for each fold
# For folds 1..5: build the labelled train/val JSONL files from the fold's
# training CSV, then run MemSum's train.py on them with the parameters above.
for fold_id in range(1,6):  # fold_id rather than id, which shadows the builtin
  fold = str(fold_id)
  dataset_train_path = dataset_path+"English_Ext_Train_hEn_to_De_Step_"+fold+".csv"
  train_data, val_data =  preprocessing_datasets(dataset_train_path)
  generate_custom_labelled_json(train_data)
  generate_custom_labelled_json(val_data, "val")
  train_command = " ".join(str(part) for part in [
      "python", MemSum_path+"src/MemSum_Full/train.py",
      "-training_corpus_file_name", MemSum_path+"data/custom_data/train_CUSTOM_labelled.jsonl",
      "-validation_corpus_file_name", MemSum_path+"data/custom_data/val_CUSTOM_labelled.jsonl",
      "-model_folder", MemSum_path+"model/MemSum_Full/custom_data/200dim/run0/"+fold+"/",
      "-log_folder", MemSum_path+"model/log/MemSum_Full/custom_data/200dim/run0/"+fold+"/",
      "-vocabulary_file_name", MemSum_path+"model/glove/vocabulary_200dim.pkl",
      "-pretrained_unigram_embeddings_file_name", MemSum_path+"model/glove/unigram_embeddings_200dim.pkl",
      "-max_seq_len", max_seq_len,
      "-max_doc_len", max_doc_len,
      "-num_of_epochs", num_of_epochs,
      "-save_every", save_every,
      "-n_device", n_device,
      "-batch_size_per_device", batch_size_per_device,
      "-max_extracted_sentences_per_document", max_extracted_sentences_per_document,
      "-moving_average_decay", moving_average_decay,
      "-p_stop_thres", p_stop_thres,
  ])
  # os.system only returns the exit status; without this check a crashed
  # training run would pass silently and surface later as a missing checkpoint.
  exit_status = os.system(train_command)
  if exit_status != 0:
    raise RuntimeError("MemSum training failed for fold "+fold+" (exit status "+str(exit_status)+")")



In [None]:
#text generation for each fold
# For folds 1..5: load the fold's checkpoint, write a CSV with the generated
# summary of every validation row and print the mean ROUGE scores.
# The scorer does not depend on the fold, so it is built once.
rouge_cal = rouge_scorer.RougeScorer(['rouge1','rouge2', 'rougeLsum'], use_stemmer=True)
for fold_id in range(1,6):  # fold_id rather than id, which shadows the builtin
  fold = str(fold_id)
  # NOTE(review): the checkpoint name is hard-coded; confirm that training
  # actually writes model_batch_990.pt for the configured save_every.
  run_path = MemSum_path+'model/MemSum_Full/custom_data/200dim/run0/'+fold+'/model_batch_990.pt'
  print(run_path)
  res_path = '/Generated_summaries/summ_eval_step_'+fold+'.csv'
  # Make sure the output folder exists before spending time on generation.
  os.makedirs(os.path.dirname(res_path), exist_ok=True)
  dataset_test_path = dataset_path+"English_Ext_Val_hEn_to_De_Step_"+fold+".csv"
  data = readCSVFile(dataset_test_path, 'utf-8')
  test_data =  preprocessing_test_datasets(dataset_test_path)
  # NOTE(review): max_doc_len is 512 here while training is configured with
  # max_doc_len '4096' - confirm this difference is intended.
  memsum_custom_data = MemSum(run_path,
                              MemSum_path+"model/glove/vocabulary_200dim.pkl",
                              gpu = 0 ,  max_doc_len = 512)
  # The parameter cell stores the limit as a string for the command line; the
  # extractor needs a number.
  max_summary_sentences = int(max_extracted_sentences_per_document)
  res = [["id","text", "reference_summary_sl", "reference_summary_ol", "generated_summary"]]
  for row in data:
    res.append([row[0], row[1], row[2], row[3], generate_summary(memsum_custom_data, max_summary_sentences, row[1])])

  saveCSVFile(res_path, res)
  # NOTE(review): evaluation extracts at most 7 sentences while generation
  # above uses the configured limit - confirm this difference is intended.
  print(evaluate(memsum_custom_data, test_data, 0.6, 7, rouge_cal))

In [None]:
#add translation
# Translate the generated summaries of every fold into German. Column 4 is the
# "generated_summary" column of the files written by the generation cell.
for i in range(1,6):
  # Read from the folder the generation cell writes to
  # ('/Generated_summaries/'); the previous '/summ_eval_step_<i>.csv' path
  # pointed at a different location.
  source_path ="/Generated_summaries/summ_eval_step_"+str(i)+".csv"
  res_path = source_path.replace(".csv", "_with_translation.csv")
  add_translation(source_path, res_path, 4, "DE" )