# We start by importing all relevant libraries

In [None]:
import re
import nltk.data
from nltk.probability import FreqDist
from nltk.tokenize import WordPunctTokenizer,sent_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import glob
import codecs
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
from unidecode import unidecode
from joblib import Parallel,delayed


In [None]:
def remove_non_ascii(text):
    """This function removes all non-ascii characters from text and replaces them with their closest ascii representation"""
    return unidecode(unicode(text, encoding = "utf-8"))

In [None]:
# we expand the set of stopwords in English to eliminate some very common words in papers, such as Fig, Table, Equation, Eq
stop_words = list(set(stopwords.words('english')))
stop_words.extend(['Fig','Fig.','Table','table','Equation','equation','Eq.','Eq','Figure','figure','below','follows'])

# We then proceed with one example before preprocessing the entire base

In [None]:
paper_list = pd.Series(sorted(glob.glob('/home/joao/Thesis/test_set/abridged/*.txt')))

In [None]:
paper = paper_list[0]

In [None]:
# we then start tokenizing the text:
# we then remove all non-ascii characters
def get_ts_matrix(paper):
    #opening example file
    with open(paper,'rb') as f:
        raw_text = f.read()
    # removing all non-unicode characters: 
    raw_text = remove_non_ascii(raw_text)
    # we start by removing all citations and things between parenthesis:
    raw_text = re.sub(r'\([^)]*\)', '',raw_text)
    this_file = paper[:-4].split('/')[-1]
    sentences = sent_tokenize((raw_text))
    tokenized = []
    useful_sentences = []
    for i in range(len(sentences)):
        if(any(c.isalpha() for c in sentences[i])):
            tokenized.append(WordPunctTokenizer().tokenize(sentences[i].strip()))
            useful_sentences.append(sentences[i].strip())
    sentences = useful_sentences

    # We then create the term-sentence matrix from the cleaned up-text
    sentences_df = pd.DataFrame(sentences)
    sentences_df['filename'] = this_file
    sentences_df['sentence_order'] = sentences_df.index.values
    sentences_df.columns = ['sentence','filename','sentence_order']
    full_tokenized = []
    for i in tokenized:
        full_tokenized.extend(i)

    ts_matrix = pd.DataFrame(columns = sorted(set(full_tokenized)))
    accumulator = []
    accumulator.append(ts_matrix)
    for i in range(len(tokenized)):
        fdist = FreqDist(tokenized[i])
        tmp = pd.DataFrame.from_dict(dict(fdist), orient = 'index')
        tmp1 = tmp.reset_index()
        tmp1.columns = ['word','count']
        tmp1 = pd.pivot_table(tmp1, columns = 'word')
        tmp1.index = [i]
        accumulator.append(tmp1)
    ts_matrix = pd.concat(accumulator, ignore_index = True)

    # we then remove the stopwords:

    stop_words = list(set(stopwords.words('english')))
    stop_words.extend(['Fig','Fig.','fig','eq','Table','table','Equation','equation','Eq.','Eq','Figure','figure','below','follows'])   
    ts_matrix = ts_matrix[ts_matrix.columns[~ts_matrix.columns.str.lower().isin(stop_words)]]
    #adding the file for reference:
    ts_matrix['this_file_name'] = this_file
    ts_matrix['sentence_order'] = ts_matrix.index.values
    ts_matrix = ts_matrix[list(ts_matrix.columns[ts_matrix.columns.str.isalpha()])+['this_file_name','sentence_order']]
    rows_to_drop = ts_matrix.index[ts_matrix.drop(columns = ['this_file_name','sentence_order']).fillna(0).sum(axis =1) == 0]
    sentence_bank = sentences_df.drop(index = rows_to_drop).reset_index(drop = True)
    ts_matrix = ts_matrix.drop(index=  rows_to_drop).reset_index(drop = True)
    # we finally eliminate all 2 letter words to avoid the influence of units of measure in the summarization:
    ts_matrix = ts_matrix[ts_matrix.columns[ts_matrix.columns.str.len()>2]]
    word_count = sentence_bank.sentence.str.split(' ').str.len()
    ts_matrix_file = '/home/joao/Thesis/test_set/ts_matrices/' + this_file + '.p'
    sentence_bank_file = '/home/joao/Thesis/test_set/sentence_banks/'+ this_file+ '.p'
    word_counts = []
    for i in sentence_bank.sentence:
        word_counts.append(len(WordPunctTokenizer().tokenize(i.strip())))
    ts_matrix['word_count'] = word_counts
    # eliminating empty rows:
    empty_rows = ts_matrix[ts_matrix.drop(columns = ['word_count','sentence_order','this_file_name']).sum(axis = 1) == 0].index
    ts_matrix.drop(index = empty_rows, inplace = True)
    sentence_bank.drop(index = empty_rows, inplace = True)
    ts_matrix.reset_index(inplace = True, drop = True)
    sentence_bank.reset_index(inplace = True, drop = True)
    ts_matrix.to_pickle(ts_matrix_file)
    sentence_bank.to_pickle(sentence_bank_file)
    return 0

# Paralellizing the execution of the transformations:

In [None]:
paper_list = pd.Series(sorted(glob.glob('/home/joao/Thesis/test_set/abridged/*.txt')))


In [None]:
k = Parallel(n_jobs = 8, verbose = 11)(delayed(get_ts_matrix)(paper) for paper in paper_list)

Execution time : 43.8s

# modifying the function to get a tf_matrix for the titles

In [None]:
import re
import nltk.data
from nltk.probability import FreqDist
from nltk.tokenize import WordPunctTokenizer,sent_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import glob
import codecs
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
from unidecode import unidecode
from joblib import Parallel,delayed

def remove_non_ascii(text):
    """This function removes all non-ascii characters from text and replaces them with their closest ascii representation"""
    return unidecode(unicode(text, encoding = "utf-8"))

paper_list = pd.Series(sorted(glob.glob('/home/joao/Thesis/test_set/abridged/*.txt')))
paper_list[0].split('/')[-1][:-4]

In [None]:
# we then start tokenizing the text:
# we then remove all non-ascii characters
def get_ts_titles(paper):
    #opening example file
    raw_text = paper.split('/')[-1][:-4]
    # removing all non-unicode characters: 
    raw_text = remove_non_ascii(raw_text)
    # we start by removing all citations and things between parenthesis:
    raw_text = re.sub(r'\([^)]*\)', '',raw_text)
    this_file = paper[:-4].split('/')[-1]
    sentences = sent_tokenize((raw_text))
    tokenized = []
    useful_sentences = []
    for i in range(len(sentences)):
        if(any(c.isalpha() for c in sentences[i])):
            tokenized.append(WordPunctTokenizer().tokenize(sentences[i].strip()))
            useful_sentences.append(sentences[i].strip())
    sentences = useful_sentences

    # We then create the term-sentence matrix from the cleaned up-text
    sentences_df = pd.DataFrame(sentences)
    sentences_df['filename'] = this_file
    sentences_df['sentence_order'] = sentences_df.index.values
    sentences_df.columns = ['sentence','filename','sentence_order']
    full_tokenized = []
    for i in tokenized:
        full_tokenized.extend(i)

    ts_matrix = pd.DataFrame(columns = sorted(set(full_tokenized)))
    accumulator = []
    accumulator.append(ts_matrix)
    for i in range(len(tokenized)):
        fdist = FreqDist(tokenized[i])
        tmp = pd.DataFrame.from_dict(dict(fdist), orient = 'index')
        tmp1 = tmp.reset_index()
        tmp1.columns = ['word','count']
        tmp1 = pd.pivot_table(tmp1, columns = 'word')
        tmp1.index = [i]
        accumulator.append(tmp1)
    ts_matrix = pd.concat(accumulator, ignore_index = True)
    # we then remove the stopwords:

    stop_words = list(set(stopwords.words('english')))
    stop_words.extend(['Fig','Fig.','fig','eq','Table','table','Equation','equation','Eq.','Eq','Figure','figure','below','follows'])   
    ts_matrix = ts_matrix[ts_matrix.columns[~ts_matrix.columns.str.lower().isin(stop_words)]]
    #adding the file for reference:
    ts_matrix['this_file_name'] = this_file
    ts_matrix['sentence_order'] = ts_matrix.index.values
    ts_matrix = ts_matrix[list(ts_matrix.columns[ts_matrix.columns.str.isalpha()])+['this_file_name','sentence_order']]
    rows_to_drop = ts_matrix.index[ts_matrix.drop(columns = ['this_file_name','sentence_order']).fillna(0).sum(axis =1) == 0]
    sentence_bank = sentences_df.drop(index = rows_to_drop).reset_index(drop = True)
    ts_matrix = ts_matrix.drop(index=  rows_to_drop).reset_index(drop = True)
    # we finally eliminate all 2 letter words to avoid the influence of units of measure in the summarization:
    ts_matrix = ts_matrix[ts_matrix.columns[ts_matrix.columns.str.len()>2]]
    word_count = sentence_bank.sentence.str.split(' ').str.len()
    ts_matrix_file = '/home/joao/Thesis/test_set/titles/' + this_file + '.p'
    ts_matrix.to_pickle(ts_matrix_file)
    return 0

In [None]:
k = Parallel(n_jobs = 8, verbose = 11)(delayed(get_ts_titles)(paper) for paper in paper_list)

# Characterizing the datasets:

In [None]:
import pandas as pd 
import glob
import numpy as np

ts_matrix_list = sorted(glob.glob('/home/joao/Thesis/test_set/ts_matrices/*.p'))

In [None]:
sentences = []
avg_word_count = []
total_words = []
for i in ts_matrix_list:
    tmp = pd.read_pickle(i)
    sentences.append(tmp.shape[0])
    avg_word_count.append(tmp.word_count.mean())
    total_words.append(tmp.word_count.sum())

In [None]:
print(np.mean(sentences),np.std(sentences,ddof = 1),np.mean(avg_word_count),np.std(avg_word_count,ddof = 1),np.mean(total_words),np.std(total_words,ddof = 1))

(109.18840579710145, 54.90292914138093, 22.93907341185208, 2.571579809536739, 2475.086956521739, 1213.5620968519086)

# Benchmarking sanity metrics

In [7]:
from rouge import Rouge,FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
from unidecode import unidecode

# we start by creating the summaries, i.e. picking the first k sentences until we have 250 words

sentence_banks = sorted(glob.glob('/home/joao/Thesis/test_set/sentence_banks/*.p'))

abstracts = []
filenames = []

for i in sentence_banks:
    filenames.append(i.split('/')[-1])
    sentence_bank = pd.read_pickle(i)
    abstract = ''
    for j in sentence_bank.sentence:
        if(len((abstract+j).split(' '))<250):
            abstract +=j
        else:
            break
    abstracts.append(abstract)
final_df = pd.DataFrame({'filenames':filenames,'abstracts':abstracts})

In [8]:
results_df = final_df.copy()
def remove_non_ascii(text):
    """This function removes all non-ascii characters from
    text and replaces them with their closest ascii representation"""
    return unidecode(unicode(text, encoding = "utf-8"))
# we then load all summaries and candidate summaries:

total_scores = []
scores = []
r1 = []
r2 = []
rl = []

for i in tqdm(results_df.index):
    ground_truth = ('/home/joao/Thesis/test_set/abstracts/ground_truths/'+ 
                    results_df.loc[i,'filenames'][:-2]+'.txt')
    rouge = Rouge()
    with open(ground_truth,'rb') as f:
        ground_truth = f.read()
    ground_truth = remove_non_ascii(ground_truth)
    tmp_scores = rouge.get_scores(results_df.loc[i,'abstracts'],
                                  ground_truth, avg = True)
    r2.append(tmp_scores['rouge-2']['f'])
    r1.append(tmp_scores['rouge-1']['f'])
    rl.append(tmp_scores['rouge-l']['f'])

100%|██████████| 69/69 [00:02<00:00, 25.22it/s]


In [9]:
print('r1',np.mean(r1),np.std(r1,ddof = 1))
print('r2',np.mean(r2),np.std(r2, ddof = 1))
print('rl',np.mean(rl),np.std(rl, ddof = 1))

('r1', 0.2798802914809339, 0.0640168587924369)
('r2', 0.07973435897110052, 0.04825397586421055)
('rl', 0.23666681693479602, 0.06376390119510343)


# Randomly selecting Sentences:

In [19]:
from rouge import Rouge,FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
from unidecode import unidecode
from tqdm import tqdm
from joblib import Parallel,delayed

def remove_non_ascii(text):
    """This function removes all non-ascii characters from
    text and replaces them with their closest ascii representation"""
    return unidecode(unicode(text, encoding = "utf-8"))
    

# we start by creating the summaries, i.e. picking the first k sentences until we have 250 words

sentence_banks = sorted(glob.glob('/home/joao/Thesis/test_set/sentence_banks/*.p'))



R1 = []
R2 = []
RL = []
    
def evaluate_random_abstracts(k):
    abstracts = []
    filenames = []
    for i in sentence_banks:
        filenames.append(i.split('/')[-1])
        sentence_bank = pd.read_pickle(i)
        sentence_bank = sentence_bank.sample(frac = 1)
        abstract = ''
        for j in sentence_bank.sentence:
            if(len((abstract+j).split(' '))<250):
                abstract +=j
            else:
                break
        abstracts.append(abstract)
    final_df = pd.DataFrame({'filenames':filenames,'abstracts':abstracts})
    results_df = final_df.copy()

    # we then load all summaries and candidate summaries:

    total_scores = []
    scores = []
    r1 = []
    r2 = []
    rl = []

    for i in results_df.index:
        ground_truth = ('/home/joao/Thesis/test_set/abstracts/ground_truths/'+ 
                        results_df.loc[i,'filenames'][:-2]+'.txt')
        rouge = Rouge()
        with open(ground_truth,'rb') as f:
            ground_truth = f.read()
        ground_truth = remove_non_ascii(ground_truth)
        tmp_scores = rouge.get_scores(results_df.loc[i,'abstracts'],
                                      ground_truth, avg = True)
        r2.append(tmp_scores['rouge-2']['f'])
        r1.append(tmp_scores['rouge-1']['f'])
        rl.append(tmp_scores['rouge-l']['f'])
    return np.mean(r1),np.std(r1,ddof = 1),np.mean(r2),np.std(r2, ddof = 1),np.mean(rl),np.std(rl, ddof = 1)

final_results = Parallel(n_jobs = -1, verbose = 11)(delayed(evaluate_random_abstracts)(k) for k in range(200))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:  1.9min
[Paralle

In [20]:
r1 = []
r1_std = []
r2 = []
r2_std = []
rl = []
rl_std = []
for i in final_results:
    r1.append(i[0])
    r1_std.append(i[1])
    r2.append(i[2])
    r2_std.append(i[3])
    rl.append(i[4])
    rl_std.append(i[5])

In [21]:
print('r1',np.mean(r1),np.mean(r1_std))
print('r2',np.mean(r2),np.mean(r2_std))
print('rl',np.mean(rl),np.mean(rl_std))

('r1', 0.31365782578632995, 0.05878268006109875)
('r2', 0.09443756653738461, 0.04765202054540724)
('rl', 0.2660036938788816, 0.06054348730727893)
