# We start by importing all the relevant libraries

In [None]:
import pandas as pd 
import numpy as np
import glob
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('averaged_perceptron_tagger')
from gurobipy import *
from joblib import Parallel,delayed

# We then load an example file

In [None]:
# Sorted paths of every '*.p' file in the matrices folder, wrapped in a Series
# so that a single file can be picked by position in the next cell.
ts_matrices = pd.Series(sorted(glob.glob('/home/joao/Thesis/ts_matrices_original/*.p')))

In [None]:
# The first file (in sorted order) is the worked example for the cells below.
ts_matrix_file = ts_matrices[0]

In [None]:
# Load the pickled matrix of the example file.
# NOTE(review): pickles can execute code on load - only read trusted files.
ts_matrix = pd.read_pickle(ts_matrix_file)

In [None]:
# Display the dimensions of the loaded matrix as a quick sanity check.
ts_matrix.shape

In [None]:
# We also read the titles dataframe: it is stored under the same file name as
# the matrix, inside the titles folder.
title_file = '/home/joao/Thesis/titles/'+ ts_matrix_file.split('/')[-1]
title = pd.read_pickle(title_file)


# We start by computing the initial hub and authority scores:

In [None]:
# In order to compare the similarity of the sentences to that of the title
# we must put them in the same vector space (i.e. for the sake of comparison
# we temporarily extend both dataframes to the same space).
tmp1 = title.drop(columns = ['this_file_name','sentence_order']).copy()
tmp2 = ts_matrix.drop(columns = ['this_file_name','sentence_order',
                                 'word_count']).copy()
# boolean matrix: True wherever the stored value is positive
connectivity_matrix = ts_matrix.drop(columns = ['this_file_name',
                                                'sentence_order',
                                                'word_count']) > 0
# add, filled with zeros, the columns that each dataframe is missing
for i in tmp2.columns[~tmp2.columns.isin(tmp1.columns)]:
    tmp1[i] = 0
for i in tmp1.columns[~tmp1.columns.isin(tmp2.columns)]:
    tmp2[i] = 0

# we must then tag each of the components and consider only nouns
result = nltk.pos_tag(tmp1.columns)

word = []
pos_tag = []
for i in result:
    word.append(i[0])
    pos_tag.append(i[1])
pos_tags = pd.DataFrame({'pos_tag':pos_tag,'word':word})
# keep the words whose tag starts with 'N' (the noun tags)
useful_tags = pos_tags.word[pos_tags.pos_tag.str.startswith('N')].values
tmp1 = tmp1.loc[:,useful_tags]
tmp2 = tmp2.loc[:,useful_tags]
# we then put them both in the same order:
tmp1 = tmp1[sorted(tmp1.columns)]
tmp2 = tmp2[sorted(tmp1.columns)]


# collapse all title rows into a single vector
full_title = tmp1.fillna(0).sum(axis = 0)
tmp2 = tmp2.fillna(0)
# we then calculate the similarities of the hubs (sentences) to the title;
# the Series is reshaped through its underlying array because
# Series.reshape is deprecated in pandas
hub_ranks = 1 + cosine_similarity(tmp2,full_title.values.reshape(1,-1))
# we then calculate the authorities rank: one, plus the number of sentences
# that contain the term, plus one more if the term is a column of the title
authority_ranks = 1 + connectivity_matrix.sum(axis = 0
                ) +connectivity_matrix.columns.isin(title.columns)

In [None]:
# we then apply the HITS algorithm
hubs_matrix = np.matmul(connectivity_matrix,
                        connectivity_matrix.transpose())
authorities_matrix = np.matmul(connectivity_matrix.transpose(),
                               connectivity_matrix)

# Power iteration: keep going while the scores still move by more than the
# tolerance AND the iteration cap has not been reached. (The cap used to be
# combined with "or counter > 100000", which never stopped the loop and made
# it endless once the counter passed the cap.)
max_diff = 10000
counter = 1
while max_diff > 0.000000000001 and counter <= 100000:
    old_hub_ranks = hub_ranks.copy()
    hub_ranks = np.matmul(hubs_matrix,hub_ranks)
    hub_ranks = hub_ranks/np.linalg.norm(hub_ranks)
    old_authority_ranks = authority_ranks.copy()
    authority_ranks= np.matmul(authorities_matrix,authority_ranks)
    authority_ranks = authority_ranks/np.linalg.norm(authority_ranks)
    auth_diff = np.linalg.norm(authority_ranks - old_authority_ranks)
    hub_diff = np.linalg.norm(hub_ranks - old_hub_ranks)
    # largest change of either score vector in this iteration
    max_diff = max(hub_diff,auth_diff)
    counter +=1


# we then build the projection of the graph

In [None]:
# projection[i, j] (for i < j) is the number of columns that are True in both
# row i and row j of the connectivity matrix; the diagonal and the lower
# triangle stay at zero. A single matrix product replaces the former
# Python-level double loop and works by position, so it no longer requires
# the row labels to be exactly 0..n-1.
conn_counts = connectivity_matrix.values.astype(int)
projection = np.triu(conn_counts.dot(conn_counts.T), k = 1).astype(float)

In [None]:
# number of later sentences each sentence is linked to in the projection
outdegrees = (projection > 0).sum(axis = 1)
# descending positions n-1, n-2, ..., 0 as floats
position = np.arange(ts_matrix.shape[0] - 1, -1, -1).astype(float)
# scale every outdegree by its (position + 1)
outdegrees = outdegrees / (position + 1)
outdegrees

In [None]:
# we then declare the optimization model:

summarizer = Model('summarizer')

# one binary variable per sentence (1 = the sentence is in the summary)
var_names = 'sentence_' + connectivity_matrix.index.astype(str)
portvars_x = [summarizer.addVar(vtype = "B",
                                name = symb) for symb in var_names]
portvars_x = pd.Series(portvars_x, index = var_names)
portfolio_x = pd.DataFrame({'Variables':portvars_x})
# one binary variable per entity (column of the connectivity matrix)
entities = connectivity_matrix.columns
portvars_y = [summarizer.addVar(vtype = "B",
                                name = symb) for symb in entities]
portvars_y = pd.Series(portvars_y, index = entities)
portfolio_y = pd.DataFrame({'Variables':portvars_y})

summarizer.update()

# total word count of the selected sentences
length = portvars_x.dot(ts_matrix.word_count.values)
# we create a similar connectivity_matrix and rename its axis:
new_conn = connectivity_matrix.copy()
new_conn.index = portvars_x.index

# consistency constraints on y: selecting a sentence forces the variables of
# all the entities it contains to 1
for i in new_conn.index:
    this_line = new_conn.loc[i,:]
    total_entities = (this_line >0).sum()
    local_entities = this_line[this_line > 0].index
    # a sentence without entities adds no constraint (same guard as in
    # summarize_text)
    if(local_entities.size != 0):
        temp_sum = portvars_y[local_entities].sum()
        summarizer.addConstr(temp_sum >= total_entities*portvars_x[i])
# consistency constraints on x: an entity can only be switched on if at least
# one sentence containing it is selected
for i in new_conn.columns:
    this_column = new_conn.loc[:,i]
    total_sentences = this_column.index[this_column > 0]
    temp_sum = portvars_x[total_sentences].sum()
    summarizer.addConstr(temp_sum >= portvars_y[i])
word_limit = 400

# adding the final constraint on summary size; the limit is taken from
# word_limit instead of repeating the number
summarizer.addConstr(length <= word_limit)

# defining the objective function:
coverage = portvars_x.dot(hub_ranks)
coherence = portvars_x.dot(outdegrees)
diversity = portvars_y.sum()/portvars_y.shape[0]

# coverage comes from a column vector, so the sum is a one-element array
summarizer.setObjective((coverage + coherence + diversity)[0],
                        GRB.MAXIMIZE)

summarizer.update()

In [None]:
# Solve the model built in the previous cell.
summarizer.optimize()

In [None]:
# read back the solution value of every sentence variable
selected_sentences = pd.Series([var.getAttr('x') for var in portvars_x])

# the sentence bank shares the matrix's file name and holds the sentence texts
sentence_bank_file = ('/home/joao/Thesis/sentence_bank/'+
                        ts_matrix_file.split('/')[-1])

sentence_bank = pd.read_pickle(sentence_bank_file)

# keep the text of the sentences whose variable is switched on
abstract = sentence_bank.loc[selected_sentences > 0,'sentence']

# the selected sentences are glued together without any separator
final_abstract = ''.join(abstract)

In [None]:
# let's score it!
from rouge import Rouge,FilesRouge

# paths of every reference summary found on disk
true_summaries = pd.Series(sorted(glob.glob(
    '/home/joao/Thesis/simplified_abstracts/*')))

total_scores = []
this_file = ts_matrix_file.split('/')[-1]
# characters 1-3 of the piece of the file name just before the extension
this_file_num = this_file.split('.')[-2][1:4]
# references whose path holds those same three characters at [-7:-4]
ground_truths = true_summaries[true_summaries.str[-7:-4] ==
                               this_file_num]
scores = []
rouge = Rouge()
auto_summary = final_abstract
for j in ground_truths:
    with open(j,'rb') as f:
        ground_truth = f.read()
    # the file is no longer needed once its contents have been read
    tmp_scores = rouge.get_scores(auto_summary, ground_truth, avg = True)
    scores.append(tmp_scores['rouge-2']['p'])
# average ROUGE-2 precision over all the references of this document
total_scores.append(np.mean(scores))

np.mean(scores)

# Now turning it into a function and running it for the entire database:

In [None]:
## relevant imports 
import pandas as pd 
import numpy as np
import glob
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('averaged_perceptron_tagger')
from gurobipy import *
from joblib import Parallel,delayed

In [None]:
def summarize_text(ts_matrix_file,
                   titles_dir = '/home/joao/Thesis/titles/',
                   sentence_bank_dir = '/home/joao/Thesis/sentence_bank/',
                   word_limit = 400):
    """Build an extractive summary for one document.

    The function reads three pickles that share one file name: the matrix at
    ``ts_matrix_file``, the title dataframe in ``titles_dir`` and the
    sentence bank in ``sentence_bank_dir``. Only noun columns are kept and
    sentences without any noun are dropped. The remaining sentences get a
    HITS hub score (seeded with their cosine similarity to the title) and a
    position-scaled outdegree. A binary program then picks the sentences
    that maximise coverage + coherence + diversity within the word budget.

    Parameters
    ----------
    ts_matrix_file : str
        Path of the pickled matrix; its last '/'-separated piece is the file
        name looked up in the two other folders.
    titles_dir : str, optional
        Folder holding the pickled title dataframes.
    sentence_bank_dir : str, optional
        Folder holding the pickled sentence banks.
    word_limit : int, optional
        Upper bound on the summed ``word_count`` of the chosen sentences.

    Returns
    -------
    tuple
        ``(ts_matrix_file, final_abstract)`` where ``final_abstract`` is the
        text of the chosen sentences concatenated without a separator.
    """
    # local import so the function does not depend on a file-level import
    import os

    file_name = ts_matrix_file.split('/')[-1]
    ts_matrix = pd.read_pickle(ts_matrix_file)
    # we also read the titles dataframe:
    title = pd.read_pickle(os.path.join(titles_dir, file_name))

    # In order to compare the sentences with the title we must put them in
    # the same vector space, so both dataframes are temporarily extended
    # with the columns they are missing.
    tmp1 = title.drop(columns = ['this_file_name',
                                 'sentence_order']).copy()
    tmp2 = ts_matrix.drop(columns = ['this_file_name',
                                     'sentence_order',
                                     'word_count']).copy()
    # boolean matrix: True wherever the stored value is positive
    connectivity_matrix = tmp2 > 0
    for i in tmp2.columns[~tmp2.columns.isin(tmp1.columns)]:
        tmp1[i] = 0
    for i in tmp1.columns[~tmp1.columns.isin(tmp2.columns)]:
        tmp2[i] = 0

    # we must then tag each of the components and consider only nouns
    result = nltk.pos_tag(tmp1.columns)
    pos_tags = pd.DataFrame({'pos_tag': [pair[1] for pair in result],
                             'word': [pair[0] for pair in result]})
    useful_tags = pos_tags.word[
        pos_tags.pos_tag.str.startswith('N')].values

    # keep the noun columns only, then remove the rows left empty
    noun_columns = connectivity_matrix.columns[
        connectivity_matrix.columns.isin(useful_tags)]
    connectivity_matrix = connectivity_matrix.loc[:, noun_columns]
    indices_to_drop = connectivity_matrix.index[
        connectivity_matrix.sum(axis = 1) == 0]
    connectivity_matrix = connectivity_matrix.drop(index = indices_to_drop)
    # labels of the surviving sentences before renumbering
    original_indices = connectivity_matrix.index
    connectivity_matrix = connectivity_matrix.reset_index(drop = True)

    tmp1 = tmp1.loc[:, useful_tags]
    tmp2 = tmp2.loc[:, useful_tags]
    # non in-place drops: tmp2 is a slice, so in-place edits are avoided
    tmp2 = tmp2.drop(index = indices_to_drop).reset_index(drop = True)
    ts_matrix = ts_matrix.drop(
        index = indices_to_drop).reset_index(drop = True)

    # we then put them both in the same order:
    tmp1 = tmp1[sorted(tmp1.columns)]
    tmp2 = tmp2[sorted(tmp1.columns)]

    # collapse all title rows into a single vector
    full_title = tmp1.fillna(0).sum(axis = 0)
    tmp2 = tmp2.fillna(0)
    # similarity of the hubs (sentences) to the title; the Series is
    # reshaped through .values because Series.reshape is deprecated
    hub_ranks = 1 + cosine_similarity(tmp2,
                                      full_title.values.reshape(1, -1))
    # authorities rank: one, plus the number of sentences containing the
    # term, plus one more if the term is a column of the title
    authority_ranks = (1 + connectivity_matrix.sum(axis = 0) +
                       connectivity_matrix.columns.isin(title.columns)
                       ).values

    # we then apply the HITS algorithm; the operands are boolean arrays, so
    # the products are boolean too (True where rows share a True column)
    conn = connectivity_matrix.values
    hubs_matrix = np.matmul(conn, conn.T)
    authorities_matrix = np.matmul(conn.T, conn)

    tolerance = 0.000000000001
    max_iterations = 100000
    max_diff = 10000
    counter = 1
    # iterate while the scores still move AND the cap is not reached (the
    # former "or counter > 100000" could never stop the loop)
    while max_diff > tolerance and counter <= max_iterations:
        old_hub_ranks = hub_ranks.copy()
        hub_ranks = np.matmul(hubs_matrix, hub_ranks)
        hub_ranks = hub_ranks/np.linalg.norm(hub_ranks)
        old_authority_ranks = authority_ranks.copy()
        authority_ranks = np.matmul(authorities_matrix, authority_ranks)
        authority_ranks = authority_ranks/np.linalg.norm(authority_ranks)
        auth_diff = np.linalg.norm(authority_ranks - old_authority_ranks)
        hub_diff = np.linalg.norm(hub_ranks - old_hub_ranks)
        max_diff = max(hub_diff, auth_diff)
        counter += 1

    # projection[i, j] (i < j) counts the columns shared by rows i and j;
    # one matrix product replaces the former double loop
    conn_counts = conn.astype(int)
    projection = np.triu(conn_counts.dot(conn_counts.T), k = 1)

    outdegrees = (projection > 0).sum(axis = 1)
    # NOTE(review): the original labels are sorted in descending order, so
    # the first remaining sentence is divided by the largest label + 1 -
    # confirm that this weighting is the intended one.
    position = np.array(sorted(original_indices,
                               reverse = True)).astype(float)
    outdegrees = np.divide(outdegrees, position + 1)

    # we then declare the optimization model:
    summarizer = Model('summarizer')

    # one binary variable per sentence
    var_names = 'sentence_' + connectivity_matrix.index.astype(str)
    portvars_x = pd.Series(
        [summarizer.addVar(vtype = "B", name = symb) for symb in var_names],
        index = var_names)
    # one binary variable per entity (column of the connectivity matrix)
    entities = connectivity_matrix.columns
    portvars_y = pd.Series(
        [summarizer.addVar(vtype = "B", name = symb) for symb in entities],
        index = entities)

    summarizer.update()

    # total word count of the selected sentences
    length = portvars_x.dot(ts_matrix.word_count.values)
    # we create a similar connectivity_matrix and rename its axis:
    new_conn = connectivity_matrix.copy()
    new_conn.index = portvars_x.index

    # consistency constraints on y: selecting a sentence forces the
    # variables of all the entities it contains to 1
    for i in new_conn.index:
        this_line = new_conn.loc[i, :]
        local_entities = this_line[this_line > 0].index
        if local_entities.size != 0:
            temp_sum = portvars_y[local_entities].sum()
            summarizer.addConstr(
                temp_sum >= int(local_entities.size)*portvars_x[i])
    # consistency constraints on x: an entity can only be switched on if at
    # least one sentence containing it is selected
    for i in new_conn.columns:
        this_column = new_conn.loc[:, i]
        total_sentences = this_column.index[this_column > 0]
        temp_sum = portvars_x[total_sentences].sum()
        summarizer.addConstr(temp_sum >= portvars_y[i])

    # adding the final constraint on summary size
    summarizer.addConstr(length <= word_limit)

    # defining the objective function:
    coverage = portvars_x.dot(hub_ranks)
    coherence = portvars_x.dot(outdegrees)
    diversity = portvars_y.sum()/portvars_y.shape[0]

    # coverage comes from a column vector, so the sum is a one-element array
    summarizer.setObjective((coverage + coherence + diversity)[0],
                            GRB.MAXIMIZE)

    summarizer.update()

    # execute the optimization
    summarizer.optimize()

    # read back the solution value of every sentence variable
    selected_sentences = pd.Series(
        [var.getAttr('x') for var in portvars_x])

    # we then load the sentence bank and drop the same rows as above so
    # that its positions line up with the variables
    sentence_bank = pd.read_pickle(
        os.path.join(sentence_bank_dir, file_name))
    sentence_bank = sentence_bank.drop(
        index = indices_to_drop).reset_index(drop = True)
    abstract = sentence_bank.loc[selected_sentences > 0, 'sentence']

    final_abstract = ''.join(abstract)
    return (ts_matrix_file, final_abstract)

In [None]:
# Summarise every '*.p' matrix of the folder in parallel; n_jobs = -1 asks
# joblib to use all available cores. Each result is the
# (file path, abstract) tuple returned by summarize_text.
ts_matrices = pd.Series(sorted(glob.glob('/home/joao/Thesis/ts_matrices_original/*.p')))
final_results = Parallel(n_jobs = -1, verbose = 11)(delayed(
    summarize_text)(ts_matrix_file)for ts_matrix_file in ts_matrices)

# Execution time: 6 minutes

In [None]:
# split the (file path, abstract) tuples into two parallel lists
files = [result[0] for result in final_results]
abstracts = [result[1] for result in final_results]
final_df = pd.DataFrame({'filenames':files,'abstracts':abstracts})

In [None]:
# Persist the generated abstracts so that scoring can be rerun without
# recomputing the summaries.
final_df.to_pickle('/home/joao/Thesis/sum_coh/train_results.p')

In [None]:
# let's see how we did!
# paths of every reference summary found on disk
true_summaries = pd.Series(sorted(glob.glob(
    '/home/joao/Thesis/simplified_abstracts/*')))
total_scores = []
# the scorer does not change between documents, so it is built only once
rouge = Rouge()
for i in final_df.index:
    this_file = final_df.loc[i,'filenames'].split('/')[-1]
    # characters 1-3 of the piece of the file name just before the extension
    this_file_num = this_file.split('.')[-2][1:4]
    # references whose path holds those same three characters at [-7:-4]
    ground_truths = true_summaries[true_summaries.str[-7:-4] ==
                                   this_file_num]
    scores = []
    auto_summary = final_df.loc[i,'abstracts']
    for j in ground_truths:
        with open(j,'rb') as f:
            ground_truth = f.read()
        # scoring happens after the file has been closed
        tmp_scores = rouge.get_scores(auto_summary,
                                      ground_truth,avg = True)
        scores.append(tmp_scores['rouge-2']['p'])
    # average ROUGE-2 precision over all the references of this document
    total_scores.append(np.mean(scores))

In [None]:
# One averaged ROUGE-2 precision per document, as a Series for easy
# inspection.
total_series = pd.Series(total_scores)

# Now adapting the function to run for the articles database:

In [None]:
## relevant imports 
import pandas as pd 
import numpy as np
import glob
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('averaged_perceptron_tagger')
from gurobipy import *
from joblib import Parallel,delayed

def summarize_text(ts_matrix_file,
                   titles_dir = '/home/joao/Thesis/test_set/titles/',
                   sentence_bank_dir =
                   '/home/joao/Thesis/test_set/sentence_banks/',
                   word_limit = 250):
    """Build an extractive summary for one document of the test set.

    The function reads three pickles that share one file name: the matrix at
    ``ts_matrix_file``, the title dataframe in ``titles_dir`` and the
    sentence bank in ``sentence_bank_dir``. Only noun columns are kept and
    sentences without any noun are dropped. The remaining sentences get a
    HITS hub score (seeded with their cosine similarity to the title) and a
    position-scaled outdegree. A binary program then picks the sentences
    that maximise coverage + coherence + diversity within the word budget.

    Parameters
    ----------
    ts_matrix_file : str
        Path of the pickled matrix; its last '/'-separated piece is the file
        name looked up in the two other folders.
    titles_dir : str, optional
        Folder holding the pickled title dataframes.
    sentence_bank_dir : str, optional
        Folder holding the pickled sentence banks.
    word_limit : int, optional
        Upper bound on the summed ``word_count`` of the chosen sentences.

    Returns
    -------
    tuple
        ``(ts_matrix_file, final_abstract)`` where ``final_abstract`` is the
        text of the chosen sentences concatenated without a separator.
    """
    # local import so the function does not depend on a file-level import
    import os

    file_name = ts_matrix_file.split('/')[-1]
    ts_matrix = pd.read_pickle(ts_matrix_file)
    # we also read the titles dataframe:
    title = pd.read_pickle(os.path.join(titles_dir, file_name))

    # In order to compare the sentences with the title we must put them in
    # the same vector space, so both dataframes are temporarily extended
    # with the columns they are missing.
    tmp1 = title.drop(columns = ['this_file_name',
                                 'sentence_order']).copy()
    tmp2 = ts_matrix.drop(columns = ['this_file_name',
                                     'sentence_order',
                                     'word_count']).copy()
    # boolean matrix: True wherever the stored value is positive
    connectivity_matrix = tmp2 > 0
    for i in tmp2.columns[~tmp2.columns.isin(tmp1.columns)]:
        tmp1[i] = 0
    for i in tmp1.columns[~tmp1.columns.isin(tmp2.columns)]:
        tmp2[i] = 0

    # we must then tag each of the components and consider only nouns
    result = nltk.pos_tag(tmp1.columns)
    pos_tags = pd.DataFrame({'pos_tag': [pair[1] for pair in result],
                             'word': [pair[0] for pair in result]})
    useful_tags = pos_tags.word[
        pos_tags.pos_tag.str.startswith('N')].values

    # keep the noun columns only, then remove the rows left empty
    noun_columns = connectivity_matrix.columns[
        connectivity_matrix.columns.isin(useful_tags)]
    connectivity_matrix = connectivity_matrix.loc[:, noun_columns]
    indices_to_drop = connectivity_matrix.index[
        connectivity_matrix.sum(axis = 1) == 0]
    connectivity_matrix = connectivity_matrix.drop(index = indices_to_drop)
    # labels of the surviving sentences before renumbering
    original_indices = connectivity_matrix.index
    connectivity_matrix = connectivity_matrix.reset_index(drop = True)

    tmp1 = tmp1.loc[:, useful_tags]
    tmp2 = tmp2.loc[:, useful_tags]
    # non in-place drops: tmp2 is a slice, so in-place edits are avoided
    tmp2 = tmp2.drop(index = indices_to_drop).reset_index(drop = True)
    ts_matrix = ts_matrix.drop(
        index = indices_to_drop).reset_index(drop = True)

    # we then put them both in the same order:
    tmp1 = tmp1[sorted(tmp1.columns)]
    tmp2 = tmp2[sorted(tmp1.columns)]

    # collapse all title rows into a single vector
    full_title = tmp1.fillna(0).sum(axis = 0)
    tmp2 = tmp2.fillna(0)
    # similarity of the hubs (sentences) to the title; the Series is
    # reshaped through .values because Series.reshape is deprecated
    hub_ranks = 1 + cosine_similarity(tmp2,
                                      full_title.values.reshape(1, -1))
    # authorities rank: one, plus the number of sentences containing the
    # term, plus one more if the term is a column of the title
    authority_ranks = (1 + connectivity_matrix.sum(axis = 0) +
                       connectivity_matrix.columns.isin(title.columns)
                       ).values

    # we then apply the HITS algorithm; the operands are boolean arrays, so
    # the products are boolean too (True where rows share a True column)
    conn = connectivity_matrix.values
    hubs_matrix = np.matmul(conn, conn.T)
    authorities_matrix = np.matmul(conn.T, conn)

    tolerance = 0.000000000001
    max_iterations = 100000
    max_diff = 10000
    counter = 1
    # iterate while the scores still move AND the cap is not reached (the
    # former "or counter > 100000" could never stop the loop)
    while max_diff > tolerance and counter <= max_iterations:
        old_hub_ranks = hub_ranks.copy()
        hub_ranks = np.matmul(hubs_matrix, hub_ranks)
        hub_ranks = hub_ranks/np.linalg.norm(hub_ranks)
        old_authority_ranks = authority_ranks.copy()
        authority_ranks = np.matmul(authorities_matrix, authority_ranks)
        authority_ranks = authority_ranks/np.linalg.norm(authority_ranks)
        auth_diff = np.linalg.norm(authority_ranks - old_authority_ranks)
        hub_diff = np.linalg.norm(hub_ranks - old_hub_ranks)
        max_diff = max(hub_diff, auth_diff)
        counter += 1

    # projection[i, j] (i < j) counts the columns shared by rows i and j;
    # one matrix product replaces the former double loop
    conn_counts = conn.astype(int)
    projection = np.triu(conn_counts.dot(conn_counts.T), k = 1)

    outdegrees = (projection > 0).sum(axis = 1)
    # NOTE(review): the original labels are sorted in descending order, so
    # the first remaining sentence is divided by the largest label + 1 -
    # confirm that this weighting is the intended one.
    position = np.array(sorted(original_indices,
                               reverse = True)).astype(float)
    outdegrees = np.divide(outdegrees, position + 1)

    # we then declare the optimization model:
    summarizer = Model('summarizer')

    # one binary variable per sentence
    var_names = 'sentence_' + connectivity_matrix.index.astype(str)
    portvars_x = pd.Series(
        [summarizer.addVar(vtype = "B", name = symb) for symb in var_names],
        index = var_names)
    # one binary variable per entity (column of the connectivity matrix)
    entities = connectivity_matrix.columns
    portvars_y = pd.Series(
        [summarizer.addVar(vtype = "B", name = symb) for symb in entities],
        index = entities)

    summarizer.update()

    # total word count of the selected sentences
    length = portvars_x.dot(ts_matrix.word_count.values)
    # we create a similar connectivity_matrix and rename its axis:
    new_conn = connectivity_matrix.copy()
    new_conn.index = portvars_x.index

    # consistency constraints on y: selecting a sentence forces the
    # variables of all the entities it contains to 1
    for i in new_conn.index:
        this_line = new_conn.loc[i, :]
        local_entities = this_line[this_line > 0].index
        if local_entities.size != 0:
            temp_sum = portvars_y[local_entities].sum()
            summarizer.addConstr(
                temp_sum >= int(local_entities.size)*portvars_x[i])
    # consistency constraints on x: an entity can only be switched on if at
    # least one sentence containing it is selected
    for i in new_conn.columns:
        this_column = new_conn.loc[:, i]
        total_sentences = this_column.index[this_column > 0]
        temp_sum = portvars_x[total_sentences].sum()
        summarizer.addConstr(temp_sum >= portvars_y[i])

    # adding the final constraint on summary size
    summarizer.addConstr(length <= word_limit)

    # defining the objective function:
    coverage = portvars_x.dot(hub_ranks)
    coherence = portvars_x.dot(outdegrees)
    diversity = portvars_y.sum()/portvars_y.shape[0]

    # coverage comes from a column vector, so the sum is a one-element array
    summarizer.setObjective((coverage + coherence + diversity)[0],
                            GRB.MAXIMIZE)

    summarizer.update()

    # execute the optimization
    summarizer.optimize()

    # read back the solution value of every sentence variable
    selected_sentences = pd.Series(
        [var.getAttr('x') for var in portvars_x])

    # we then load the sentence bank and drop the same rows as above so
    # that its positions line up with the variables
    sentence_bank = pd.read_pickle(
        os.path.join(sentence_bank_dir, file_name))
    sentence_bank = sentence_bank.drop(
        index = indices_to_drop).reset_index(drop = True)
    abstract = sentence_bank.loc[selected_sentences > 0, 'sentence']

    final_abstract = ''.join(abstract)
    return (ts_matrix_file, final_abstract)

In [None]:
# every '*.p' matrix of the test set, in sorted order
ts_matrices = pd.Series(sorted(glob.glob(
    '/home/joao/Thesis/test_set/ts_matrices/*.p')))
# one summarize_text call per file, spread over all available cores
final_results = Parallel(n_jobs = -1, verbose = 11)(
    delayed(summarize_text)(ts_matrix_file)
    for ts_matrix_file in ts_matrices)

# split the (file path, abstract) tuples into two parallel lists
files = [result[0] for result in final_results]
abstracts = [result[1] for result in final_results]
final_df = pd.DataFrame({'filenames':files,'abstracts':abstracts})

# Execution time: 1 minute 40 seconds

In [None]:
# Persist the test-set abstracts; the scoring cell below reads this file.
final_df.to_pickle('/home/joao/Thesis/sum_coh/test_set_results.p')

# Scoring

In [1]:
from rouge import Rouge,FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
from unidecode import unidecode

results_df = pd.read_pickle('/home/joao/Thesis/sum_coh/test_set_results.p')
# keep only the file name: take the last '/'-separated piece instead of a
# fixed position, so the result does not depend on how deep the folder is
results_df.filenames = results_df.filenames.str.split('/').str[-1]
def remove_non_ascii(text):
    """Replace every non-ASCII character of ``text`` with its closest
    ASCII representation.

    ``text`` may be raw UTF-8 bytes (what ``open(..., 'rb').read()``
    returns) or text that has already been decoded. Bytes are decoded as
    UTF-8 first; decoded text is passed to ``unidecode`` unchanged.
    """
    # ``bytes`` is ``str`` on Python 2, so this covers the former
    # ``unicode(text, encoding="utf-8")`` call while also working on
    # Python 3, where ``unicode`` does not exist.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return unidecode(text)
# we then load all summaries and candidate summaries:

total_scores = []
scores = []
# F-scores of ROUGE-1, ROUGE-2 and ROUGE-L, one entry per document
r1 = []
r2 = []
rl = []

# the scorer does not change between documents, so it is built only once
rouge = Rouge()
for i in tqdm(results_df.index):
    # the reference shares the result's file name, with its last two
    # characters replaced by '.txt'
    ground_truth_file = (
        '/home/joao/Thesis/test_set/abstracts/ground_truths/'+
        results_df.loc[i,'filenames'][:-2]+'.txt')
    with open(ground_truth_file,'rb') as f:
        ground_truth = f.read()
    ground_truth = remove_non_ascii(ground_truth)
    tmp_scores = rouge.get_scores(results_df.loc[i,'abstracts'],
                                  ground_truth, avg = True)
    r2.append(tmp_scores['rouge-2']['f'])
    r1.append(tmp_scores['rouge-1']['f'])
    rl.append(tmp_scores['rouge-l']['f'])


100%|██████████| 69/69 [00:03<00:00, 21.95it/s]


In [None]:
# Mean and sample standard deviation (ddof = 1) of each ROUGE F-score over
# all the scored documents.
print('r1',np.mean(r1),np.std(r1,ddof = 1))
print('r2',np.mean(r2),np.std(r2, ddof = 1))
print('rl',np.mean(rl),np.std(rl, ddof = 1))

('r1', 0.34164433082277473, 0.06061530353507446)

('r2', 0.11464533396916964, 0.05274832618517848)

('rl', 0.3012795744110309, 0.06235428378544586)