# Imports

In [None]:
%load_ext line_profiler
import pandas as pd
import numpy as np
import glob
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from scipy.special import expit

# Experimentation

In [None]:
# We start by importing an example of a term_sentence matrix
ts_matrix_files = sorted(glob.glob('/home/joao/Thesis/ts_matrices_original/*.p'))

In [None]:
ts_matrix_file = ts_matrix_files[0]

In [None]:
ts_matrix = pd.read_pickle(ts_matrix_file)
ts_keys = ts_matrix[['this_file_name','sentence_order','word_count']]
ts_matrix = ts_matrix.drop(columns = ['this_file_name','sentence_order','word_count'])

In [None]:
# we must then build the ts-isf matrix:
# we start by building the isf vector
isf = np.log(float(ts_matrix.shape[0])/(ts_matrix >0).sum(axis = 0))

In [None]:
# we then build the ts_isf matrix:
ts_isf = (ts_matrix*isf).fillna(0).as_matrix()

In [None]:
%%time
# we then define the similarity between two vectors as the cosine similarity
sentence_similarities  = cosine_similarity(ts_isf.mean(axis = 0).reshape(1,-1),ts_isf)
full_similarities = cosine_similarity(ts_isf,ts_isf)
mean_overall_vector = ts_isf.mean(axis = 0)


In [None]:
def get_coverage(X):
    """Return the coverage of the candidate summary encoded by ``X``.

    ``X`` is a 1-D vector of 0/1 flags, one per sentence.  The coverage is the
    cosine similarity between the document mean vector and the mean vector of
    the selected sentences, multiplied by the summed similarity of the
    selected sentences to the document mean vector.

    Reads the module-level ``ts_isf``, ``mean_overall_vector`` and
    ``sentence_similarities``.

    Returns 0 when no sentence is selected, otherwise a (1, 1) array.
    """
    if X.sum() == 0:
        return 0
    # X holds integer 0/1 flags.  Indexing with it directly is *integer* fancy
    # indexing (it would pick rows 0 and 1 over and over), so turn it into a
    # boolean mask to really select the flagged sentences.
    selected = np.asarray(X).astype(bool)
    mean_summary_vector = ts_isf[selected, :].mean(axis = 0)
    centroid_similarity = cosine_similarity(mean_overall_vector.reshape(1,-1),
                                            mean_summary_vector.reshape(1,-1))
    coverage = centroid_similarity*np.matmul(sentence_similarities,X)
    return coverage

def get_diversity(X):
    """Return the summed pairwise similarity among the sentences flagged in ``X``.

    The similarities of all selected (i, j) pairs are added up, the number of
    selected sentences is subtracted and the result is halved.  Reads the
    module-level ``full_similarities``.

    Sentinels: 1000000 when nothing is selected, 10000 when the value is
    exactly 0.
    """
    n_selected = X.sum()
    if n_selected == 0:
        return 1000000
    # pair_mask[i, j] is 1 only when sentences i and j are both selected
    pair_mask = np.outer(X, X)
    pairwise_total = np.multiply(full_similarities, pair_mask).sum()
    diversity = float(pairwise_total - n_selected)/2
    return 10000 if diversity == 0 else diversity

def objective_function(X):
    """Score the candidate ``X`` as coverage divided by diversity."""
    coverage = get_coverage(X)
    diversity = get_diversity(X)
    return float(coverage)/diversity
    
def get_FT(iterations,itermax):
    """Return exp(-2 * iterations / itermax): 1 at iteration 0, decaying towards exp(-2)."""
    progress = float(iterations)/itermax
    return np.exp(-2*progress)

def clip_population(population):
    """Reflect out-of-range genes back across the module-level ``umin`` / ``umax``.

    A value below ``umin`` becomes ``2*umin - value`` and a value above
    ``umax`` becomes ``2*umax - value``.  ``population`` is modified in place
    and also returned.
    """
    too_low = population < umin
    population[too_low] = 2*umin - population[too_low]
    too_high = population > umax
    population[too_high] = 2*umax - population[too_high]
    return population

def get_crossover_rate(actual_scores):
    """Return the per-individual crossover rate 2*tanh(2*RD) / (1 + tanh(2*RD)).

    RD is the distance of each score to the module-level ``max_score``,
    relative to the ``max_score - min_score`` range, so the best individual
    gets rate 0.
    """
    score_range = float((max_score - min_score))
    relative_distance = (max_score - actual_scores)/score_range
    squashed = np.tanh(2*relative_distance)
    return 2*squashed/(1+squashed)

def binarize(population):
    """Sample a 0/1 matrix from a real-valued population.

    Each gene g becomes 1 with probability expit(g).  ``population`` must be a
    DataFrame; an integer ndarray of the same shape is returned.
    """
    n_rows, n_cols = population.shape[0], population.shape[1]
    inclusion_probability = expit(population.values)
    uniform_draw = np.random.rand(n_rows, n_cols)
    return (uniform_draw < inclusion_probability).astype(int)

def evaluate_summary_candidates(X):
    """Return a Series with the objective value of every row of the 0/1 matrix ``X``."""
    return pd.Series([objective_function(X[row,:]) for row in range(X.shape[0])])

def perform_crossover(population):
    """Build the next candidate generation from ``population``.

    For every individual a mutant is created as
    ``parent + (1-FT)*(global_best - partner) + FT*(best - partner)``, where
    ``partner`` is a row drawn at random (with replacement) from the
    population.  Each gene of the child is then taken from the mutant with the
    individual's crossover rate, and one randomly chosen gene per child is
    always taken from the mutant.

    Reads the module-level ``iteration``, ``itermax``, ``scores``,
    ``best_individual`` and ``global_best_individual``.  ``scores`` must hold
    one score per row of ``population``.

    Returns a new DataFrame with the index and columns of ``population``;
    ``population`` itself is left untouched.
    """
    # Work on the argument (not on the global initial_population) and take the
    # sizes from it instead of relying on the global P.
    n_individuals, n_genes = population.shape[0], population.shape[1]
    parents = population.values
    partners = population.sample(frac = 1, replace = True).values
    FT = get_FT(iteration,itermax)
    mutants = parents + (1.0-FT)*(global_best_individual - partners) + FT*(best_individual - partners)
    mutants = clip_population(mutants)
    # one crossover rate per individual, broadcast over its genes
    CR = np.asarray(get_crossover_rate(scores)).reshape(-1,1)
    keepers = np.random.rand(n_individuals,n_genes) <= CR
    # Element-wise choice between mutant and parent gene.  Indexing a DataFrame
    # with a 2-D boolean ndarray selects rows rather than elements, so the
    # selection is done on the underlying arrays.
    children = np.where(keepers, mutants, parents)
    # every child inherits at least one gene (column K) from its mutant
    K = np.floor(float(n_genes)*np.random.rand(n_individuals)).astype(int)
    rows = np.arange(n_individuals)
    children[rows,K] = mutants[rows,K]
    future_generation = pd.DataFrame(children, index = population.index, columns = population.columns)
    return clip_population(future_generation)

In [None]:
# we now define the whole genetic algorithm framework:

P = 100                 # population size (number of candidate summaries)
itermax = 1000          # number of generations
sentence_limit = 400.0  # word budget: compared with the summed word_count of the selected sentences
umin = -5               # lower / upper bound of the real-valued genes
umax = 5


iteration = 0
word_counts = ts_keys['word_count'].values
# genes start uniformly in a window of width 10 that is shifted down as the
# document gets longer relative to the word budget
initial_population = pd.DataFrame(10.0*(np.random.rand(P,ts_isf.shape[0]))-(1-sentence_limit/ts_keys.word_count.sum())*10)
initial_population = clip_population(initial_population)
# we then discretize the problem
X = binarize(initial_population)

# candidates that exceed the word budget get a negative score proportional to
# the excess instead of their objective value
invalid_score = (sentence_limit - np.matmul(X,word_counts))/sentence_limit
invalid_population = (invalid_score < 0)

# we then compute the population scores
scores = evaluate_summary_candidates(X)
scores[invalid_population] = invalid_score[invalid_population]
# find the best and worst individuals
max_score = scores.max()
min_score = scores.min()
best_index = scores.idxmax()
best_individual = initial_population.loc[best_index,:].values
worst_individual = initial_population.loc[scores.idxmin(),:].values
# at first, the global best and the best are the same
global_best_individual = best_individual
global_best_score = max_score
# remember the matching 0/1 selection as well, so that global_best_X exists
# even if no later generation improves on this one
global_best_X = X[best_index].copy()

next_generation = perform_crossover(initial_population)
X_mutated = binarize(next_generation)
# we then compute if the mutated scores are valid or not
invalid_score = (sentence_limit - np.matmul(X_mutated,word_counts))/sentence_limit
invalid_population = (invalid_score < 0)
# we then compute the mutated scores
mutated_scores = evaluate_summary_candidates(X_mutated)
# only the penalties of the invalid candidates are assigned (masked on both sides)
mutated_scores[invalid_population] = invalid_score[invalid_population]
if(mutated_scores.max() > max_score):
    max_score = mutated_scores.max()
    best_index = mutated_scores.idxmax()
    best_individual = next_generation.loc[best_index,:].values.copy()
    if(max_score > global_best_score):
        global_best_score = max_score
        global_best_individual = best_individual
        global_best_X = X_mutated[best_index].copy()
# wherever the mutated candidate scored worse, the parent survives
worse = (mutated_scores < scores).values

next_generation.loc[worse,:] = initial_population.loc[worse,:] 

X_mutated[worse,:] = X[worse,:]
X = X_mutated
worse.sum()

In [None]:
# word counts do not change between generations
word_counts = ts_keys['word_count'].values
for iteration in range(1,itermax):
    # the survivors of the previous generation become the new parents
    initial_population = next_generation.copy()
    # the 0/1 selection is re-sampled from the real-valued genes every generation
    X = binarize(initial_population)

    # candidates that exceed the word budget get a negative score proportional
    # to the excess instead of their objective value
    invalid_score = (sentence_limit - np.matmul(X,word_counts))/sentence_limit
    invalid_population = (invalid_score < 0)

    # we then compute the population scores
    scores = evaluate_summary_candidates(X)
    scores[invalid_population] = invalid_score[invalid_population]
    # find the best individual (max_score / min_score also feed the crossover rate)
    max_score = scores.max()
    min_score = scores.min()
    best_index = scores.idxmax()
    best_individual = initial_population.loc[best_index,:].values

    if(max_score > global_best_score):
        global_best_score = max_score
        global_best_individual = best_individual
        global_best_X = X[best_index].copy()
    next_generation = perform_crossover(initial_population)
    X_mutated = binarize(next_generation)
    # we then compute if the mutated scores are valid or not
    invalid_score = (sentence_limit - np.matmul(X_mutated,word_counts))/sentence_limit
    invalid_population = (invalid_score < 0)
    # we then compute the mutated scores
    mutated_scores = evaluate_summary_candidates(X_mutated)
    # only the penalties of the invalid candidates are assigned (masked on both sides)
    mutated_scores[invalid_population] = invalid_score[invalid_population]
    if(mutated_scores.max() > max_score):
        max_score = mutated_scores.max()
        best_index = mutated_scores.idxmax()
        best_individual = next_generation.loc[best_index,:].values.copy()
        if(max_score > global_best_score):
            global_best_score = max_score
            global_best_individual = best_individual
            global_best_X = X_mutated[best_index].copy()
    # wherever the mutated candidate scored worse, the parent survives
    worse = (mutated_scores < scores).values
    next_generation.loc[worse,:] = initial_population.loc[worse,:]
    X_mutated[worse,:] = X[worse,:]
    X = X_mutated
    if(iteration%25 == 0):
        print(iteration,mutated_scores.mean(),scores.mean(),scores.max(),worse.sum(),global_best_score)

In [None]:
a = sorted(glob.glob('/home/joao/Thesis/sentence_bank/*.p'))
sentence_bank = pd.read_pickle(a[0])
sentence_bank

In [None]:
final_sentence = sentence_bank.loc[global_best_X == 1,:]

In [None]:
abstract = ''
for i in final_sentence.sentence:
    abstract += i

In [None]:
from rouge import Rouge,FilesRouge
rouge = Rouge()

In [None]:
true_summaries = pd.Series(sorted(glob.glob('/home/joao/Thesis/simplified_abstracts/*')))
true_summaries[0]

In [None]:
total_scores = []
i = ts_matrix_file
this_file = i.split('/')[-1]
this_file_num = this_file.split('.')[0][1:4]
ground_truths = true_summaries[true_summaries.str[-7:-4] == this_file_num]
scores = []
rouge = Rouge()
abstract
for j in ground_truths:
    with open(j,'rb') as f:
        ground_truth = f.read()
        tmp_scores = rouge.get_scores(abstract,ground_truth,avg = True)
    scores.append(tmp_scores['rouge-2']['p'])
total_scores.append(np.mean(scores))

In [None]:
total_scores

In [None]:
print(abstract)

# Turning it all into a class and running it for the entire base:

In [None]:
import pandas as pd
import numpy as np
import glob
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from scipy.special import expit
from rouge import Rouge,FilesRouge
from joblib import Parallel, delayed

# Collect the paths of all pickled term-sentence matrices (sorted for a stable order)
ts_matrix_files = sorted(glob.glob('/home/joao/Thesis/ts_matrices_original/*.p'))

In [None]:
class summarizer:
    """Extractive summarizer that picks sentences with a binary differential-evolution search.

    Every individual is a real-valued vector with one gene per sentence, kept
    inside [``umin``, ``umax``].  A gene g is turned into a 0/1 "use this
    sentence" flag by sampling with probability ``expit(g)``.  Candidates are
    scored as coverage / diversity; candidates whose selected sentences exceed
    ``sentence_limit`` words get a negative penalty instead.
    """

    def __init__(self, ts_matrix_file,
                 sentence_bank_dir='/home/joao/Thesis/sentence_bank/',
                 sentence_limit=400.0, P=100, itermax=1000):
        """Load the data of one document and pre-compute the similarity tables.

        Parameters
        ----------
        ts_matrix_file : str
            Path of the pickled term-sentence DataFrame.  It must contain the
            columns ``this_file_name``, ``sentence_order`` and ``word_count``;
            every other column is used as a term weight.
        sentence_bank_dir : str
            Directory (with trailing slash) of the pickled sentence bank, which
            must have the same file name as ``ts_matrix_file``.
        sentence_limit : float
            Maximum number of words allowed in a summary.
        P : int
            Population size.
        itermax : int
            Number of generations.
        """
        self.ts_matrix_file = ts_matrix_file
        sentence_bank_file = sentence_bank_dir + ts_matrix_file.split('/')[-1]
        self.sentence_bank = pd.read_pickle(sentence_bank_file)
        self.P = P
        self.itermax = itermax
        self.sentence_limit = sentence_limit
        self.umin = -5
        self.umax = 5
        ts_matrix = pd.read_pickle(ts_matrix_file)
        key_columns = ['this_file_name','sentence_order','word_count']
        self.ts_keys = ts_matrix[key_columns]
        ts_matrix = ts_matrix.drop(columns = key_columns)
        # isf = log(number of sentences / number of sentences containing the term)
        isf = np.log(float(ts_matrix.shape[0])/(ts_matrix > 0).sum(axis = 0))
        # 0 * inf (terms that never occur) gives NaN, which fillna(0) resets to 0.
        # .values replaces DataFrame.as_matrix(), which was removed from pandas.
        self.ts_isf = (ts_matrix*isf).fillna(0).values
        self.mean_overall_vector = self.ts_isf.mean(axis = 0)
        # similarity of every sentence to the document mean vector, shape (1, n_sentences)
        self.sentence_similarities = cosine_similarity(self.mean_overall_vector.reshape(1,-1),self.ts_isf)
        # pairwise sentence similarities, shape (n_sentences, n_sentences)
        self.full_similarities = cosine_similarity(self.ts_isf,self.ts_isf)

    def get_coverage(self,X):
        """Return the coverage (a scalar) of the 0/1 selection vector ``X``.

        Coverage is the cosine similarity between the document mean vector and
        the mean vector of the selected sentences, times the summed similarity
        of the selected sentences to the document mean vector.  Returns 0 when
        nothing is selected.
        """
        if X.sum() == 0:
            return 0
        # X holds integer 0/1 flags.  Indexing with it directly is *integer*
        # fancy indexing (rows 0 and 1 over and over), so build a boolean mask
        # to really select the flagged sentences.
        selected = np.asarray(X).astype(bool)
        mean_summary_vector = self.ts_isf[selected,:].mean(axis = 0)
        centroid_similarity = cosine_similarity(self.mean_overall_vector.reshape(1,-1),
                                                mean_summary_vector.reshape(1,-1))
        coverage = centroid_similarity*np.matmul(self.sentence_similarities,X)
        return coverage[0][0]

    def get_diversity(self,X):
        """Return the summed pairwise similarity among the selected sentences.

        The similarities of all selected (i, j) pairs are added up, the number
        of selected sentences is subtracted and the result is halved.
        Sentinels: 1000000 when nothing is selected, 10000 when the value is
        exactly 0.
        """
        n_selected = X.sum()
        if n_selected == 0:
            return 1000000
        # selection_matrix[i, j] is 1 only when sentences i and j are both selected
        selection_matrix = np.outer(X, X)
        diversity = float(np.multiply(self.full_similarities,selection_matrix).sum() - n_selected)/2
        if diversity == 0:
            return 10000
        return diversity

    def objective_function(self,X):
        """Score the candidate ``X`` as coverage divided by diversity."""
        return float(self.get_coverage(X))/self.get_diversity(X)

    def get_FT(self,iterations,itermax):
        """Return exp(-2 * iterations / itermax).

        The ``itermax`` argument is honoured (it used to be ignored in favour
        of ``self.itermax``); the class itself always passes ``self.itermax``.
        """
        return np.exp(-2*float(iterations)/itermax)

    def clip_population(self,population):
        """Reflect genes outside [umin, umax] back across the violated bound.

        Works on ndarrays and DataFrames; ``population`` is modified in place
        and also returned.
        """
        umin, umax = self.umin, self.umax
        too_low = population < umin
        population[too_low] = 2*umin - population[too_low]
        too_high = population > umax
        population[too_high] = 2*umax - population[too_high]
        return population

    def get_crossover_rate(self,actual_scores):
        """Return the per-individual crossover rate 2*tanh(2*RD) / (1 + tanh(2*RD)).

        RD is the distance of each score to ``self.max_score`` relative to the
        ``max_score - min_score`` range, so the best individual gets rate 0.
        """
        RD = self.max_score - actual_scores
        score_range = float(self.max_score - self.min_score)
        # With a zero range every score equals the maximum, RD is already all
        # zeros and dividing would only turn it into NaN.
        if score_range != 0:
            RD = RD/score_range
        hiptan = np.tanh(2*RD)
        return 2*hiptan/(1+hiptan)

    def binarize(self,population):
        """Sample a 0/1 matrix from a DataFrame of genes: P(flag = 1) = expit(gene)."""
        random_draw = np.random.rand(population.shape[0],population.shape[1])
        return (random_draw < expit(population.values)).astype(int)

    def evaluate_summary_candidates(self,X):
        """Return a Series with the objective value of every row of the 0/1 matrix ``X``."""
        return pd.Series([self.objective_function(X[i,:]) for i in range(X.shape[0])])

    def perform_crossover(self,population):
        """Build the next candidate generation from ``population``.

        For every individual a mutant is created as
        ``parent + (1-FT)*(global_best - partner) + FT*(best - partner)`` with
        a partner row drawn at random (with replacement).  Each gene of the
        child is taken from the mutant with the individual's crossover rate,
        and one random gene per child always comes from the mutant.

        Reads ``self.iteration``, ``self.scores``, ``self.max_score``,
        ``self.min_score``, ``self.best_individual`` and
        ``self.global_best_individual``.  Returns a new DataFrame;
        ``population`` is left untouched.
        """
        n_individuals, n_genes = population.shape[0], population.shape[1]
        parents = population.values
        partners = population.sample(frac = 1, replace = True).values
        FT = self.get_FT(self.iteration,self.itermax)
        mutants = (parents
                   + (1.0-FT)*(self.global_best_individual - partners)
                   + FT*(self.best_individual - partners))
        mutants = self.clip_population(mutants)
        # one crossover rate per individual, broadcast over its genes
        CR = np.asarray(self.get_crossover_rate(self.scores)).reshape(-1,1)
        keepers = np.random.rand(n_individuals,n_genes) <= CR
        # Element-wise choice between mutant and parent gene.  Indexing a
        # DataFrame with a 2-D boolean ndarray selects rows, not elements, so
        # the selection is done on the underlying arrays.
        children = np.where(keepers, mutants, parents)
        # every child inherits at least one gene (column K) from its mutant
        K = np.floor(float(n_genes)*np.random.rand(n_individuals)).astype(int)
        rows = np.arange(n_individuals)
        children[rows,K] = mutants[rows,K]
        children = self.clip_population(children)
        return pd.DataFrame(children, index = population.index, columns = population.columns)

    def _penalized_scores(self,X):
        """Score every row of ``X``; over-length candidates get their negative length penalty."""
        objective = self.evaluate_summary_candidates(X).values
        word_totals = np.matmul(X,self.ts_keys['word_count'].values)
        # relative slack of the word budget: negative when the summary is too long
        slack = (self.sentence_limit - word_totals)/self.sentence_limit
        return pd.Series(np.where(slack < 0, slack, objective))

    def _record_global_best(self,score,individual,X_row):
        """Store a new best-so-far score together with its genes and 0/1 selection."""
        self.global_best_score = score
        self.global_best_individual = individual
        self.global_best_X = X_row.copy()

    def summarize(self):
        """Run the search and return ``(abstract, ts_matrix_file, global_best_X)``.

        ``abstract`` is the concatenation of the sentences flagged in
        ``global_best_X``, the best 0/1 selection seen in any generation.
        Progress is printed every 25 generations.
        """
        n_sentences = self.ts_isf.shape[0]
        # genes start uniformly in a window of width 10 that is shifted down
        # as the document gets longer relative to the word budget
        shift = (1 - self.sentence_limit/self.ts_keys.word_count.sum())*10
        genes = 10.0*np.random.rand(self.P,n_sentences) - shift
        next_generation = pd.DataFrame(self.clip_population(genes))

        self.global_best_score = -np.inf
        self.global_best_individual = None
        self.global_best_X = None
        # Generation 0 runs through the same code as all later generations, so
        # global_best_X is always set, even if generation 0 is never beaten.
        for iteration in range(self.itermax):
            self.iteration = iteration
            # the survivors of the previous generation become the parents
            parents = next_generation.copy()
            # the 0/1 selection is re-sampled from the genes every generation
            X = self.binarize(parents)
            self.scores = self._penalized_scores(X)
            scores = self.scores
            # best and worst individuals of the parent population
            self.max_score = scores.max()
            self.min_score = scores.min()
            best_row = scores.idxmax()
            self.best_individual = parents.loc[best_row,:].values
            self.worst_individual = parents.loc[scores.idxmin(),:].values
            if self.global_best_X is None or self.max_score > self.global_best_score:
                self._record_global_best(self.max_score,self.best_individual,X[best_row])

            next_generation = self.perform_crossover(parents)
            X_mutated = self.binarize(next_generation)
            mutated_scores = self._penalized_scores(X_mutated)
            mutated_max = mutated_scores.max()
            if mutated_max > self.max_score:
                self.max_score = mutated_max
                best_row = mutated_scores.idxmax()
                self.best_individual = next_generation.loc[best_row,:].values.copy()
                if self.max_score > self.global_best_score:
                    self._record_global_best(self.max_score,self.best_individual,X_mutated[best_row])
            # wherever the mutated candidate scored worse, the parent survives
            worse = (mutated_scores < scores).values
            next_generation.loc[worse,:] = parents.loc[worse,:]
            if iteration and iteration%25 == 0:
                print(iteration,mutated_scores.mean(),scores.mean(),scores.max(),worse.sum(),self.global_best_score)

        selected_sentences = self.sentence_bank.loc[self.global_best_X == 1, :].sentence
        abstract = ''.join(selected_sentences)
        return abstract,self.ts_matrix_file,self.global_best_X

In [None]:
def compute_results(ts_matrix_file):
    """Summarize one document and return the tuple produced by ``summarizer.summarize``."""
    return summarizer(ts_matrix_file).summarize()

In [None]:
# summarize every document, spread over 8 worker processes
results = Parallel(n_jobs = 8, verbose = 11)(
    delayed(compute_results)(ts_matrix_file) for ts_matrix_file in ts_matrix_files
)

# Run Time : 218 minutes 

In [None]:
true_summaries = pd.Series(sorted(glob.glob('/home/joao/Thesis/simplified_abstracts/*')))
true_summaries[0]

total_scores = []
i = result[1]
this_file = i.split('/')[-1]
this_file_num = this_file.split('.')[0][1:4]
ground_truths = true_summaries[true_summaries.str[-7:-4] == this_file_num]
scores = []
rouge = Rouge()
abstract = result[0]
for j in ground_truths:
    with open(j,'rb') as f:
        ground_truth = f.read()
        tmp_scores = rouge.get_scores(abstract,ground_truth,avg = True)
    scores.append(tmp_scores['rouge-2']['p'])
total_scores.append(np.mean(scores))

In [None]:
total_scores

In [None]:
my_summarizer.get_coverage(result[2])

In [None]:
abstracts = []
files = []
for i in results:
    abstracts.append(i[0])
    files.append(i[1])

In [None]:
final_results = pd.DataFrame({'abstracts':abstracts,'files':files})

In [None]:
final_results.to_pickle('/home/joao/Thesis/OCDSumSaDE/final_results.p')

In [None]:
final_results

# Now running it for the test_base

In [None]:
%load_ext line_profiler
import pandas as pd
import numpy as np
import glob
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from scipy.special import expit
from rouge import Rouge,FilesRouge
from joblib import Parallel, delayed

# Collect the paths of all pickled term-sentence matrices of the test set (sorted for a stable order)
ts_matrix_files = sorted(glob.glob('/home/joao/Thesis/test_set/ts_matrices/*.p'))

In [None]:
class summarizer:
    """Extractive summarizer that picks sentences with a binary differential-evolution search.

    Every individual is a real-valued vector with one gene per sentence, kept
    inside [``umin``, ``umax``].  A gene g is turned into a 0/1 "use this
    sentence" flag by sampling with probability ``expit(g)``.  Candidates are
    scored as coverage / diversity; candidates whose selected sentences exceed
    ``sentence_limit`` words get a negative penalty instead.
    """

    def __init__(self, ts_matrix_file,
                 sentence_bank_dir='/home/joao/Thesis/test_set/sentence_banks/',
                 sentence_limit=250.0, P=100, itermax=1000):
        """Load the data of one document and pre-compute the similarity tables.

        Parameters
        ----------
        ts_matrix_file : str
            Path of the pickled term-sentence DataFrame.  It must contain the
            columns ``this_file_name``, ``sentence_order`` and ``word_count``;
            every other column is used as a term weight.
        sentence_bank_dir : str
            Directory (with trailing slash) of the pickled sentence bank, which
            must have the same file name as ``ts_matrix_file``.
        sentence_limit : float
            Maximum number of words allowed in a summary.
        P : int
            Population size.
        itermax : int
            Number of generations.
        """
        self.ts_matrix_file = ts_matrix_file
        sentence_bank_file = sentence_bank_dir + ts_matrix_file.split('/')[-1]
        self.sentence_bank = pd.read_pickle(sentence_bank_file)
        self.P = P
        self.itermax = itermax
        self.sentence_limit = sentence_limit
        self.umin = -5
        self.umax = 5
        ts_matrix = pd.read_pickle(ts_matrix_file)
        key_columns = ['this_file_name','sentence_order','word_count']
        self.ts_keys = ts_matrix[key_columns]
        ts_matrix = ts_matrix.drop(columns = key_columns)
        # isf = log(number of sentences / number of sentences containing the term)
        isf = np.log(float(ts_matrix.shape[0])/(ts_matrix > 0).sum(axis = 0))
        # 0 * inf (terms that never occur) gives NaN, which fillna(0) resets to 0.
        # .values replaces DataFrame.as_matrix(), which was removed from pandas.
        self.ts_isf = (ts_matrix*isf).fillna(0).values
        self.mean_overall_vector = self.ts_isf.mean(axis = 0)
        # similarity of every sentence to the document mean vector, shape (1, n_sentences)
        self.sentence_similarities = cosine_similarity(self.mean_overall_vector.reshape(1,-1),self.ts_isf)
        # pairwise sentence similarities, shape (n_sentences, n_sentences)
        self.full_similarities = cosine_similarity(self.ts_isf,self.ts_isf)

    def get_coverage(self,X):
        """Return the coverage (a scalar) of the 0/1 selection vector ``X``.

        Coverage is the cosine similarity between the document mean vector and
        the mean vector of the selected sentences, times the summed similarity
        of the selected sentences to the document mean vector.  Returns 0 when
        nothing is selected.
        """
        if X.sum() == 0:
            return 0
        # X holds integer 0/1 flags.  Indexing with it directly is *integer*
        # fancy indexing (rows 0 and 1 over and over), so build a boolean mask
        # to really select the flagged sentences.
        selected = np.asarray(X).astype(bool)
        mean_summary_vector = self.ts_isf[selected,:].mean(axis = 0)
        centroid_similarity = cosine_similarity(self.mean_overall_vector.reshape(1,-1),
                                                mean_summary_vector.reshape(1,-1))
        coverage = centroid_similarity*np.matmul(self.sentence_similarities,X)
        return coverage[0][0]

    def get_diversity(self,X):
        """Return the summed pairwise similarity among the selected sentences.

        The similarities of all selected (i, j) pairs are added up, the number
        of selected sentences is subtracted and the result is halved.
        Sentinels: 1000000 when nothing is selected, 10000 when the value is
        exactly 0.
        """
        n_selected = X.sum()
        if n_selected == 0:
            return 1000000
        # selection_matrix[i, j] is 1 only when sentences i and j are both selected
        selection_matrix = np.outer(X, X)
        diversity = float(np.multiply(self.full_similarities,selection_matrix).sum() - n_selected)/2
        if diversity == 0:
            return 10000
        return diversity

    def objective_function(self,X):
        """Score the candidate ``X`` as coverage divided by diversity.

        The divisor is floored at 0.1 so that tiny (or negative) diversity
        values cannot blow up or flip the sign of the score.
        """
        return float(self.get_coverage(X))/max(self.get_diversity(X),0.1)

    def get_FT(self,iterations,itermax):
        """Return exp(-2 * iterations / itermax).

        The ``itermax`` argument is honoured (it used to be ignored in favour
        of ``self.itermax``); the class itself always passes ``self.itermax``.
        """
        return np.exp(-2*float(iterations)/itermax)

    def clip_population(self,population):
        """Reflect genes outside [umin, umax] back across the violated bound.

        Works on ndarrays and DataFrames; ``population`` is modified in place
        and also returned.
        """
        umin, umax = self.umin, self.umax
        too_low = population < umin
        population[too_low] = 2*umin - population[too_low]
        too_high = population > umax
        population[too_high] = 2*umax - population[too_high]
        return population

    def get_crossover_rate(self,actual_scores):
        """Return the per-individual crossover rate 2*tanh(2*RD) / (1 + tanh(2*RD)).

        RD is the distance of each score to ``self.max_score`` relative to the
        ``max_score - min_score`` range, so the best individual gets rate 0.
        """
        RD = self.max_score - actual_scores
        score_range = float(self.max_score - self.min_score)
        # With a zero range every score equals the maximum, RD is already all
        # zeros and dividing would only turn it into NaN.
        if score_range != 0:
            RD = RD/score_range
        hiptan = np.tanh(2*RD)
        return 2*hiptan/(1+hiptan)

    def binarize(self,population):
        """Sample a 0/1 matrix from a DataFrame of genes: P(flag = 1) = expit(gene)."""
        random_draw = np.random.rand(population.shape[0],population.shape[1])
        return (random_draw < expit(population.values)).astype(int)

    def evaluate_summary_candidates(self,X):
        """Return a Series with the objective value of every row of the 0/1 matrix ``X``."""
        return pd.Series([self.objective_function(X[i,:]) for i in range(X.shape[0])])

    def perform_crossover(self,population):
        """Build the next candidate generation from ``population``.

        For every individual a mutant is created as
        ``parent + (1-FT)*(global_best - partner) + FT*(best - partner)`` with
        a partner row drawn at random (with replacement).  Each gene of the
        child is taken from the mutant with the individual's crossover rate,
        and one random gene per child always comes from the mutant.

        Reads ``self.iteration``, ``self.scores``, ``self.max_score``,
        ``self.min_score``, ``self.best_individual`` and
        ``self.global_best_individual``.  Returns a new DataFrame;
        ``population`` is left untouched.
        """
        n_individuals, n_genes = population.shape[0], population.shape[1]
        parents = population.values
        partners = population.sample(frac = 1, replace = True).values
        FT = self.get_FT(self.iteration,self.itermax)
        mutants = (parents
                   + (1.0-FT)*(self.global_best_individual - partners)
                   + FT*(self.best_individual - partners))
        mutants = self.clip_population(mutants)
        # one crossover rate per individual, broadcast over its genes
        CR = np.asarray(self.get_crossover_rate(self.scores)).reshape(-1,1)
        keepers = np.random.rand(n_individuals,n_genes) <= CR
        # Element-wise choice between mutant and parent gene.  Indexing a
        # DataFrame with a 2-D boolean ndarray selects rows, not elements, so
        # the selection is done on the underlying arrays.
        children = np.where(keepers, mutants, parents)
        # every child inherits at least one gene (column K) from its mutant
        K = np.floor(float(n_genes)*np.random.rand(n_individuals)).astype(int)
        rows = np.arange(n_individuals)
        children[rows,K] = mutants[rows,K]
        children = self.clip_population(children)
        return pd.DataFrame(children, index = population.index, columns = population.columns)

    def _penalized_scores(self,X):
        """Score every row of ``X``; over-length candidates get their negative length penalty."""
        objective = self.evaluate_summary_candidates(X).values
        word_totals = np.matmul(X,self.ts_keys['word_count'].values)
        # relative slack of the word budget: negative when the summary is too long
        slack = (self.sentence_limit - word_totals)/self.sentence_limit
        return pd.Series(np.where(slack < 0, slack, objective))

    def _record_global_best(self,score,individual,X_row):
        """Store a new best-so-far score together with its genes and 0/1 selection."""
        self.global_best_score = score
        self.global_best_individual = individual
        self.global_best_X = X_row.copy()

    def summarize(self):
        """Run the search and return ``(abstract, ts_matrix_file, global_best_X)``.

        ``abstract`` is the concatenation of the sentences flagged in
        ``global_best_X``, the best 0/1 selection seen in any generation.
        Progress is printed every 25 generations.
        """
        n_sentences = self.ts_isf.shape[0]
        # genes start uniformly in a window of width 10 that is shifted down
        # as the document gets longer relative to the word budget
        shift = (1 - self.sentence_limit/self.ts_keys.word_count.sum())*10
        genes = 10.0*np.random.rand(self.P,n_sentences) - shift
        next_generation = pd.DataFrame(self.clip_population(genes))

        self.global_best_score = -np.inf
        self.global_best_individual = None
        self.global_best_X = None
        # Generation 0 runs through the same code as all later generations, so
        # global_best_X is always set, even if generation 0 is never beaten.
        for iteration in range(self.itermax):
            self.iteration = iteration
            # the survivors of the previous generation become the parents
            parents = next_generation.copy()
            # the 0/1 selection is re-sampled from the genes every generation
            X = self.binarize(parents)
            self.scores = self._penalized_scores(X)
            scores = self.scores
            # best and worst individuals of the parent population
            self.max_score = scores.max()
            self.min_score = scores.min()
            best_row = scores.idxmax()
            self.best_individual = parents.loc[best_row,:].values
            self.worst_individual = parents.loc[scores.idxmin(),:].values
            if self.global_best_X is None or self.max_score > self.global_best_score:
                self._record_global_best(self.max_score,self.best_individual,X[best_row])

            next_generation = self.perform_crossover(parents)
            X_mutated = self.binarize(next_generation)
            mutated_scores = self._penalized_scores(X_mutated)
            mutated_max = mutated_scores.max()
            if mutated_max > self.max_score:
                self.max_score = mutated_max
                best_row = mutated_scores.idxmax()
                self.best_individual = next_generation.loc[best_row,:].values.copy()
                if self.max_score > self.global_best_score:
                    self._record_global_best(self.max_score,self.best_individual,X_mutated[best_row])
            # wherever the mutated candidate scored worse, the parent survives
            worse = (mutated_scores < scores).values
            next_generation.loc[worse,:] = parents.loc[worse,:]
            if iteration and iteration%25 == 0:
                print(iteration,mutated_scores.mean(),scores.mean(),scores.max(),worse.sum(),self.global_best_score)

        selected_sentences = self.sentence_bank.loc[self.global_best_X == 1, :].sentence
        abstract = ''.join(selected_sentences)
        return abstract,self.ts_matrix_file,self.global_best_X

In [None]:
def compute_results(ts_matrix_file):
    """Summarize one test-set document and return the tuple produced by ``summarizer.summarize``."""
    return summarizer(ts_matrix_file).summarize()

In [None]:
results = Parallel(n_jobs = 8, verbose = 11)(delayed(compute_results)(ts_matrix_file)for ts_matrix_file in ts_matrix_files)

In [None]:
abstracts = []
files = []
for i in results:
    abstracts.append(i[0])
    files.append(i[1])

final_results = pd.DataFrame({'abstracts':abstracts,'files':files})

final_results.to_pickle('/home/joao/Thesis/OCDSumSaDE/test_final_results.p')

# Execution time: 51.2min finished

# Scoring 

In [None]:
from rouge import Rouge,FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
from unidecode import unidecode

results_df = pd.read_pickle('/home/joao/Thesis/OCDSumSaDE/test_final_results.p')

# keep only the file name of each path; taking the last path component does not
# depend on how deep the directory is (the fixed index 6 did)
results_df['files'] = results_df['files'].str.split('/').str[-1]
def remove_non_ascii(text):
    """Replace every non-ASCII character of ``text`` by its closest ASCII representation.

    ``text`` may be UTF-8 encoded bytes (what a file opened in ``'rb'`` mode
    returns) or already-decoded text.
    """
    # `unicode` only exists on Python 2 and raises on already-decoded text;
    # decoding bytes explicitly works on Python 2 and 3 alike.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return unidecode(text)
# we then load all summaries and candidate summaries:

total_scores = []
scores = []
# ROUGE-1 / ROUGE-2 / ROUGE-L F-scores, one entry per document
r1 = []
r2 = []
rl = []
# one scorer is enough for all documents
rouge = Rouge()
for i in tqdm(results_df.index):
    # the reference has the name of the result file with its last two characters replaced by '.txt'
    ground_truth_path = '/home/joao/Thesis/test_set/abstracts/ground_truths/'+ results_df.loc[i,'files'][:-2]+'.txt'
    with open(ground_truth_path,'rb') as f:
        ground_truth = f.read()
    ground_truth = remove_non_ascii(ground_truth)
    tmp_scores = rouge.get_scores(results_df.loc[i,'abstracts'],ground_truth, avg = True)
    r2.append(tmp_scores['rouge-2']['f'])
    r1.append(tmp_scores['rouge-1']['f'])
    rl.append(tmp_scores['rouge-l']['f'])

In [None]:
# mean and sample standard deviation (ddof = 1) of every ROUGE variant
for label, values in (('r1', r1), ('r2', r2), ('rl', rl)):
    print(label,np.mean(values),np.std(values,ddof = 1))

('r1', 0.29292168257691364, 0.058581228141602736)

('r2', 0.07615283246687429, 0.04955519211966711)

('rl', 0.24793091807760487, 0.05986021707431101)
