Este notebook contiene la función principal para resumir un conjunto de documentos usando un rango dado de parámetros.

## Bibliotecas

In [28]:
import numpy as np

import matplotlib.pyplot as plt

# For clustering
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Para iterar sobre archivos
import os

# Spacy
import spacy
nlp = spacy.load('en_core_web_lg') #_sm, _md, _lg

## Calling HDComputing_basics

In [29]:
%run HDComputing_basics.ipynb

# Summarizing texts

### Functions for language processing

In [30]:
def read_file_to_list(filename):
    """Read a UTF-8 text file and return its trimmed, non-empty lines.

    Every line loses its first two and its last two characters (the trailing
    newline, when present, is one of the last two). Lines that are empty
    after that trimming are dropped.
    """
    with open(filename, "r", encoding="utf8") as handle:
        trimmed = [raw[2:-2] for raw in handle.readlines()]
    return [piece for piece in trimmed if piece != '']

def modify_sent(sentence, list_starts_ends):
    """Replace entity spans of ``sentence`` with their labels.

    ``list_starts_ends`` holds ``[label, start_char, end_char]`` entries in
    the order they appear in the sentence, e.g. "I was in Mexico" with
    ``[['GPE0', 9, 15]]`` becomes "I was in GPE0". An empty list returns the
    sentence untouched.
    """
    if len(list_starts_ends) == 0:
        return sentence

    pieces = []
    cursor = 0  # position in `sentence` right after the previous entity
    for span in list_starts_ends:
        pieces.append(sentence[cursor:span[1]])  # text before the entity
        pieces.append(span[0])                   # label instead of the entity
        cursor = span[2]
    # Whatever follows the last entity is copied verbatim.
    pieces.append(sentence[cursor:])
    return ''.join(pieces)

def entity_text_recognizer (Text):
    """Replace the named entities of every sentence with numbered labels.

    Each sentence is parsed with the module-level spaCy pipeline ``nlp``.
    Sentences with 6 tokens or fewer are treated as noise and skipped. Every
    distinct entity text gets a label made of the spaCy entity type plus a
    per-type counter starting at 1 (e.g. ``GPE1``); the same entity text
    always maps to the same label.

    Parameters
    ----------
    Text : list of str
        Sentences to process.

    Returns
    -------
    Text_mod : list of str
        Kept sentences with entities replaced by labels.
    Text_copy : list of str
        The same kept sentences, unmodified (index-aligned with Text_mod).
    Dict_word_2_entlabel : dict
        Entity text (str) -> label.
    Dict_entlabel_2_word : dict
        Label -> spaCy entity span of the first occurrence.
    Entities_array : numpy.ndarray
        One single-element row per label, in insertion order.
    """
    # To return
    Text_mod = []
    Text_copy = []
    Dict_word_2_entlabel = {}  # Mexico (string) -> GPE#
    Dict_entlabel_2_word = {}  # GPE# -> Mexico (span)

    # Number of distinct entities seen so far, per entity type
    Dict_ent_count = {}

    for sentence in Text:
        doc = nlp(sentence)

        # Removing noise sentences
        if len(doc) <= 6:
            continue

        chars_to_change = []
        for ent in doc.ents:
            ent_text = str(ent)
            # Explicit lookups instead of bare `except:` blocks, which would
            # also swallow unrelated errors (and KeyboardInterrupt).
            new_label = Dict_word_2_entlabel.get(ent_text)
            if new_label is None:  # A new entity...
                Dict_ent_count[ent.label_] = Dict_ent_count.get(ent.label_, 0) + 1
                new_label = ent.label_ + str(Dict_ent_count[ent.label_])
                # Two way dictionary
                Dict_entlabel_2_word[new_label] = ent
                Dict_word_2_entlabel[ent_text] = new_label

            chars_to_change.append([new_label, ent.start_char, ent.end_char])

        # Changing text
        Text_mod.append(modify_sent(sentence, chars_to_change))
        Text_copy.append(sentence)

    # Entities array: one row per label
    Entities = list(Dict_entlabel_2_word.keys())
    Entities_array = np.array([np.array([x]) for x in Entities])

    return Text_mod, Text_copy, Dict_word_2_entlabel, Dict_entlabel_2_word, Entities_array

def Text_to_POS_arrays(Text_mod):
    """Collect the distinct words of the text per POS class and return them as arrays.

    Only ADJ, ADV, NOUN and VERB are kept (PROPN is counted as NOUN). Stop
    words, punctuation, symbols, a few stray characters and tokens that are
    entity labels (keys of the global ``Dict_entlabel_2_word``) are ignored.
    No lemmatization is applied. Each returned array has one single-element
    row per word, holding the first token of ``nlp(word)``.
    """
    stray_texts = ('-', '_', '.', "'")
    words_by_pos = {tag: set() for tag in ('ADJ', 'ADV', 'NOUN', 'VERB')}

    for sentence in Text_mod:
        for token in nlp(sentence):
            if token.is_stop or token.pos_ == 'PUNCT' or token.pos_ == 'SYM':
                continue
            if token.text in stray_texts:
                continue
            # Entity labels are handled separately, not as regular words.
            if token.text in Dict_entlabel_2_word.keys():
                continue

            tag = 'NOUN' if token.pos_ == 'PROPN' else token.pos_
            if tag not in words_by_pos:
                continue  # only the four classes above are considered
            words_by_pos[tag].add(token.text)

    def _token_column(words):
        "One row per word, each holding the first spaCy token of that word"
        return np.array([np.array([nlp(word)[0]]) for word in words])

    adjectives = _token_column(words_by_pos['ADJ'])
    adverbs = _token_column(words_by_pos['ADV'])
    nouns = _token_column(words_by_pos['NOUN'])
    verbs = _token_column(words_by_pos['VERB'])
    return adjectives, adverbs, nouns, verbs

## Clustering functions

In [31]:
def dist_spacy_ent(label_ent1, label_ent2):
    """Distance in [0, 1] between two entities given as single-element label rows.

    The labels are resolved through the global ``Dict_entlabel_2_word``. The
    distance is ``1 - similarity`` clipped to [0, 1]; it is 1 when either
    entity has no vector.
    """
    global Dict_entlabel_2_word
    first = Dict_entlabel_2_word[label_ent1[0]]
    second = Dict_entlabel_2_word[label_ent2[0]]

    if not (first.has_vector and second.has_vector):
        return 1

    gap = 1 - first.similarity(second)
    if gap < 0:
        return 0
    if gap > 1:
        return 1
    return gap
    
def dist_spacy(lem1, lem2):
    """Distance in [0, 1] between two words given as single-element rows (not entities).

    The distance is ``1 - similarity`` clipped to [0, 1]; it is 1 when either
    word has no vector.
    """
    first, second = lem1[0], lem2[0]

    if not (first.has_vector and second.has_vector):
        return 1

    gap = 1 - first.similarity(second)
    if gap < 0:
        return 0
    if gap > 1:
        return 1
    return gap

def plot_dendrogram(distance_array, max_distance):
    """Draw a dendrogram of ``distance_array`` with a horizontal line at ``max_distance``.

    ``distance_array`` is passed straight to scipy's ``dendrogram``;
    ``max_distance`` is used both as the colour threshold and as the height
    of the black cut line.
    """
    plt.figure(figsize=(25, 10))
    plt.title("Hierachical Clustering Dendrogram")
    plt.xlabel("Index")
    plt.ylabel("Average distance")

    leaf_style = {'leaf_rotation': 90., 'leaf_font_size': 9.}
    dendrogram(distance_array, color_threshold=max_distance, **leaf_style)

    # Mark the cut height
    plt.axhline(max_distance, c='k')


def set_cluster_list(distance_array, words_array, max_distance):
    """Cut the hierarchy at ``max_distance`` and return the clusters, largest first.

    ``distance_array`` is passed to ``fcluster`` with the 'distance'
    criterion. Each returned cluster is a list with the rows of
    ``words_array`` that share a flat-cluster label, in their original order.
    Clusters of equal size stay in ascending label order.
    """
    labels = fcluster(distance_array, max_distance, criterion='distance')

    # Group the rows by flat-cluster label in a single pass
    members_by_label = {}
    for position, label in enumerate(labels):
        members_by_label.setdefault(label, []).append(words_array[position])

    grouped = [members_by_label[label] for label in sorted(members_by_label)]
    grouped.sort(key=len, reverse=True)
    return grouped

def medoid_token(cluster):  # Receives an array of single-element rows: [[tok_0], [tok_1], ..., [tok_n]]
    """Index of the medoid of a word cluster.

    The medoid is the element with the smallest summed ``dist_spacy``
    distance to all others. Clusters with fewer than 3 elements return 0,
    since the choice makes no difference there.
    """
    if len(cluster) < 3:
        return 0

    # Full (square) pairwise distance matrix
    pairwise = squareform(pdist(cluster, metric=dist_spacy))
    column_totals = pairwise.sum(axis=0)
    return np.argmin(column_totals)

def medoid_entity(cluster):  # Receives an array of entity labels (strings)
    """Index of the medoid of an entity cluster.

    The medoid is the element with the smallest summed ``dist_spacy_ent``
    distance to all others. Clusters with fewer than 3 elements return 0,
    since the choice makes no difference there.
    """
    if len(cluster) < 3:
        return 0

    # Full (square) pairwise distance matrix
    pairwise = squareform(pdist(cluster, metric=dist_spacy_ent))
    column_totals = pairwise.sum(axis=0)
    return np.argmin(column_totals)

def medoid_sentence(cluster):  # Receives a list of sentences
    """Index of the medoid sentence of a cluster.

    Sentences are looked up in the global ``Dict`` and compared with the
    ``dist`` method of the stored objects. The medoid is the sentence with
    the smallest summed distance to all others. Clusters with fewer than 3
    sentences return 0.
    """
    if len(cluster) < 3:
        return 0  # these cases should probably be discarded

    size = len(cluster)
    # Condensed (upper-triangle) distances, in the order squareform expects
    condensed = [
        Dict[cluster[a]].dist(Dict[cluster[b]])
        for a in range(size - 1)
        for b in range(a + 1, size)
    ]
    column_totals = squareform(np.array(condensed)).sum(axis=0)
    return np.argmin(column_totals)
    
def distancias_POS_arrays(Adj_array, Adv_array, Noun_array, Verb_array, Entities_array):
    """Build a complete-linkage hierarchy for each word class and for the entities.

    Word arrays are compared with ``dist_spacy`` and the entity array with
    ``dist_spacy_ent``. Returns the five linkage matrices in the order:
    adjectives, adverbs, nouns, verbs, entities.
    """
    def _complete_linkage(items, metric):
        "Pairwise distances with `metric`, then complete-linkage clustering"
        return linkage(pdist(items, metric=metric), 'complete')

    distance_adj = _complete_linkage(Adj_array, dist_spacy)
    distance_adv = _complete_linkage(Adv_array, dist_spacy)
    distance_nn = _complete_linkage(Noun_array, dist_spacy)
    distance_vrb = _complete_linkage(Verb_array, dist_spacy)
    distance_ent = _complete_linkage(Entities_array, dist_spacy_ent)

    return distance_adj, distance_adv, distance_nn, distance_vrb, distance_ent

## Hyperdimensional space mapping

In [32]:
def Assign_HDvectors(cluster, medoids, POS, diff_bits):
    """Store an HD vector in ``Dict_HD`` for every item of every cluster.

    ``cluster`` is a list of clusters and ``medoids[i]`` is the index of the
    medoid inside ``cluster[i]``. Keys are ``str(item[0]) + '.' + POS``. The
    medoid gets ``HDvector(N, key)``; every other member gets an
    ``HDvector`` built from ``contamina_vec(seed_vec, diff_bits)``, where
    ``seed_vec`` is the medoid's vector.
    NOTE(review): per the original comments the medoid vector is random and
    the members are "contaminated" copies of it -- confirm against
    HDComputing_basics.
    """
    for i, members in enumerate(cluster):
        medoid_ix = medoids[i]

        # The medoid first: it provides the seed for the rest of the cluster
        medoid_key = str(members[medoid_ix][0]) + '.' + POS
        Dict_HD[medoid_key] = HDvector(N, medoid_key)
        seed_vec = Dict_HD[medoid_key].getVec()

        # Remaining members are derived from the seed
        for j, member in enumerate(members):
            if j == medoid_ix:
                continue
            member_key = str(member[0]) + '.' + POS
            Dict_HD[member_key] = HDvector(contamina_vec(seed_vec, diff_bits), member_key)
                
# Función de distancia
def distancia_HD(HDvector_1, HDvector_2):
    """Distance between two HD vectors, each wrapped in a single-element row.

    Delegates to the ``dist`` method of the first object.
    """
    return HDvector_1[0].dist(HDvector_2[0])


def Encode_sentences(Text_mod):
    """Encode every distinct sentence of the modified text as one HD vector.

    The word vectors must already be stored in ``Dict_HD``. For each token
    that is not a stop word, punctuation, a symbol or a stray character, the
    vector stored under ``text + '.ENT'`` is used when it exists; otherwise
    the one under ``text + '.' + POS`` (PROPN counts as NOUN, and classes
    other than ADJ/ADV/NOUN/VERB are skipped). The word vectors of a sentence
    are combined with ``ADD`` and wrapped in an ``HDvector`` labelled with
    the sentence. Returns an array with one single-element row per sentence.
    """
    stray_texts = ('-', '_', '.', "'")
    seen = set()
    sentence_vectors = []

    for s in Text_mod:
        # Skip repeated sentences and sentences that are exactly an entity text
        if s in seen or s in Dict_word_2_entlabel.keys():
            continue
        seen.add(s)

        word_vectors = []  # collected here, added together afterwards
        for token in nlp(s):
            if token.is_stop or token.pos_ == 'PUNCT' or token.pos_ == 'SYM':
                continue
            if token.text in stray_texts:
                continue

            try:  # Entity label case
                hd_vec = Dict_HD[token.text + '.ENT']
            except KeyError:
                # Regular word, no lemmatization
                tag = 'NOUN' if token.pos_ == 'PROPN' else token.pos_
                if tag not in ['ADJ', 'ADV', 'NOUN', 'VERB']:
                    continue
                hd_vec = Dict_HD[token.text + '.' + tag]
            word_vectors.append(hd_vec)

        # Sentences without any usable word produce no vector
        if word_vectors:
            sentence_vectors.append(HDvector(ADD(word_vectors).getVec(), s))

    return np.array([np.array([vec]) for vec in sentence_vectors])

### Other auxiliary functions

In [33]:
def original_sentence (sent_mod, Tex_mod, Text_copy):
    """Return the original sentence that corresponds to a modified sentence.

    Parameters
    ----------
    sent_mod : str
        Sentence with entity labels, as stored in ``Tex_mod``.
    Tex_mod : list of str
        Modified sentences.
    Text_copy : list of str
        Original sentences, index-aligned with ``Tex_mod``.

    Returns
    -------
    str or None
        The entry of ``Text_copy`` at the position of the first match of
        ``sent_mod`` in ``Tex_mod``; ``None`` when there is no match.
    """
    # The body used to read the module-level `Text_mod` instead of the
    # `Tex_mod` argument, silently ignoring what the caller passed.
    for modified, original in zip(Tex_mod, Text_copy):
        if sent_mod == modified:
            return original
    return None

In [34]:
def count_words_text(resumen):
    """Count the words of a summary.

    Newlines become spaces and the characters , . : ; ` are removed before
    the text is tokenized with ``nlp``. Punctuation tokens and the tokens
    " ", "'s", "n't" and "$" are not counted.
    """
    flattened = resumen.replace('\n', ' ')
    for mark in (',', '.', ':', ';', '`'):
        flattened = flattened.replace(mark, '')

    not_words = [" ", "\'s", "n't", "$"]
    return sum(
        1
        for token in nlp(flattened)
        if not token.is_punct and token.text not in not_words
    )

# Main function

In [36]:
def Summarize_texts(max_d_list,
                  diff_bits_list,
                  max_d_sent_list,
                  size_summary = 200, # Summary size (words)
                  texts_list = 57,
                  Directory = 'Texts/',
                  Precalc_Directory = 'Precalc_distances/',
                  output_dir = 'Summaries/',
                  first_text = 20):
    """Write one summary file per document and per parameter combination.

    For every selected file of ``Directory``: the text is read, entities are
    replaced by labels, words are grouped per POS class and clustered, HD
    vectors are assigned to words and entities, sentences are encoded and
    clustered, and the medoid sentence of each sentence cluster (largest
    clusters first) is appended to the summary while the word budget allows.

    Parameters
    ----------
    max_d_list : iterable
        Distance thresholds used to cut the word and entity hierarchies.
    diff_bits_list : iterable
        Values forwarded to ``Assign_HDvectors`` as ``diff_bits``.
    max_d_sent_list : iterable
        Distance thresholds used to cut the sentence hierarchy.
    size_summary : int
        Target number of words; a tolerance of +/- 10 words is applied.
    texts_list : int
        End (exclusive) of the slice taken from ``os.listdir(Directory)``.
    Directory, Precalc_Directory, output_dir : str
        Input, cache and output directories. They are concatenated directly
        with file names, so they must end with a path separator.
    first_text : int
        Start of the slice taken from ``os.listdir(Directory)`` (this value
        used to be hard-coded to 20).

    Side effects
    ------------
    Rebinds the module-level globals listed in the ``global`` statements
    (other functions of this notebook read them), prints each file name,
    creates ``output_dir`` and ``Precalc_Directory`` when missing, caches the
    linkage matrices as ``.npy`` files and writes the summaries.
    """
    # Global variables read by Text_to_POS_arrays, Encode_sentences, dist_spacy_ent...
    global Text_mod, Text_copy, Dict_word_2_entlabel, Dict_entlabel_2_word, Entities_array
    global Adj_array, Adv_array, Noun_array, Verb_array

    # Dimensionality
    # NOTE(review): N is local here while Assign_HDvectors reads a global N;
    # presumably init(N) keeps them in sync -- confirm in HDComputing_basics.
    N = 10000
    # Initialization
    init(N)

    os.makedirs(output_dir, exist_ok=True)
    # To save time precalculated distances are cached in this directory
    os.makedirs(Precalc_Directory, exist_ok=True)

    # Loop over the selected documents.
    # NOTE: os.listdir returns entries in arbitrary order.
    for filename in os.listdir(Directory)[first_text:texts_list]:
        print(filename)
        stem = filename[:-4]  # drops a 4-character extension such as ".txt"

        # Reading file
        Text = read_file_to_list(Directory + filename)

        # Entity recognition
        Text_mod, Text_copy, Dict_word_2_entlabel, Dict_entlabel_2_word, Entities_array = entity_text_recognizer(Text)

        # POS classification
        Adj_array, Adv_array, Noun_array, Verb_array = Text_to_POS_arrays(Text_mod)

        precalc_path = Precalc_Directory + stem + '.npy'
        if os.path.exists(precalc_path):
            # SECURITY: allow_pickle=True unpickles arbitrary objects; only
            # point Precalc_Directory at cache files you created yourself.
            pre_computed_arrays = np.load(precalc_path, allow_pickle = True)
            distance_adj = pre_computed_arrays[0]
            distance_adv = pre_computed_arrays[1]
            distance_nn = pre_computed_arrays[2]
            distance_vrb = pre_computed_arrays[3]
            distance_ent = pre_computed_arrays[4]
        else:
            # Measuring distances
            distance_adj, distance_adv, distance_nn, distance_vrb, distance_ent = distancias_POS_arrays(
                Adj_array, Adv_array, Noun_array, Verb_array, Entities_array)

            # The five linkage matrices have different numbers of rows, so
            # they are stored in an explicit object array: np.array([...]) on
            # ragged input raises ValueError on recent NumPy versions.
            linkages = (distance_adj, distance_adv, distance_nn, distance_vrb, distance_ent)
            master_array = np.empty(len(linkages), dtype = object)
            for idx, linkage_matrix in enumerate(linkages):
                master_array[idx] = linkage_matrix
            np.save(precalc_path, master_array)

        for max_d in max_d_list:

            # Adjectives
            Clusters_adj = set_cluster_list(distance_adj, Adj_array, max_d)
            medoids_adj = [medoid_token(x) for x in Clusters_adj]

            # Adverbs
            Clusters_adv = set_cluster_list(distance_adv, Adv_array, max_d)
            medoids_adv = [medoid_token(x) for x in Clusters_adv]

            # Nouns
            Clusters_nn = set_cluster_list(distance_nn, Noun_array, max_d)
            medoids_nn = [medoid_token(x) for x in Clusters_nn]

            # Verbs
            Clusters_vrb = set_cluster_list(distance_vrb, Verb_array, max_d)
            medoids_vrb = [medoid_token(x) for x in Clusters_vrb]

            # Entities
            Clusters_ent = set_cluster_list(distance_ent, Entities_array, max_d)
            medoids_ent = [medoid_entity(x) for x in Clusters_ent]

            for diff_bits in diff_bits_list:
                # Initializing HD memory
                init(N)
                Assign_HDvectors(Clusters_adj, medoids_adj, 'ADJ', diff_bits)
                Assign_HDvectors(Clusters_adv, medoids_adv, 'ADV', diff_bits)
                Assign_HDvectors(Clusters_nn, medoids_nn, 'NOUN', diff_bits)
                Assign_HDvectors(Clusters_vrb, medoids_vrb, 'VERB', diff_bits)
                Assign_HDvectors(Clusters_ent, medoids_ent, 'ENT', diff_bits)

                # Sentence encoding
                Vectors_sentence_array = Encode_sentences(Text_mod)
                # Clustering sentences
                X_sents = pdist(Vectors_sentence_array, metric = distancia_HD)
                distance_sent = linkage(X_sents, 'complete')

                # Sentence text behind each vector, in vector order
                sentt = [x[0].getLabelID()[0][0] for x in Vectors_sentence_array]

                for max_d_sent in max_d_sent_list:
                    # Name identifying this parameter combination
                    dirName = output_dir + str(int(N/1000)) + 'k_' + str(max_d) + '_' + str(diff_bits) + '_' + str(max_d_sent)

                    clusters = fcluster(distance_sent, max_d_sent, criterion='distance')
                    uniques = np.unique(clusters)

                    # One list per flat cluster (fcluster labels start at 1)
                    Clusters_sentences = [[] for _ in range(len(uniques))]
                    for i, cluster_label in enumerate(clusters):
                        Clusters_sentences[cluster_label - 1].append(sentt[i])

                    Clusters_sentences = sorted(Clusters_sentences, key = len, reverse = True)
                    # Medoid sentence index of each cluster
                    medoids = [medoid_sentence(x) for x in Clusters_sentences]

                    # Creating summary
                    Resumen = ''
                    for sentence_cluster, medoid_ix in zip(Clusters_sentences, medoids):
                        # Candidate sentence, with its entity labels restored
                        candidate = original_sentence(sentence_cluster[medoid_ix], Text_mod, Text_copy) + '\n'
                        if count_words_text(Resumen + candidate) <= size_summary + 10:
                            Resumen += candidate  # Adding sentence to summary

                        # Too long to add and the summary is already long enough
                        elif count_words_text(Resumen) >= size_summary - 10:
                            break

                    # Final summary file.
                    # NOTE(review): dirName[17:] is a hard-coded offset; with
                    # the default output_dir and a 3-character max_d it keeps
                    # only '_<diff_bits>_<max_d_sent>', so max_d is not part
                    # of the file name -- confirm this is intended.
                    summary_path = output_dir + stem + dirName[17:] + '_englishSyssym1.txt'
                    # Same encoding as the input files, independent of the OS locale
                    with open(summary_path, 'w', encoding = "utf8") as f:
                        f.write(Resumen)