# **Automatic Summarization**

In [1]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import random
import math

### Salvo i path dei documenti

In [2]:
doc_paths = ['../data/docs/Andy-Warhol.txt',
'../data/docs/Ebola-virus-disease.txt',
'../data/docs/Life-indoors.txt',
'../data/docs/Napoleon-wiki.txt', 
'../data/docs/Trump-wall.txt']

### Preprocessing

Avrei potuto usare le stopwords di *nltk* ed anche il tokenizer *RegexpTokenizer* per rimuovere la punteggiatura durante il processo di tokenizzazione, ma ho scoperto dell'esistenza di questi ultimi solamente dopo aver implementato le seguenti funzioni. Per cui ho lasciato queste funzioni.

In [3]:
# Useful to remove punctuation from first or last char of a token 
# - Esempio: senza questa funzione "It's" diventa "It" e "'s"
# - It viene eliminato perchè è una stopword, mentre 's non viene eliminato perchè non rientra nè tra le stopwords nè tra la punteggiatura
# - Con questa funzione rimuovo ' da 's e poi rimuovo nuovamente eventuali stopwords
def remove_first_last(tokens, punct, stop):
    for i in range(len(tokens)):
        for p in punct:
            if tokens[i].startswith(p):
                tokens[i] = tokens[i][1:]
            if tokens[i].endswith(p):
                tokens[i] = tokens[i][:-1]
    tokens = [t for t in tokens if t not in stop]
    return tokens

# Remove stopwords and punctuation from the text, tokenize it and lemmatize it
def preprocess(text):
    text = text.lower()
    stop = []
    with open('../data/stop_words_FULL.txt', 'r') as f:
        stop = f.read().splitlines()
    stop = set(stop)
    punct = ['.', ',', '!', '?', ':', ';', '(', ')', '[', ']', '{', '}', '"', "'", '``', "''", '...', '’', '“', '”', '‘', '-', '$', '–']
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop and t not in punct]
    lemmatizer = WordNetLemmatizer()
    tokens = list(set([lemmatizer.lemmatize(t) for t in tokens]))
    tokens = remove_first_last(tokens, punct, stop)
    return tokens

### Parsing del file di input Nasari e creazione dizionario di vettori Nasari

(salvo anche lo score anche se in realtà non verrà usato)

In [4]:
# Parsing the Nasari file and creating a dictionary with:
# - key: word
# - value: dictionary with:
#          - key: lemma
#          - value: score
nasari = {}
with open('../data/dd-small-nasari-15.txt', 'r') as f:
    lines = [line.rstrip('\n') for line in f]
    for line in lines:
        line = line.split(';')
        tmp = {}
        for lemma in line[2:]:
            lemma = lemma.split('_')
            if len(lemma) > 1:
                tmp[lemma[0]] = lemma[1]
        nasari[line[1].lower()] = tmp

### Salvo i documenti di input

Rendo ogni documento una lista di paragrafi, ogni paragrafo è una linea del documento.

In [5]:
# Save document
def save_doc(filename):
    doc = []
    with open(filename, 'r') as f:
        lines = [line.rstrip('\n') for line in f]
        for line in lines:
            if '#' not in line and line != '': # remove empty lines and the first line with the link
                doc.append(line)
    return doc

## 1. Individuate the topic

In [6]:
# Get title from document, considering the first line
def get_title(filename):
    doc = save_doc(filename)
    return doc[0]

# Get topic words from the title checking if they are in the Nasari dictionary
def get_topic_words(title):
    tokens = preprocess(title)
    topic_words = [t for t in tokens if t in nasari.keys()]
    return topic_words

# Used only for testing
# Get random paragraph topic words from the document (not the title)
def get_random_paragraph(filename):
    doc = save_doc(filename)
    paragraph = random.choice(doc[1:])
    return paragraph

## 2. Create the context

In [7]:
# Create the context for a document title 
# - Return a list of dictionaries associated to the topic words of the title if they are in the Nasari dictionary
def create_context(title):
    topic_words = get_topic_words(title)
    context_vector = [nasari[word] for word in topic_words]
    return context_vector

# Create the context for a paragraph
# - Return a list of dictionaries associated to the topic words of the paragraph if they are in the Nasari dictionary
def create_paragraph_context(paragraph):
    topic = [w for w in paragraph if w in nasari.keys()]
    context_vector = [nasari[word] for word in topic]
    return context_vector

## 3. Retain paragraphs whose sentences contain the most salient terms, based on the Weighted Overlap

### Implementazione della Weighted Overlap

In [8]:
# Get overlap between a context topic words and a paragraph topic words 
def get_overlap(context, paragraph):
    overlap = set()
    for w in paragraph:
        for dict in context:
            if w in dict.keys():
                overlap.add(w)
    return overlap

# Get rank as the position of a lemma in the vector
def get_rank(lemma, vector):
    min = math.inf
    for dict in vector:
        i = 1
        for key in dict.keys():
            if key == lemma:
                if i < min:
                    min = i
            i += 1
    return min

# Compute weighted overlap between two vectors
def weighted_overlap(context, paragraph, par_context):
    overlap = get_overlap(context, paragraph)
    if overlap:
        i = 1
        num = 0
        den = 0
        for lemma in overlap:
            den += get_rank(lemma, context) + get_rank(lemma, par_context) # This should be the num but since it is to the power of -1 I can put it in the den
            num += 2 * i # This should be the den but since it is to the power of -1 I can put it in the num
            i += 1
        return num / den
    return 0

### Calcolo della Weighted Overlap per ogni paragrafo di un documento

In [9]:
# Compute the weighted overlap between a document title and all the paragraphs
def weight_doc(filename):
    title = get_title(filename)
    context = create_context(title)
    doc = save_doc(filename)
    paragraphs = [preprocess(par) for par in doc[1:]]
    par_context = [create_paragraph_context(par) for par in paragraphs]
    weighted_overlap_list = [weighted_overlap(context, paragraphs[i], par_context[i]) for i in range(len(paragraphs))]
    return weighted_overlap_list

### Selezione dei paragrafi migliori

Verrà selezionato il 70 - 80 - 90% dei paragrafi con peso maggiore, a seconda della percentuale richiesta.  
(Può essere utilizzato select_best_paragraphs per stampare il documento riassunto della percentuale richiesta)

In [10]:
# Calculate how many paragraphs to keep, given a percentage
def get_threshold(doc, percentage):
    total = len(doc[1:])
    threshold = math.ceil(total * percentage)
    return threshold

# Select the best paragraphs given a percentage
def select_best_paragraphs(filename, percentage):
    weighted_overlap_list = weight_doc(filename)
    doc = save_doc(filename)
    paragraphs = doc[1:]
    best_paragraphs = [doc[0]]
    threshold = get_threshold(doc, percentage)
    for i in range(int(threshold)):
        best_paragraphs.append(paragraphs[weighted_overlap_list.index(max(weighted_overlap_list))])
        weighted_overlap_list[weighted_overlap_list.index(max(weighted_overlap_list))] = -1
    return best_paragraphs

# select_best_paragraphs(doc_paths[0], 0.8)

### Costruzione del gold per la valutazione

Sarà un dizionario con chiave il nome del documento e valore una lista di paragrafi, dove ogni paragrafo è una lista parole rilevanti

In [11]:
# gold_docs will be a dictionary with:
# - key: title of the document
# - value: list of paragraphs with each element as a list of preprocessed words - which are considered as salient terms
gold_docs = {}

def doc_preprocess(filename):
    doc = save_doc(filename)
    paragraphs = [preprocess(par) for par in doc[1:]]
    return paragraphs

for path in doc_paths:
    title = get_title(path)
    gold_docs[title] = doc_preprocess(path)

### Funzioni per creare il summary e il gold utili per la valutazione

- Il summary sarà in realtà un set di termini rilevanti per il documento riassunto
- Il gold sarà un set di termini rilevanti per il documento originale

In [12]:
# Create summary as a set of relevant words for a summarized document
def create_summary(filename, percentage):
    best_paragraphs = select_best_paragraphs(filename, percentage)
    summary = [preprocess(par) for par in best_paragraphs[1:]]
    summary = [word for par in summary for word in par] 
    return set(summary)

# Create gold as a set of relevant words for an original document
def create_gold(filename):
    gold = gold_docs[get_title(filename)]
    gold = [word for par in gold for word in par]
    return set(gold)

### Calcolo della BLEU e ROUGE

Saranno date dal confronto tra il riassunto di riferimento (gold) e il riassunto generato (summary).

- **BLEU**: Misura la precision: ovvero quante parole del riassunto generato sono contenute nel riassunto di riferimento.  
BLEU = num(gold & summary) / num(summary)

- **ROUGE**: Misura la recall: ovvero quante parole del riassunto di riferimento sono contenute nel riassunto generato.  
ROUGE = num(gold & summary) / num(gold)

  
*Note*:  
- BLEU sarà sempre 1 perché il riassunto generato è estrattivo, non astrattivo. Pertanto non verrà generata alcuna parola non presente nel riassunto di riferimento.  
- ROUGE invece è più indicativo, dipende da quante parole del riassunto di riferimento sono state estratte.
- Ho inserito la percentuale 100% solamente per vedere se il codice funzionasse correttamente restituendo ROUGE = 1, len(summary) = len(gold) e threshold = max numero di paragrafi.

In [13]:
# Compute BLEU score for a document
def bleu(summary, gold):
    intersection = summary.intersection(gold)
    bleu = len(intersection) / len(summary)
    return bleu

# Compute ROUGE score for a document
def rouge(summary, gold):
    intersection = summary.intersection(gold)
    rouge = len(intersection) / len(gold)
    return rouge

# Run the summarization algorithm for all the documents with different percentages
for path in doc_paths:
    print('-' * 100)
    print(f'DOC: {get_title(path)}')
    n_par = len(save_doc(path)[1:])
    percents = [0.7, 0.8, 0.9, 1]
    for p in percents:
        threshold = get_threshold(save_doc(path), p)
        summary = create_summary(path, p)
        gold = create_gold(path)
        print(f'\tThreshold with percentage {p*100}%: {threshold} out of {n_par} paragraphs')
        print(f'\tSummary length: {len(summary)}')
        print(f'\tGold length: {len(gold)}')
        print(f'\tBLEU score with {p*100}%: {bleu(summary, gold)}')
        print(f'\tROUGE score with {p*100}%: {rouge(summary, gold)}')
        print('-' * 100)

----------------------------------------------------------------------------------------------------
DOC: Andy Warhol: Why the great Pop artist thought ‘Trump is sort of cheap’
	Threshold with percentage 70.0%: 14 out of 19 paragraphs
	Summary length: 303
	Gold length: 409
	BLEU score with 70.0%: 1.0
	ROUGE score with 70.0%: 0.7408312958435208
----------------------------------------------------------------------------------------------------
	Threshold with percentage 80.0%: 16 out of 19 paragraphs
	Summary length: 355
	Gold length: 409
	BLEU score with 80.0%: 1.0
	ROUGE score with 80.0%: 0.8679706601466992
----------------------------------------------------------------------------------------------------
	Threshold with percentage 90.0%: 18 out of 19 paragraphs
	Summary length: 398
	Gold length: 409
	BLEU score with 90.0%: 1.0
	ROUGE score with 90.0%: 0.9731051344743277
----------------------------------------------------------------------------------------------------
	Threshold wi