# Summary baseline

## Load the data

In [1]:
root_dir = '../../'
src_dir = 'src'
data_dir = 'data/corpus'
models_dir = 'data/models'

In [2]:
import os
import sys

In [3]:
sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
corpus_filename = 'alaska_corpus.json'

In [5]:
from training import TrainingCorpus

In [6]:
corpus = TrainingCorpus()
corpus.load(os.path.join(root_dir, data_dir, corpus_filename))

In [7]:
corpus.size

2171

---

## Builds pseudo-docs

In [8]:
from collections import defaultdict

In [9]:
pseudodocs_dict = defaultdict(str)

In [10]:
for doc_id in corpus.docs:
    text = corpus.get_text(doc_id)
    label = corpus.target[doc_id][0]
    pseudodocs_dict[label] += ' ' + text

## Summarize pseudo-docs

In [11]:
entities = list(pseudodocs_dict.keys())
num_entities = len(entities)
pseudodocs = [pseudodocs_dict[e_id] for e_id in entities]

In [12]:
summaries = []

In [13]:
from transformers import pipeline

  return torch._C._cuda_getDeviceCount() > 0


In [14]:
model_name = 'bart-large-cnn'

In [15]:
summarizer = pipeline('summarization', model=model_name)

404 Client Error: Not Found for url: https://huggingface.co/bart-large-cnn/resolve/main/config.json
404 Client Error: Not Found for url: https://huggingface.co/bart-large-cnn/resolve/main/config.json


OSError: Can't load config for 'bart-large-cnn'. Make sure that:

- 'bart-large-cnn' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'bart-large-cnn' is the correct path to a directory containing a config.json file



In [None]:
min_len = 10
max_len = 50
do_sample = False

In [None]:
for i in range(num_entities):
    doc = pseudodocs[i].lower()
    summary_text = summarizer(doc, min_length=min_len, max_length=max_len, do_sample=do_sample)[0]['summary_text']
    summaries.append(summary_text)

In [None]:
from pprint import pprint

In [None]:
pprint(summaries)

---

## Compute tf-idf on each pseudo-document

### Define corpus

In [16]:
clean_pseudodocs = []

for i in range(num_entities):
    doc = pseudodocs[i]
    clean_doc = ' '.join(TrainingCorpus.tokenize(doc))
    clean_pseudodocs.append(clean_doc)

len(clean_pseudodocs)

20

# Fit tf-idf vectorizer

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=lambda x: x.split())
vectorizer.fit(clean_pseudodocs)



TfidfVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x7f9b938923b0>)

## Save tf-idf weights into a dict

In [19]:
vectorized_docs = vectorizer.transform(clean_pseudodocs)
vectorized_docs.shape

(20, 1224)

In [20]:
tfidf_weights = []

for i in range(num_entities):
    doc_vector = vectorized_docs[i].toarray().reshape(-1)
    weights = {}
    
    for j, w in enumerate(doc_vector):
        feature_name = vectorizer.get_feature_names()[j]
        if w > 0:
            weights[feature_name] = w
    
    tfidf_weights.append(weights)

## Compute baseline summaries

In [21]:
min_len = 1
max_len = 10

In [22]:
tolerance = 5

In [23]:
def compute_score(idx, text):
    weights = tfidf_weights[idx]
    tokenized_text = TrainingCorpus.tokenize(text.lower())
    score = 0
    
    if tokenized_text:
        for token in tokenized_text:
            if token in weights:
                score += weights[token]
    
    return score

In [24]:
import numpy as np

In [25]:
threshold_percentile = 98

In [26]:
final_summaries = []

for i in range(num_entities):
    summary = summaries[i].split()
    weights = list(tfidf_weights[i].values())
    threshold = np.percentile(weights, threshold_percentile)
    below_threshold_count = 0
    prev_score = 0
    
    for j in range(min_len, max_len):
        selected_summary_tokens = summary[:j]
        selected_summary = ' '.join(selected_summary_tokens)
        score = compute_score(i, selected_summary)
        delta_score = score - prev_score
        
        if delta_score >= threshold:
            below_threshold_count = 0
        else:
            below_threshold_count += 1
            
        if below_threshold_count > tolerance:
            final_summary = ' '.join(selected_summary_tokens[:j-tolerance-1])
            break
                
        else:
            final_summary = selected_summary
        
        prev_score = score
    
    final_summaries.append(final_summary)

IndexError: list index out of range

In [None]:
final_summaries

---