# Summary baseline

## Load the data

In [1]:
# Path configuration for this notebook.
# root_dir is the base that the other directory names are joined onto.
root_dir = '../../'
# Joined with root_dir and appended to sys.path so project modules can be imported.
src_dir = 'src'
# Joined with root_dir and the corpus filename when loading the corpus.
data_dir = 'data/corpus'
# Not referenced by any of the visible cells.
models_dir = 'data/models'

In [2]:
import os
import sys

In [3]:
# Add the project's source directory to the import path so that the
# `training` module imported further down can be found.
sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
corpus_filename = 'alaska_corpus.json'

In [5]:
from training import TrainingCorpus

In [6]:
# Create the corpus object and populate it from the corpus file located
# under root_dir/data_dir.
corpus = TrainingCorpus()
corpus.load(os.path.join(root_dir, data_dir, corpus_filename))

In [7]:
corpus.size

2171

---

## Build pseudo-docs

In [8]:
from collections import defaultdict

In [9]:
pseudodocs_dict = defaultdict(str)

In [10]:
# Concatenate the text of every document that shares the same label into a
# single space-separated "pseudo-document" per label.
# The accumulator is (re)created in this cell so that re-running it starts
# from scratch instead of appending the whole corpus text a second time.
pseudodocs_dict = defaultdict(str)

for doc_id in corpus.docs:
    text = corpus.get_text(doc_id)
    # Only the first entry of the document's target is used as its label.
    label = corpus.target[doc_id][0]
    # Each chunk is prefixed with a space, so every pseudo-doc starts with one.
    pseudodocs_dict[label] += ' ' + text

## Summarize pseudo-docs

In [11]:
# Freeze the label order once, then derive the aligned list of pseudo-documents
# from it so that position k in `pseudodocs` always belongs to `entities[k]`.
entities = [label for label in pseudodocs_dict]
pseudodocs = [pseudodocs_dict[label] for label in entities]
num_entities = len(entities)

In [12]:
summaries = []

In [13]:
from transformers import pipeline

In [14]:
# Hub identifier of the summarization checkpoint. The organisation prefix is
# required: the bare legacy name 'bart-large-cnn' is not resolved by current
# transformers releases.
model_name = 'facebook/bart-large-cnn'

In [15]:
summarizer = pipeline('summarization', model=model_name)

Couldn't reach server at 'https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/modelcard.json' to download model card file.
Creating an empty model card.


In [16]:
# Generation settings forwarded to the summarizer call as
# min_length / max_length / do_sample.
# NOTE: min_len and max_len are rebound to different values in a later cell
# (the baseline-summary section), so cell execution order matters.
min_len = 10
max_len = 50
do_sample = False

In [17]:
# Summarize every pseudo-document, keeping the results aligned with `entities`.
# The list is built in a single assignment (rather than appended to a list
# created in another cell) so that re-running this cell does not duplicate
# the summaries.
summaries = [
    # The summarizer returns a list of results; the text of the first one is kept.
    summarizer(doc, min_length=min_len, max_length=max_len, do_sample=do_sample)[0]['summary_text']
    for doc in pseudodocs
]

In [18]:
from pprint import pprint

In [19]:
pprint(summaries)

['Nikon d3200 digital slr camera, 24.2 megapixels for less at walmart.com. '
 'Save money. live live better.  nikon  dslr camera with 18 55mm and 55',
 'Canon eos 7d digital slr camera with 18-135mm f/3.5-5.6 is reviews - '
 'buzzillions.com cammarkt.',
 'Camerafarm australia canon eos 60d 18.1x optical zoom - 4272 x 2848 image - '
 '1920 x 1080 video - hdmi - pictbridge - hD movie mode.',
 ' ebay nikon d3100 + 18-55/3.5-5.6 vr + 55-300/4.5 - 5.5 vr  new zealand '
 'prices - priceme.  ',
 'Ebay buy nikon d5200 digital slr camera, black (body only) cameras - digital '
 '- slr 1501 today at pc connection.  nikons.com   \xa0nikon 5200',
 'Nikon d5100 16.2 mp cmos digital slr camera bundle with 18-55mm lens. 2 en '
 'el14 batteries, no charger.',
 ' ebay nikon d7000 (18-105 mm) price in india, bangalore, hyderabad, delhi, '
 'chennai, mumbai, pune, kolkatta nik on d7',
 'Canon eos 70d digital slr 20 2 mp camera with 18 55mm stm f 3 5 5 6 lens on '
 'sale for $1008.81. Canon eos70d + 18-135

---

## Compute tf-idf on each pseudo-document

### Define corpus

In [20]:
# Normalise each pseudo-document: run it through the corpus tokenizer and
# glue the resulting tokens back together with single spaces.
clean_pseudodocs = [
    ' '.join(TrainingCorpus.tokenize(raw_doc))
    for raw_doc in pseudodocs
]

# Displayed as the cell output: how many cleaned pseudo-documents were built.
len(clean_pseudodocs)

20

### Fit tf-idf vectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# The pseudo-documents were already tokenized and re-joined with single
# spaces, so the vectorizer only needs to split on whitespace; case is left
# untouched (lowercase=False).
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=lambda x: x.split())
# fit() returns the vectorizer itself, which is what the cell displays.
vectorizer.fit(clean_pseudodocs)



TfidfVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x7f4bce7dcdd0>)

## Save tf-idf weights into a dict

In [23]:
# Project the same pseudo-documents that were used for fitting onto the
# learned vocabulary.
vectorized_docs = vectorizer.transform(clean_pseudodocs)
# Display the matrix dimensions as the cell output.
vectorized_docs.shape

(20, 1224)

In [24]:
# Resolve the vocabulary once, up front, instead of asking the vectorizer for
# the full feature list on every inner-loop iteration.
# get_feature_names_out() is preferred when available because
# get_feature_names() was removed in scikit-learn 1.2; the old accessor is
# kept as a fallback for older installations.
_get_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
feature_names = list(_get_feature_names())

# One dict per pseudo-document, mapping each term with a positive tf-idf
# weight to that weight. Index k matches `entities[k]`.
tfidf_weights = []

for i in range(num_entities):
    # Densify the i-th sparse row into a flat 1-D array of weights.
    doc_vector = vectorized_docs[i].toarray().reshape(-1)
    weights = {
        feature_names[j]: w
        for j, w in enumerate(doc_vector)
        if w > 0
    }
    tfidf_weights.append(weights)

## Compute baseline summaries

In [25]:
# Bounds for the number of leading summary tokens considered when trimming
# the generated summaries below.
# NOTE: these rebind the min_len / max_len names that were previously used as
# generation lengths for the summarizer; re-running the summarization cell
# after this one would pick up these values instead.
min_len = 1
max_len = 10

In [26]:
# Number of consecutive below-threshold steps that are tolerated before a
# summary is cut.
tolerance = 5

In [27]:
def compute_score(idx, text):
    """Sum the tf-idf weights of the tokens of ``text`` for pseudo-document ``idx``.

    The text is lowercased and tokenized with ``TrainingCorpus.tokenize``.
    Every token occurrence that has an entry in ``tfidf_weights[idx]`` adds its
    weight to the total; tokens without an entry add nothing.

    Returns 0 when tokenization yields nothing or no token is known.
    """
    doc_weights = tfidf_weights[idx]
    tokens = TrainingCorpus.tokenize(text.lower())

    # Guard clause: nothing to score.
    if not tokens:
        return 0

    total = 0
    for tok in tokens:
        # Unknown tokens contribute a neutral 0.
        total += doc_weights.get(tok, 0)
    return total

In [28]:
import numpy as np

In [29]:
# Percentile of a pseudo-document's tf-idf weights that is used as the minimum
# score gain per step when trimming its summary.
threshold_percentile = 98

In [30]:
# Trim every generated summary to its informative prefix.
#
# For each entity the summary is grown one whitespace-separated token at a
# time. A step counts as "informative" when the tf-idf score of the prefix
# increases by at least the entity's threshold (a percentile of its tf-idf
# weights). After more than `tolerance` consecutive non-informative steps the
# summary is cut just before that run of steps.
final_summaries = []

for i in range(num_entities):
    summary = summaries[i].split()
    weights = list(tfidf_weights[i].values())
    threshold = np.percentile(weights, threshold_percentile)
    below_threshold_count = 0
    prev_score = 0
    # Initialised per entity so that an empty range(min_len, max_len) yields
    # an empty summary instead of raising NameError or silently reusing the
    # previous entity's result.
    final_summary = ''

    # NOTE(review): range() excludes max_len, so at most max_len - 1 tokens
    # are ever kept -- confirm this is the intended upper bound.
    for j in range(min_len, max_len):
        selected_summary_tokens = summary[:j]
        selected_summary = ' '.join(selected_summary_tokens)
        score = compute_score(i, selected_summary)
        # Score gained by extending the prefix to j tokens.
        delta_score = score - prev_score

        if delta_score >= threshold:
            below_threshold_count = 0
        else:
            below_threshold_count += 1

        if below_threshold_count > tolerance:
            # Drop the whole run of non-informative steps. The index is
            # clamped at 0 so a small min_len can never produce a negative
            # slice bound (which would trim from the wrong end).
            cut = max(j - tolerance - 1, 0)
            final_summary = ' '.join(selected_summary_tokens[:cut])
            break
        else:
            final_summary = selected_summary

        prev_score = score

    final_summaries.append(final_summary)

In [31]:
final_summaries

['Nikon d3200 digital slr camera, 24.2 megapixels for less',
 'Canon eos 7d',
 'Camerafarm australia canon eos 60d 18.1x optical zoom -',
 'ebay nikon d3100',
 'Ebay buy nikon d5200 digital slr camera, black (body',
 'Nikon d5100 16.2',
 'ebay nikon d7000',
 'Canon eos 70d digital slr 20 2 mp camera',
 'Nikon d5300 24.2 mp cmos digital slr camera with',
 'Canon eos 5d mark iii digital slr camera with',
 'Ebay canon eos 5d mark ii 21 1 mp',
 'Nikon d90',
 'Buy nikon d800 digital slr body, 36.3mp (body only)',
 'Nikon d610 24.3mp',
 'Nikon d3300',
 'Ebay nikon 1 j1 mirrorless 10-30 mm price in',
 'Ebay nikon d80',
 'Nikon d300 12',
 'Nikon 1 j3',
 'Olympus om-d e-m5']

---