# Summary baseline

## Load the data

In [1]:
# Relative locations used to build paths in later cells: root_dir is joined
# with src_dir (import path) and with data_dir (corpus file).
root_dir = '../../'
src_dir = 'src'
data_dir = 'data/corpus'
# models_dir is not referenced by any cell visible in this notebook section.
models_dir = 'data/models'

In [2]:
import os
import sys

In [3]:
# Put <root_dir>/<src_dir> on the import path; the `training` module imported
# in the next cells is presumably found there.
sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
# File name of the corpus; it is resolved against root_dir/data_dir when loaded.
corpus_filename = 'alaska_corpus.json'

In [5]:
from training import TrainingCorpus

In [6]:
# Create a TrainingCorpus and load it from the corpus file on disk.
corpus = TrainingCorpus()
corpus.load(os.path.join(root_dir, data_dir, corpus_filename))

In [7]:
# Size of the loaded corpus (presumably the number of documents -- confirm
# against TrainingCorpus).
corpus.size

2171

---

## Build pseudo-docs

In [8]:
from collections import defaultdict

In [9]:
# Maps a label to the concatenated text of its documents; an unseen label
# starts from the empty string.
pseudodocs_dict = defaultdict(str)

In [10]:
# Start from an empty mapping so that re-running this cell rebuilds the
# pseudo-docs instead of appending every text a second time.
pseudodocs_dict = defaultdict(str)

for doc_id in corpus.docs:
    # the first entry of the document's target is used as its label
    label = corpus.target[doc_id][0]
    # every text is prefixed with a space, so each pseudo-doc starts with ' '
    pseudodocs_dict[label] += ' ' + corpus.get_text(doc_id)

## Summarize pseudo-docs

In [11]:
# Freeze the label order once, so that position i in `pseudodocs` always
# refers to the entity `entities[i]`.
entities = list(pseudodocs_dict)
pseudodocs = list(pseudodocs_dict.values())
num_entities = len(entities)

In [12]:
# Container for the generated summaries (one per entity).
summary = []

In [13]:
from transformers import pipeline

In [14]:
# Name of the pretrained model handed to the summarization pipeline.
model_name = 'bart-large-cnn'

In [15]:
# Build a transformers summarization pipeline backed by `model_name`.
summarizer = pipeline('summarization', model=model_name)

Couldn't reach server at 'https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/modelcard.json' to download model card file.
Creating an empty model card.


In [16]:
# Generation settings forwarded to the summarizer: bounds on the summary
# length and whether to sample while generating.
min_len = 10
max_len = 50
do_sample = False

In [17]:
# Summarize every pseudo-doc, in entity order. The list is rebuilt from
# scratch here, so re-running the cell does not append duplicate summaries.
summary = [
    # the pipeline returns a list of results; keep the text of the first one
    summarizer(
        doc, min_length=min_len, max_length=max_len, do_sample=do_sample
    )[0]['summary_text']
    for doc in pseudodocs
]

In [18]:
from pprint import pprint

In [19]:
# Pretty-print the generated summaries (one list element per entity).
pprint(summary)

['Nikon d3200 digital slr camera, 24.2 megapixels for less at walmart.com. '
 'Save money. live live better.  nikon  dslr camera with 18 55mm and 55',
 'Canon eos 7d digital slr camera with 18-135mm f/3.5-5.6 is reviews - '
 'buzzillions.com cammarkt.',
 'Camerafarm australia canon eos 60d 18.1x optical zoom - 4272 x 2848 image - '
 '1920 x 1080 video - hdmi - pictbridge - hD movie mode.',
 ' ebay nikon d3100 + 18-55/3.5-5.6 vr + 55-300/4.5 - 5.5 vr  new zealand '
 'prices - priceme.  ',
 'Ebay buy nikon d5200 digital slr camera, black (body only) cameras - digital '
 '- slr 1501 today at pc connection.  nikons.com   \xa0nikon 5200',
 'Nikon d5100 16.2 mp cmos digital slr camera bundle with 18-55mm lens. 2 en '
 'el14 batteries, no charger.',
 ' ebay nikon d7000 (18-105 mm) price in india, bangalore, hyderabad, delhi, '
 'chennai, mumbai, pune, kolkatta nik on d7',
 'Canon eos 70d digital slr 20 2 mp camera with 18 55mm stm f 3 5 5 6 lens on '
 'sale for $1008.81. Canon eos70d + 18-135

---

## Compute tf-idf on each pseudo-document

### Define corpus

In [20]:
# Run each pseudo-doc through the corpus tokenizer and re-join the resulting
# tokens with single spaces, giving one whitespace-separated string per entity.
clean_pseudodocs = [
    ' '.join(TrainingCorpus.tokenize(pseudodocs[idx]))
    for idx in range(num_entities)
]

len(clean_pseudodocs)

20

### Fit tf-idf vectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# The documents were already tokenized and space-joined, so tokens are
# recovered with a plain whitespace split and no extra lowercasing.
# `str.split` replaces the equivalent lambda so that the fitted vectorizer
# can be pickled (lambdas cannot).
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=str.split)
vectorizer.fit(clean_pseudodocs)



TfidfVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x7f991dc4a170>)

### Save tf-idf weights into a dict

In [23]:
# Turn the cleaned pseudo-docs into tf-idf vectors using the fitted vectorizer.
vectorized_docs = vectorizer.transform(clean_pseudodocs)
# (number of pseudo-docs, vocabulary size)
vectorized_docs.shape

(20, 1224)

In [26]:
# Resolve the column -> term mapping once. The original looked it up for every
# column of every row, rebuilding the whole vocabulary list each time.
# `get_feature_names` was removed in scikit-learn 1.2, so prefer its
# replacement when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
else:
    feature_names = vectorizer.get_feature_names()

# One dict per entity, mapping each term with a positive tf-idf weight in the
# entity's pseudo-doc to that weight.
tfidf_weights = []

for i in range(num_entities):
    # dense 1-D view of row i, one entry per vocabulary term
    doc_vector = vectorized_docs[i].toarray().reshape(-1)
    tfidf_weights.append({
        feature_names[j]: w
        for j, w in enumerate(doc_vector)
        if w > 0
    })

## Compute n-grams and score them

In [38]:
# Inclusive bounds on the length (in whitespace-separated words) of the
# n-grams considered as candidate summaries.
ngram_min = 1
ngram_max = 10

In [57]:
def custom_fn(x, alpha):
    """Return the linear penalty ``1 + alpha * x``.

    x: the quantity being penalized (the n-gram length where this is called).
    alpha: slope of the penalty; with alpha == 0 the result is always 1.
    """
    penalty = 1 + alpha*x
    return penalty

In [66]:
# Slope of the length penalty applied to n-gram scores (passed to custom_fn).
alpha = 0.2

In [67]:
import math
from nltk import ngrams

In [68]:
# For every entity, pick the n-gram of its pseudo-doc with the best
# length-penalized tf-idf score and use it as the entity's summary.
final_summaries = []

for i in range(num_entities):
    # candidates are built over the whitespace-separated words of the raw pseudo-doc
    doc = pseudodocs[i].split()
    # never ask for n-grams longer than the document itself
    n_max = min(len(doc), ngram_max)
    entity_weights = tfidf_weights[i]
    # lower-cased n-gram string -> penalized score
    entity_ngrams = {}

    for n in range(ngram_min, n_max + 1):
        # the penalty depends only on the n-gram length: compute it once per n
        length_penalty = custom_fn(n, alpha)

        for ngram in ngrams(doc, n):
            ngram_str = ' '.join(ngram).lower()
            clean_ngram = TrainingCorpus.tokenize(ngram_str)

            # score = sum of the tf-idf weights of the n-gram's tokens that
            # have a weight for this entity; an empty/null token list scores 0
            score = 0
            for token in clean_ngram or ():
                if token in entity_weights:
                    score += entity_weights[token]

            if score:
                # penalize long summaries
                entity_ngrams[ngram_str] = score / length_penalty

    # Highest-scoring n-gram; ties go to the one seen first, as with the
    # previous stable descending sort. When nothing scored above zero (e.g. an
    # empty pseudo-doc) fall back to '' instead of raising IndexError.
    best_ngram = max(entity_ngrams, key=entity_ngrams.get, default='')
    final_summaries.append(best_ngram)

In [69]:
# Display the n-gram selected for each entity.
final_summaries

['nikon d3200 with 18 55mm | ebay nikon d3200',
 'canon eos 7d body only canon eos 7d',
 'canon eos 60d body | ebay canon eos 60d',
 'nikon d3100 | ebay nikon d3100 14',
 'nikon d5200 body nikon d5200',
 'nikon d5100 camera | ebay nikon d5100',
 'nikon d7000 018208254682 | ebay nikon d7000 16',
 'eos 70d digital camera 013803221596 | ebay canon eos 70d',
 'nikon d5300 black digital slr body nikon d5300',
 'eos 5d mark iii digital camera - buy 5d iii,canon,digital',
 '5d mark ii 013803105384 | ebay canon eos 5d mark',
 'nikon d90 12.3',
 '018208254804 | ebay nikon d800 body nikon d800 36',
 'nikon d610 body only digital slr camera nikon d610 24.3mp',
 'nikon d3300 black body digital slr nikon d3300',
 'nikon 1 j1 | ebay nikon 1 j1 10 1',
 '018208254125 | ebay nikon d80 10.2mp digital slr camera',
 'd300 12.3 mp dslr camera body nikkor nikon d300 12',
 'nikon 1 j3 14.2',
 'olympus om-d e-m5 black body olympus om-d e-m5']

---