# Summary baseline

## Load the data

In [1]:
root_dir = '../../'
src_dir = 'src'
data_dir = 'data/corpus'
models_dir = 'data/models'

In [2]:
import os
import sys

In [3]:
sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
corpus_filename = 'alaska_corpus.json'

In [5]:
from training import TrainingCorpus

In [6]:
corpus = TrainingCorpus()
corpus.load(os.path.join(root_dir, data_dir, corpus_filename))

In [7]:
corpus.size

2171

---

## Builds pseudo-docs

In [8]:
from collections import defaultdict

In [9]:
pseudodocs_dict = defaultdict(str)

In [10]:
for doc_id in corpus.docs:
    text = corpus.get_text(doc_id)
    label = corpus.target[doc_id][0]
    pseudodocs_dict[label] += ' ' + text

## Summarize pseudo-docs

In [11]:
entities = list(pseudodocs_dict.keys())
num_entities = len(entities)
pseudodocs = [pseudodocs_dict[e_id] for e_id in entities]

In [12]:
model_name = 'facebook/bart-large-cnn'

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [14]:
min_len = 10
max_len = 100
do_sample = False

In [15]:
summaries = []

for i in range(num_entities):
    doc = pseudodocs[i].lower()
    encoded_doc = tokenizer([doc], padding=True, truncation=True, return_tensors='pt')
    summary_ids = model.generate(encoded_doc['input_ids'], min_length=min_len, max_length=max_len, do_sample=do_sample)
    summary_text = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0]
    summaries.append(summary_text)

## Save summaries to a file

In [16]:
import pandas as pd

In [17]:
raw_summaries_df = pd.DataFrame([(e, s) for e, s in zip(entities, summaries)], columns=['entity', 'summary'])
raw_summaries_df.head()

Unnamed: 0,entity,summary
0,ENTITY#44,"Nikon d3200 digital slr camera, 24.2 megapixel..."
1,ENTITY#23,Ebay canon eos 7d is on sale for $734.39. The ...
2,ENTITY#18,Camerafarm australia canon eos 60d 18.1x optic...
3,ENTITY#36,ebay nikon d3100 + 18-55/3.5-5.6 vr + 55-300/...
4,ENTITY#41,"Ebay buy nikon d5200 digital slr camera, black..."


In [18]:
raw_summaries_filename = 'raw_summary_alaska.xlsx'
raw_summaries_filepath = os.path.join(root_dir, 'data/terms', raw_summaries_filename)
raw_summaries_df.to_excel(raw_summaries_filepath, index=False)

---
## Compute ranking

### Define utility functions

In [30]:
def find_mix(seq, subseq):
    n = len(seq)
    m = len(subseq)
    for i in range(n - m + 1):
        if seq[i] == subseq[0] and seq[i:i + m] == subseq:
            yield range(i, i + m)
            

def get_chunk_document(chunks, text) -> list:
    tokens = TrainingCorpus.tokenize(text.lower())
    
    if len(chunks) > 0:
        for k_chunk in chunks:
            chunk = k_chunk.split('_')
            replacements = [r for r in find_mix(tokens, chunk)]
            l, f = 0, []
            
            while l < len(tokens):
                replaced = False
                
                for r in replacements:
                    if l in r:
                        replaced = True
                        f.append(chunk)
                        l += len(chunk)
                        break
                    else:
                        pass
                if not replaced:
                    f.append(tokens[l])
                    l += 1
            
            new_tokens = []
            
            for x in f:
                if isinstance(x, list):
                    new_tokens.append("_".join(x))
                else:
                    new_tokens.append(x)
            tokens = new_tokens
    
    return ' '.join(tokens)

### Get chunk document

In [31]:
import spacy
from tqdm.notebook import tqdm

In [32]:
summary_chunks = []
spacy_model_name = 'en_core_web_sm'
spacy_model = spacy.load(spacy_model_name)

for s in tqdm(summaries):
    chunks = []
    for chunk in spacy_model(s.lower()).noun_chunks:
        c = '_'.join(TrainingCorpus.tokenize(chunk.text))
        if c:
            chunks.append(c)
    
    chunk_doc = get_chunk_document(chunks, s)
    summary_chunks.append(chunk_doc)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




### Save chunks to a file

In [35]:
chunks_df = pd.DataFrame([(e, s) for e, s in zip(entities, summary_chunks)], columns=['entity', 'summary'])
chunks_df.head()

Unnamed: 0,entity,summary
0,ENTITY#44,nikon_d3200_digital_slr_camera 24_2_megapixels...
1,ENTITY#23,ebay_canon_eos_7d sale 734 39. camera 28_135mm...
2,ENTITY#18,camerafarm_australia canon eos 60d 18 1x optic...
3,ENTITY#36,ebay_nikon_d3100 18-55/3_5-5_6_vr 55-300/4 5 5...
4,ENTITY#41,ebay buy nikon_d5200_digital_slr_camera black ...


In [38]:
chunks_filename = 'chunk_summary_alaska.xlsx'
chunks_filepath = os.path.join(root_dir, 'data/terms', chunks_filename)
chunks_df.to_excel(chunks_filepath, index=False)

---

## Compute tf-idf on each pseudo-document

### Define corpus

In [18]:
clean_pseudodocs = []

for i in range(num_entities):
    doc = pseudodocs[i]
    clean_doc = ' '.join(TrainingCorpus.tokenize(doc))
    clean_pseudodocs.append(clean_doc)

len(clean_pseudodocs)

20

# Fit tf-idf vectorizer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=lambda x: x.split())
vectorizer.fit(clean_pseudodocs)



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=False, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function <lambda> at 0x7fa90c0e2820>, use_idf=True,
                vocabulary=None)

## Save tf-idf weights into a dict

In [21]:
vectorized_docs = vectorizer.transform(clean_pseudodocs)
vectorized_docs.shape

(20, 1224)

In [22]:
tfidf_weights = []

for i in range(num_entities):
    doc_vector = vectorized_docs[i].toarray().reshape(-1)
    weights = {}
    
    for j, w in enumerate(doc_vector):
        feature_name = vectorizer.get_feature_names()[j]
        if w > 0:
            weights[feature_name] = w
    
    tfidf_weights.append(weights)

## Compute baseline summaries

In [23]:
min_len = 1
max_len = 10

In [24]:
tolerance = 5

In [25]:
def compute_score(idx, text):
    weights = tfidf_weights[idx]
    tokenized_text = TrainingCorpus.tokenize(text.lower())
    score = 0
    
    if tokenized_text:
        for token in tokenized_text:
            if token in weights:
                score += weights[token]
    
    return score

In [26]:
import numpy as np

In [27]:
threshold_percentile = 98

In [28]:
final_summaries = []

for i in range(num_entities):
    summary = summaries[i].split()
    weights = list(tfidf_weights[i].values())
    threshold = np.percentile(weights, threshold_percentile)
    below_threshold_count = 0
    prev_score = 0
    
    for j in range(min_len, max_len):
        selected_summary_tokens = summary[:j]
        selected_summary = ' '.join(selected_summary_tokens)
        score = compute_score(i, selected_summary)
        delta_score = score - prev_score
        
        if delta_score >= threshold:
            below_threshold_count = 0
        else:
            below_threshold_count += 1
            
        if below_threshold_count > tolerance:
            final_summary = ' '.join(selected_summary_tokens[:j-tolerance-1])
            break
                
        else:
            final_summary = selected_summary
        
        prev_score = score
    
    final_summaries.append(final_summary)

In [29]:
final_summaries

['Nikon d3200 digital slr camera, 24.2 megapixels for less',
 'Ebay canon eos 7d is on sale for $734.39.',
 'Camerafarm australia canon eos 60d 18.1x optical zoom -',
 'ebay nikon d3100',
 'Ebay buy nikon d5200 digital slr camera, black (body',
 'Nikon d5100 16.2',
 'ebay nikon d7000',
 'Canon eos 70d',
 'Nikon d5300 24.2 mp cmos digital slr camera with',
 'Canon eos 5d mark iii kit with ef 24-105mm',
 'Ebay canon eos 5d mark ii 21 1 mp',
 'Nikon d90',
 'Buy nikon d800 digital slr body, 36.3mp (body only)',
 'Nikon d610',
 'Nikon d3300',
 'Ebay nikon 1 j1 mirrorless 10-30 mm price in',
 'ebay nikon d80 10.2mp digital slr camera kit on',
 'Nikon d300 12',
 'Nikon 1 j3',
 'Olympus om-d e-m5']

## Build a DataFrame out of summaries

In [32]:
df = pd.DataFrame([(e, s) for e, s in zip(entities, final_summaries)], columns=['entity', 'summary'])
df.head()

Unnamed: 0,entity,summary
0,ENTITY#44,"Nikon d3200 digital slr camera, 24.2 megapixel..."
1,ENTITY#23,Ebay canon eos 7d is on sale for $734.39.
2,ENTITY#18,Camerafarm australia canon eos 60d 18.1x optic...
3,ENTITY#36,ebay nikon d3100
4,ENTITY#41,"Ebay buy nikon d5200 digital slr camera, black..."


## Save DataFrame to a file

In [34]:
filename = 'summary_baseline_alaska.xlsx'
filepath = os.path.join(root_dir, 'data/terms', filename)
df.to_excel(filepath, index=False)

---