# Summary baseline

## Load the data

In [1]:
# Relative locations used to build every path in this notebook.
root_dir = '../../'  # base directory; joined with the sub-paths below
src_dir = 'src'  # joined with root_dir and appended to sys.path
data_dir = 'data/corpus'  # joined with root_dir to locate the corpus file

In [2]:
import os
import sys

In [3]:
# Add root_dir/src_dir to the module search path ahead of the `training` import below.
sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
# File name of the corpus; joined with root_dir and data_dir when the corpus is loaded.
corpus_filename = 'alaska_corpus_noisy.json'

In [5]:
from training import TrainingCorpus

In [6]:
# Resolve the corpus location first, then load it into a fresh TrainingCorpus.
corpus_path = os.path.join(root_dir, data_dir, corpus_filename)
corpus = TrainingCorpus()
corpus.load(corpus_path)

In [7]:
corpus.size

2171

---

## Build pseudo-docs

In [8]:
from collections import defaultdict

In [9]:
# Maps a label to its accumulated text; the str default lets `+=` work on unseen labels.
pseudodocs_dict = defaultdict(str)

In [10]:
# Start from an empty mapping so that re-running this cell on its own does not
# append every document text a second time.
pseudodocs_dict.clear()

# Concatenate the text of all documents sharing a label into one pseudo-document.
for doc_id in corpus.docs:
    text = corpus.get_text(doc_id)
    # Only the first entry of the document's target is used as its label.
    label = corpus.target[doc_id][0]
    pseudodocs_dict[label] += ' ' + text

## Summarize pseudo-docs

In [11]:
# Keys and values are both read in the dict's insertion order,
# so entities[i] is the label of pseudodocs[i].
entities = list(pseudodocs_dict)
pseudodocs = list(pseudodocs_dict.values())
num_entities = len(entities)

In [12]:
# Model identifier passed to from_pretrained() for both the tokenizer and the model.
model_name = 'facebook/bart-large-cnn'

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq model registered under `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1399.0), HTML(value='')))




In [14]:
# Generation settings forwarded to model.generate() in the summarization loop.
min_len = 10  # passed as min_length
max_len = 100  # passed as max_length
do_sample = False  # passed as do_sample

In [15]:
summaries = []

# Produce one summary per pseudo-document, in the same order as `pseudodocs`.
for pseudodoc in pseudodocs:
    # truncation=True cuts inputs that exceed the tokenizer's maximum length.
    encoded_doc = tokenizer(
        [pseudodoc.lower()],
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    summary_ids = model.generate(
        encoded_doc['input_ids'],
        min_length=min_len,
        max_length=max_len,
        do_sample=do_sample,
    )
    # The batch holds a single document, so only the first generated sequence is decoded.
    summary_text = tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    summaries.append(summary_text)

## Save summaries to a file

In [16]:
import pandas as pd

In [17]:
# One row per entity, pairing it with its generated summary.
raw_summary_rows = list(zip(entities, summaries))
raw_summaries_df = pd.DataFrame(raw_summary_rows, columns=['entity', 'summary'])
raw_summaries_df.head()

Unnamed: 0,entity,summary
0,ENTITY#44,"Nikon d3200 digital slr camera, 24.2 megapixel..."
1,ENTITY#23,Ebay canon eos 7d is on sale for $734.39. The ...
2,ENTITY#18,Camerafarm australia canon eos 60d 18.1x optic...
3,ENTITY#36,ebay nikon d3100 + 18-55/3.5-5.6 vr + 55-300/...
4,ENTITY#41,"Ebay buy nikon d5200 digital slr camera, black..."


In [18]:
raw_summaries_filename = 'raw_summary_alaska_noisy.xlsx'
raw_summaries_dir = os.path.join(root_dir, 'data/terms')
raw_summaries_filepath = os.path.join(raw_summaries_dir, raw_summaries_filename)
# Create the output folder when it is missing so that to_excel() does not fail
# on a checkout where data/terms has not been created yet.
os.makedirs(raw_summaries_dir, exist_ok=True)
raw_summaries_df.to_excel(raw_summaries_filepath, index=False)

---
## Compute ranking

### Define utility functions

In [19]:
def find_mix(seq, subseq):
    """Yield the index range of every occurrence of ``subseq`` inside ``seq``.

    Each occurrence is reported as ``range(start, start + len(subseq))``.
    Occurrences may overlap, in which case every one of them is yielded.
    Matching uses ``seq[i:i + m] == subseq``, so both arguments should be
    of the same sequence type (for example two lists).

    An empty ``subseq`` yields nothing instead of raising ``IndexError``.
    """
    n = len(seq)
    m = len(subseq)
    if m == 0:
        # Nothing to look for; also avoids indexing subseq[0] below.
        return
    for i in range(n - m + 1):
        # Cheap first-element check before comparing the whole slice.
        if seq[i] == subseq[0] and seq[i:i + m] == subseq:
            yield range(i, i + m)
            

def get_chunk_document(chunks, text, tokenize=None) -> str:
    """Rewrite ``text`` so that every occurrence of a chunk becomes one token.

    The text is lower-cased and tokenized. Then, for each chunk in order,
    every run of consecutive tokens equal to the chunk's parts is replaced
    by those parts joined with ``'_'``. Occurrences are consumed left to
    right, so overlapping matches are merged only once and no token is
    emitted twice.

    Parameters
    ----------
    chunks : iterable of str
        Chunks written as tokens joined by ``'_'`` (e.g. ``'slr_camera'``).
        They are compared against the lower-cased tokens of ``text``.
    text : str
        The document to rewrite.
    tokenize : callable, optional
        Function mapping a string to its tokens. Defaults to
        ``TrainingCorpus.tokenize``.

    Returns
    -------
    str
        The resulting tokens joined by single spaces.
    """
    if tokenize is None:
        tokenize = TrainingCorpus.tokenize
    # Materialize as a list so slices compare equal to the list from split().
    tokens = list(tokenize(text.lower()))

    for k_chunk in chunks:
        chunk = k_chunk.split('_')  # never empty: split() returns at least ['']
        width = len(chunk)
        merged = '_'.join(chunk)

        new_tokens = []
        pos = 0
        while pos < len(tokens):
            # Replace only when a match *starts* here; testing mere membership
            # in a match range duplicated the chunk for overlapping matches.
            if tokens[pos:pos + width] == chunk:
                new_tokens.append(merged)
                pos += width
            else:
                new_tokens.append(tokens[pos])
                pos += 1
        # Later chunks are matched against the already merged token list.
        tokens = new_tokens

    return ' '.join(tokens)

### Get chunk document

In [20]:
import spacy
from tqdm.notebook import tqdm

In [21]:
summary_chunks = []
spacy_model_name = 'en_core_web_sm'
spacy_model = spacy.load(spacy_model_name)

for s in tqdm(summaries):
    parsed_summary = spacy_model(s.lower())
    # Turn each noun chunk into its tokens joined by '_'.
    joined_chunks = (
        '_'.join(TrainingCorpus.tokenize(noun_chunk.text))
        for noun_chunk in parsed_summary.noun_chunks
    )
    # Drop chunks that come out as an empty string.
    chunks = [joined for joined in joined_chunks if joined]

    chunk_doc = get_chunk_document(chunks, s)
    summary_chunks.append(chunk_doc)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12.0), HTML(value='')))




### Save chunks to a file

In [22]:
# One row per entity, pairing it with its chunk-merged summary.
chunk_rows = list(zip(entities, summary_chunks))
chunks_df = pd.DataFrame(chunk_rows, columns=['entity', 'summary'])
chunks_df.head()

Unnamed: 0,entity,summary
0,ENTITY#44,nikon_d3200_digital_slr_camera 24_2_megapixels...
1,ENTITY#23,ebay_canon_eos_7d sale 734 39. camera 28_135mm...
2,ENTITY#18,camerafarm_australia canon eos 60d 18 1x optic...
3,ENTITY#36,ebay_nikon_d3100 18-55/3_5-5_6_vr 55-300/4 5 5...
4,ENTITY#41,ebay buy nikon_d5200_digital_slr_camera black ...


In [23]:
chunks_filename = 'chunk_summary_alaska_noisy.xlsx'
chunks_dir = os.path.join(root_dir, 'data/terms')
chunks_filepath = os.path.join(chunks_dir, chunks_filename)
# Create the output folder when it is missing so that to_excel() does not fail
# on a checkout where data/terms has not been created yet.
os.makedirs(chunks_dir, exist_ok=True)
chunks_df.to_excel(chunks_filepath, index=False)

---