# Summary baseline

## Load the data

In [1]:
# Relative paths; root_dir is the base that the other directories are joined onto.
root_dir = '../../'
# Source folder that gets appended to sys.path so project modules can be imported.
src_dir = 'src'
# Folder holding the corpus JSON file that is loaded below.
data_dir = 'data/corpus'
# NOTE(review): not referenced by any cell visible in this notebook.
models_dir = 'data/models'

In [2]:
import os
import sys

In [3]:
# Make the project's source folder importable (presumably where the `training` module lives — confirm).
sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
# Corpus version tag; it is embedded in the input filename and in every output filename written by this notebook.
version = 'v2'
corpus_filename = f'wikidata_corpus_{version}.json'

In [5]:
from training import TrainingCorpus

In [6]:
# Load the training corpus from the versioned JSON file under data_dir.
corpus = TrainingCorpus()
corpus.load(os.path.join(root_dir, data_dir, corpus_filename))

In [7]:
# Quick sanity check on what was loaded (presumably the number of documents — confirm against TrainingCorpus).
corpus.size

3294

---

## Build pseudo-docs

In [8]:
from collections import defaultdict

In [9]:
# Maps a label to the concatenated text of its documents; defaultdict(str) lets `+=` work the first time a label is seen.
pseudodocs_dict = defaultdict(str)

In [10]:
# Append every document's text to the pseudo-document of its first target label.
# Each piece is prefixed with one space, so every pseudo-document starts with a space.
for doc_id in corpus.docs:
    doc_text = corpus.get_text(doc_id)
    first_label = corpus.target[doc_id][0]
    pseudodocs_dict[first_label] = pseudodocs_dict[first_label] + ' ' + doc_text

## Summarize pseudo-docs

In [11]:
# Freeze the entity order once: pseudodocs[i] is the pseudo-document of entities[i].
entities = list(pseudodocs_dict)
pseudodocs = list(pseudodocs_dict.values())
num_entities = len(entities)

In [12]:
# Checkpoint name passed to `from_pretrained` for both the tokenizer and the seq2seq model.
model_name = 'facebook/bart-large-cnn'

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the sequence-to-sequence model for `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [14]:
# Generation settings forwarded to `model.generate` in the summarization loop below.
max_len = 100  # passed as `max_length`
do_sample = False  # passed as `do_sample`

In [15]:
# Summarize each pseudo-document on its own; summaries[i] belongs to entities[i].
summaries = []

for doc in pseudodocs:
    # truncation=True means text beyond the model's maximum input length is dropped.
    batch = tokenizer([doc], padding=True, truncation=True, return_tensors='pt')
    generated = model.generate(batch['input_ids'], max_length=max_len, do_sample=do_sample)
    # A single document was encoded, so only the first generated sequence is decoded.
    summaries.append(
        tokenizer.decode(generated[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    )

## Save summaries to a file

In [16]:
import pandas as pd

In [17]:
# One row per entity, pairing it with its raw generated summary.
raw_summaries_df = pd.DataFrame(
    list(zip(entities, summaries)),
    columns=['entity', 'summary'],
)
raw_summaries_df.head()

Unnamed: 0,entity,summary
0,Q1754,Stockholm is the capital and most populous ur...
1,Q1787199,"Stockholm is a town in Grant County, South Da..."
2,Q976601,"Stockholm is a village in Pepin County, Wisco..."
3,Q1484620,Stockholm asteroid 10552 Stockholm asteroid ...
4,Q3447382,"Stockholm is a town in Aroostook County, Main..."


In [18]:
# Write the raw summaries to a versioned Excel file under data/terms.
raw_summaries_filename = f'raw_summary_wikidata_{version}.xlsx'
raw_summaries_filepath = os.path.join(root_dir, 'data/terms', raw_summaries_filename)
# Create the output folder if needed so the export does not fail on a missing
# directory after the expensive summarization step.
os.makedirs(os.path.dirname(raw_summaries_filepath), exist_ok=True)
raw_summaries_df.to_excel(raw_summaries_filepath, index=False)

---
## Compute ranking

### Define utility functions

In [19]:
def find_mix(seq, subseq):
    """Yield the index range of every occurrence of `subseq` inside `seq`.

    Occurrences may overlap; each one is reported as
    ``range(start, start + len(subseq))``, in increasing order of ``start``.

    Args:
        seq: Sequence to search (must support ``len`` and slicing).
        subseq: Sequence to look for. It is compared against slices of `seq`
            with ``==``, so it should have the same type as `seq`
            (e.g. both lists).

    Yields:
        range: The positions in `seq` covered by one occurrence. Nothing is
        yielded when `subseq` is empty or longer than `seq`.
    """
    m = len(subseq)
    if m == 0:
        # An empty pattern has no first element to compare and no meaningful match.
        return

    first = subseq[0]
    for i in range(len(seq) - m + 1):
        # Cheap first-element test before paying for the slice comparison.
        if seq[i] == first and seq[i:i + m] == subseq:
            yield range(i, i + m)
            

def get_chunk_document(chunks, text) -> str:
    """Rewrite `text` so that each given chunk appears as one underscore-joined token.

    `text` is lower-cased and tokenized with ``TrainingCorpus.tokenize``. The
    chunks are then applied one after another, each on the token list produced
    by the previous one: every run of tokens equal to the chunk's parts is
    replaced by the chunk string itself.

    Args:
        chunks: Iterable of chunk strings whose parts are separated by ``'_'``
            (e.g. ``'united_states'``). May be empty.
        text: The text to rewrite.

    Returns:
        The resulting tokens joined with single spaces.
    """
    tokens = TrainingCorpus.tokenize(text.lower())

    for k_chunk in chunks:
        chunk = k_chunk.split('_')
        width = len(chunk)
        # Only the start of an occurrence may trigger a replacement. Occurrences
        # can overlap, and testing "is this position inside any occurrence" would
        # emit an extra chunk and skip past tokens that were never part of it.
        match_starts = {r.start for r in find_mix(tokens, chunk)}

        merged = []
        pos = 0
        while pos < len(tokens):
            if pos in match_starts:
                # Joining the parts back with '_' gives exactly k_chunk.
                merged.append(k_chunk)
                pos += width
            else:
                merged.append(tokens[pos])
                pos += 1
        tokens = merged

    return ' '.join(tokens)

### Get chunk document

In [20]:
import spacy
from tqdm.notebook import tqdm

In [21]:
# Turn every summary into a "chunk document": noun chunks detected by spaCy
# are collapsed into single underscore-joined tokens.
spacy_model_name = 'en_core_web_sm'
spacy_model = spacy.load(spacy_model_name)
summary_chunks = []

for s in tqdm(summaries):
    parsed = spacy_model(s.lower())
    joined_chunks = ('_'.join(TrainingCorpus.tokenize(nc.text)) for nc in parsed.noun_chunks)
    # Skip noun chunks that tokenize to an empty string.
    chunks = [c for c in joined_chunks if c]
    summary_chunks.append(get_chunk_document(chunks, s))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=195.0), HTML(value='')))




In [22]:
# Spot-check one chunked summary: multi-word noun chunks show up joined with underscores.
summary_chunks[44]

'tangerang city province banten indonesia latest_official_estimate mid 2020 2,273,697 – making eighth populated suburb world latter_date area 164_54_square_kilometres 63_53_square_miles'

### Save chunks to a file

In [23]:
# One row per entity, pairing it with its chunked summary.
chunks_df = pd.DataFrame(
    list(zip(entities, summary_chunks)),
    columns=['entity', 'summary'],
)
chunks_df.head()

Unnamed: 0,entity,summary
0,Q1754,stockholm capital populous_urban_area sweden w...
1,Q1787199,stockholm_town grant_county south_dakota unite...
2,Q976601,stockholm village pepin_county wisconsin unite...
3,Q1484620,stockholm asteroid 10552_stockholm_asteroid as...
4,Q3447382,stockholm town aroostook_county maine united_s...


In [24]:
# Write the chunked summaries to a versioned Excel file under data/terms.
chunks_filename = f'chunk_summary_wikidata_{version}.xlsx'
chunks_filepath = os.path.join(root_dir, 'data/terms', chunks_filename)
# Create the output folder if needed so the export does not fail on a missing directory.
os.makedirs(os.path.dirname(chunks_filepath), exist_ok=True)
chunks_df.to_excel(chunks_filepath, index=False)

---

## Compute tf-idf on each pseudo-document

### Define corpus

In [25]:
# Lower-case and tokenize every pseudo-document, then re-join the tokens with
# single spaces so the vectorizer below can simply split on whitespace.
clean_pseudodocs = [
    ' '.join(TrainingCorpus.tokenize(raw_doc.lower()))
    for raw_doc in pseudodocs
]

len(clean_pseudodocs)

195

### Fit tf-idf vectorizer

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# The documents are already lower-cased and tokenized, so the vectorizer only
# has to split them on whitespace.
vectorizer = TfidfVectorizer(
    tokenizer=str.split,
    lowercase=False,
)
vectorizer.fit(clean_pseudodocs)



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=False, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function <lambda> at 0x7feafcbeb9d0>, use_idf=True,
                vocabulary=None)

## Save tf-idf weights into a dict

In [28]:
# Sparse tf-idf matrix: one row per pseudo-document, one column per vocabulary term.
vectorized_docs = vectorizer.transform(clean_pseudodocs)
vectorized_docs.shape

(195, 4444)

In [None]:
# Column index -> term. Resolved once up front, because the vectorizer rebuilds
# this list on every call. Newer scikit-learn releases replaced
# `get_feature_names` with `get_feature_names_out`, so use whichever exists.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# tfidf_weights[i] maps every term with a positive weight in pseudo-document i
# to that weight.
tfidf_weights = []

for i in range(num_entities):
    doc_vector = vectorized_docs[i].toarray().reshape(-1)
    weights = {feature_names[j]: w for j, w in enumerate(doc_vector) if w > 0}
    tfidf_weights.append(weights)

## Compute baseline summaries

In [None]:
# NOTE(review): `max_len` was already set to 100 above and passed to `model.generate`;
# reassigning it here means re-running the summarization cell afterwards would use 30.
# Neither name is referenced by any later cell visible in this notebook.
min_len = 1
max_len = 30

In [None]:
def compute_score(idx, text):
    """Return the length-normalized tf-idf score of `text` for pseudo-document `idx`.

    The weights of all tokens of `text` found in ``tfidf_weights[idx]`` are
    summed (a repeated token counts every time it occurs) and the sum is
    divided by the number of whitespace-separated words in `text`.

    Args:
        idx: Position of the pseudo-document in the global ``tfidf_weights`` list.
        text: Text to score.

    Returns:
        The normalized score, or 0 when `text` is empty, ``None`` or contains
        only whitespace.
    """
    word_count = len(text.split()) if text else 0
    if word_count == 0:
        # Nothing to score; this also prevents a division by zero for
        # whitespace-only text, which is truthy but has no words.
        return 0

    weights = tfidf_weights[idx]
    tokens = TrainingCorpus.tokenize(text.lower()) or []
    score = sum(weights[token] for token in tokens if token in weights)
    return score / word_count

In [None]:
import nltk

In [None]:
# Build the baseline summary of each entity: keep the leading sentences of the
# generated summary for as long as the sentence score keeps strictly increasing.
final_summaries = []

for i in range(num_entities):
    kept_sentences = []
    last_score = 0

    for sentence in nltk.tokenize.sent_tokenize(summaries[i]):
        score = compute_score(i, sentence)
        # Stop at the first sentence that does not score higher than the one
        # before it (the first sentence has to score above 0 to be kept).
        if not (score - last_score > 0):
            break
        kept_sentences.append(sentence)
        last_score = score

    final_summaries.append(' '.join(kept_sentences))

## Build a DataFrame out of summaries

In [None]:
# One row per entity, pairing it with its baseline summary.
df = pd.DataFrame(
    list(zip(entities, final_summaries)),
    columns=['entity', 'summary'],
)
df.head()

## Save DataFrame to a file

In [None]:
# Write the baseline summaries to a versioned Excel file under data/terms.
filename = f'summary_baseline_wikidata_{version}.xlsx'
filepath = os.path.join(root_dir, 'data/terms', filename)
# Create the output folder if needed so the export does not fail on a missing directory.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
df.to_excel(filepath, index=False)

---