# Summary baseline

## Load the data

In [1]:
# Base directory (relative path) that the sub-paths below are joined onto.
root_dir = '../../'
# Joined with root_dir and appended to sys.path so local modules can be imported.
src_dir = 'src'
# Joined with root_dir to locate the corpus JSON file.
data_dir = 'data/corpus'
# NOTE(review): models_dir is not referenced in the visible part of this notebook.
models_dir = 'data/models'

In [2]:
import os
import sys

In [3]:
# Make the local source directory importable from this notebook. The
# membership check keeps sys.path free of duplicate entries when the cell
# is executed more than once in the same kernel.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
# Corpus version tag; it is interpolated into the corpus file name below.
version = 'v2'
corpus_filename = f'wikidata_corpus_{version}.json'

In [5]:
from training import TrainingCorpus

In [6]:
# Resolve the corpus file location first, then read it into a fresh
# TrainingCorpus instance.
corpus_path = os.path.join(root_dir, data_dir, corpus_filename)
corpus = TrainingCorpus()
corpus.load(corpus_path)

In [7]:
# Show the size reported by the loaded corpus.
corpus.size

3294

---

## Build pseudo-docs

In [8]:
from collections import defaultdict

In [9]:
# Maps a label to the concatenated text of its documents; missing labels
# start out as the empty string.
pseudodocs_dict = defaultdict(str)

In [10]:
# Rebuild the mapping from scratch so that re-running this cell does not
# append every document's text to its pseudo-doc a second time.
pseudodocs_dict = defaultdict(str)

for doc_id in corpus.docs:
    text = corpus.get_text(doc_id)
    # Only the first target label of each document is used as the grouping key.
    label = corpus.target[doc_id][0]
    # Texts are chained with a single space; every pseudo-doc therefore
    # starts with a leading space.
    pseudodocs_dict[label] += ' ' + text

## Summarize pseudo-docs

In [11]:
# Freeze the label order once; `pseudodocs` follows the same order, so
# position i in either list refers to the same entity.
entities = list(pseudodocs_dict)
pseudodocs = list(pseudodocs_dict.values())
num_entities = len(entities)

In [12]:
# Collects one generated summary string per pseudo-document.
summary = []

In [13]:
from transformers import pipeline

In [14]:
# Model identifier handed to the summarization pipeline.
# NOTE(review): the download log resolves this to 'facebook/bart-large-cnn';
# newer transformers releases may require that namespaced id — confirm.
model_name = 'bart-large-cnn'

In [15]:
# Build the summarization pipeline for the selected model.
summarizer = pipeline('summarization', model=model_name)

Couldn't reach server at 'https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/modelcard.json' to download model card file.
Creating an empty model card.


In [16]:
# Generation settings forwarded to every summarizer call:
# min_len / max_len are passed as min_length / max_length,
# do_sample is passed through unchanged.
min_len = 10
max_len = 50
do_sample = False

In [17]:
# Summarize every pseudo-document. The list is rebuilt from scratch here so
# that re-running this cell cannot leave duplicate entries in `summary`.
summary = []

for doc in pseudodocs:
    result = summarizer(doc, min_length=min_len, max_length=max_len, do_sample=do_sample)
    # Keep only the generated text of the first (and only requested) result.
    summary.append(result[0]['summary_text'])

Your max_length is set to 50, but you input_length is only 29. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 50, but you input_length is only 44. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


In [18]:
from pprint import pprint

In [19]:
# Pretty-print every generated summary (long strings are wrapped across lines).
pprint(summary)

['Stockholm is the capital and most populous urban area of Sweden as well as '
 'in Scandinavia. It hosts the annual Nobel Prize ceremonies and banquet at '
 'the Stockholm Concert Hall and Stockholm City Hall.',
 ' Stockholm is a town in Grant County, South Dakota, United States. It was '
 'laid out in 1896, and named after the capital city of Sweden. The population '
 'was 108 at the 2010 census.',
 ' Stockholm is a village in Pepin County, Wisconsin, United States, founded '
 'in 1854 by immigrants from Karlskoga, Sweden, who named it after their '
 "country's capital. The village is located within the Town of Stockholm.",
 ' Stockholm asteroid  10552 Stockholm asteroid is a large asteroid. It is '
 'located in the asteroid belt between Earth and the Earth.',
 ' Stockholm is a town in Aroostook County, Maine, United States. The '
 'population was 253 at the 2010 census.',
 ' Stockholm (known as The Captor in some countries) is a 2018 Canadian crime '
 'film written, produced and dir

---

## Compute tf-idf on each pseudo-document

### Define corpus

In [20]:
# Tokenize each pseudo-doc with the corpus tokenizer and glue the tokens
# back together with single spaces.
clean_pseudodocs = [
    ' '.join(TrainingCorpus.tokenize(pseudodoc))
    for pseudodoc in pseudodocs
]

len(clean_pseudodocs)

195

### Fit tf-idf vectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# The cleaned pseudo-docs are already tokenized and re-joined with single
# spaces, so tokens are recovered with a plain whitespace split and no
# lowercasing is applied by the vectorizer.
# str.split replaces the former lambda: it splits identically but, unlike a
# lambda, can be pickled together with the fitted vectorizer.
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=str.split)
vectorizer.fit(clean_pseudodocs)



TfidfVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x7f3739bc35f0>)

## Save tf-idf weights into a dict

In [23]:
# Project every cleaned pseudo-doc onto the fitted vocabulary:
# one row per pseudo-doc, one column per vocabulary term.
vectorized_docs = vectorizer.transform(clean_pseudodocs)
vectorized_docs.shape

(195, 4783)

In [24]:
# Look the vocabulary up once. The previous version called
# vectorizer.get_feature_names() inside the inner loop, rebuilding the whole
# feature list for every (document, term) pair.
# get_feature_names_out is preferred when available because newer
# scikit-learn releases dropped get_feature_names.
_get_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
feature_names = list(_get_feature_names())

# tfidf_weights[i] maps each term with a non-zero weight in pseudo-doc i
# to that weight.
tfidf_weights = []

for i in range(num_entities):
    doc_vector = vectorized_docs[i].toarray().reshape(-1)
    weights = {
        feature_names[j]: w
        for j, w in enumerate(doc_vector)
        if w > 0
    }
    tfidf_weights.append(weights)

## Compute n-grams and score them

In [25]:
# Smallest and largest n-gram size (in whitespace-separated tokens) that is
# generated as a candidate summary.
ngram_min = 1
ngram_max = 10

In [26]:
def custom_fn(x, alpha):
    """Return the linear factor ``1 + alpha * x``.

    Used as the divisor that damps the score of an n-gram of size ``x``.
    """
    scaled = alpha * x
    return 1 + scaled

In [27]:
# Slope of the length penalty passed to custom_fn when scoring n-grams.
alpha = 0.2

In [28]:
import math
from nltk import ngrams

In [29]:
# For every entity, pick the n-gram of its pseudo-doc with the highest
# length-penalized tf-idf score.
final_summaries = []

for i in range(num_entities):
    doc = pseudodocs[i].split()
    doc_len = len(doc)
    # Never ask for n-grams longer than the pseudo-doc itself.
    n_max = min(doc_len, ngram_max)
    entity_weights = tfidf_weights[i]
    entity_ngrams = {}

    for j in range(ngram_min, n_max + 1):
        # The penalty depends only on the n-gram size, so compute it once
        # per size instead of once per n-gram.
        length_penalty = custom_fn(j, alpha)

        for ngram in ngrams(doc, j):
            ngram_str = ' '.join(ngram).lower()
            # `or []` covers a tokenizer result that is empty or None.
            clean_ngram = TrainingCorpus.tokenize(ngram_str) or []

            # Score = sum of the entity's tf-idf weights over the n-gram's
            # tokens; tokens without a weight contribute nothing.
            score = sum(entity_weights.get(token, 0) for token in clean_ngram)

            if score:
                # penalize long summaries
                entity_ngrams[ngram_str] = score / length_penalty

    if entity_ngrams:
        # max() returns the first best-scoring n-gram in insertion order,
        # which is the same choice the former stable descending sort made.
        selected_ngram = max(entity_ngrams, key=entity_ngrams.get)
    else:
        # No n-gram received a positive score (e.g. empty pseudo-doc): keep
        # the output aligned with `entities` instead of raising IndexError.
        selected_ngram = ''

    final_summaries.append(selected_ngram)

In [30]:
# Display the n-gram selected for each entity, in the same order as `entities`.
final_summaries

['city the city stretches across fourteen islands',
 'town of the united states town',
 'village of stockholm village of wisconsin village',
 'asteroid asteroid 10552 stockholm asteroid 10552 stockholm asteroid asteroid',
 'town of the united states town',
 'film 2018 film stockholm film the film',
 'city/town city in russia čelâbinsk administrative territorial entity',
 'asteroid asteroid 21088 chelyabinsk asteroid asteroid 21088 chelyabinsk asteroid 21088',
 'oblast oblast of russia oblast of russia chelyabinsk oblast oblast',
 'university university university',
 'meteor meteor the chelyabinsk meteor',
 'time zone time zone yekaterinburg time time zone time zone',
 'тракторный завод, romanized: chelyabinskiy traktornyy zavod, abbreviated',
 'city/town city',
 'samara type of fruit type of non-opening dry fruit',
 'asteroid (26922) samara asteroid asteroid 26922 samara asteroid asteroid (26922)',
 '(given name) female given name female given name',
 'human settlement district in nico

---