# Summary generator

## Load the data

In [1]:
# Relative locations used by the rest of the notebook.
root_dir = '../../'  # base path that the directories below are joined onto
src_dir = 'src'  # joined with root_dir and added to sys.path for project imports
data_dir = 'data/corpus'  # joined with root_dir to locate the corpus JSON file
models_dir = 'data/models'  # not referenced by any cell visible in this notebook section

In [2]:
import os
import sys

In [3]:
# Make the project's source directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
# Corpus version tag; interpolated into the corpus filename.
version = 'v2'

In [5]:
# File name of the corpus JSON for the selected version.
corpus_filename = f'wikidata_corpus_{version}.json'

In [6]:
from training import TrainingCorpus

In [7]:
# Resolve the corpus location first, then create the corpus object and
# populate it from that file.
corpus_path = os.path.join(root_dir, data_dir, corpus_filename)
corpus = TrainingCorpus()
corpus.load(corpus_path)

In [8]:
# Display the size of the loaded corpus (presumably the number of documents — confirm in TrainingCorpus).
corpus.size

3294

---

## Build pseudo-docs

In [9]:
from collections import defaultdict

In [10]:
# Maps a label to the combined text of its documents; unseen labels default to ''.
pseudo_docs = defaultdict(str)

In [11]:
# Build one pseudo-document per label by concatenating the text of every
# corpus document whose first target is that label.
#
# The texts are first collected per label and joined once, and the mapping is
# rebuilt from scratch, so re-running this cell on its own yields the same
# result instead of appending every text a second time.
texts_by_label = defaultdict(list)
for doc_id in corpus.docs:
    text = corpus.get_text(doc_id)
    label = corpus.target[doc_id][0]  # only the first target of each document is used
    texts_by_label[label].append(text)

pseudo_docs = defaultdict(str)
for label, texts in texts_by_label.items():
    # Each text is prefixed with a single space, so the resulting strings are
    # identical to those produced by repeated `+= ' ' + text`.
    pseudo_docs[label] = ''.join(' ' + text for text in texts)

## Summarize pseudo-docs

In [12]:
# Maps each pseudo-doc key to its generated summary text; filled by the summarization loop.
summary = {}

In [13]:
from transformers import pipeline

In [14]:
# Fully qualified Hugging Face model identifier (organisation/model).
# The bare shortcut 'bart-large-cnn' is a legacy alias that newer transformers
# releases no longer resolve; the namespaced id refers to the same checkpoint
# (the download URL in the recorded output already points at facebook/bart-large-cnn).
model_name = 'facebook/bart-large-cnn'

In [15]:
# Create the summarization pipeline backed by the model named in `model_name`.
# NOTE(review): this presumably fetches model files over the network on first use — confirm cache setup.
summarizer = pipeline('summarization', model=model_name)

Couldn't reach server at 'https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/modelcard.json' to download model card file.
Creating an empty model card.


In [16]:
# Generation settings forwarded to the summarizer call as
# min_length / max_length / do_sample.
# NOTE(review): lengths are presumably counted in tokens — confirm against the transformers docs.
# NOTE(review): several recorded summaries end mid-sentence, which suggests
# max_len = 30 is cutting them off; consider raising it.
min_len = 10
max_len = 30
do_sample = False

In [17]:
# Summarize every pseudo-document and store the summary text under the same key.
# The generation settings are identical for all calls, so they are gathered once.
generation_kwargs = dict(min_length=min_len, max_length=max_len, do_sample=do_sample)
for entity_id in pseudo_docs:
    outputs = summarizer(pseudo_docs[entity_id], **generation_kwargs)
    # The first element of the result carries the text under 'summary_text'.
    summary[entity_id] = outputs[0]['summary_text']

Your max_length is set to 30, but you input_length is only 29. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


In [18]:
from pprint import pprint

In [19]:
# Pretty-print the full entity-id -> summary mapping.
pprint(summary)

{'Q1005682': 'Hamburg is a city in Carver County, Minnesota, United States. '
             'The population was 513 at the 2010 census.',
 'Q10123': 'Until 2008, Tangerang Regency was subdivided into 36 Districts, '
           'which were further divided into several villages and '
           'administrative villages.',
 'Q10127': 'Tangerang is a city in the province of Banten, Indonesia. It has '
           'an area of 164.54 square kilometres (',
 'Q1013097': 'Toronto is a town in Clinton County, Iowa, United States.The '
             'population was 124 at the 2010 census.',
 'Q1055': 'A hamburger (also burger for short) is a sandwich consisting of one '
          'or more cooked patties of ground meat, usually beef',
 'Q1067027': 'Samara asteroid  26922 is an asteroid. It is located in the '
             'asteroid belt between Earth and Mars.',
 'Q1074991': 'Chittagong District, officially known as Chattogram District, is '
             'a district located in the south-eastern regio

---