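# 0. Install dependencies

Requires `transformers`, `torch` and `sentencepiece`, e.g. `pip install transformers torch sentencepiece`.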

# 1. Import and Load Model

In [1]:
# Importing dependencies from transformers
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [2]:
# Tokenizer for the "google/pegasus-xsum" checkpoint
tokenizer = PegasusTokenizer.from_pretrained(
    "google/pegasus-xsum",
)

In [3]:
# Seq2seq model weights from the same "google/pegasus-xsum" checkpoint as the tokenizer.
# NOTE(review): loading reports the encoder/decoder `embed_positions` weights as
# newly initialized; the summary produced further down is still coherent, so this
# looks benign — confirm for the installed transformers version.
model = PegasusForConditionalGeneration.from_pretrained(
    "google/pegasus-xsum",
)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 2. Perform Abstractive Summarization

In [4]:
# Passage to summarize. The bracketed markers ([k], [l], [m], [n]) are part of the
# string and are passed to the tokenizer unchanged; the literal also begins and
# ends with a newline.
text = """
China (Chinese: 中国; pinyin: Zhōngguó), officially the People's Republic of China (PRC),[k] is a country in East Asia. It is the world's second-most-populous country, with a population exceeding 1.4 billion. China spans the equivalent of five time zones and borders fourteen countries by land,[l] tied with Russia as having the most of any country. With an area of nearly 9.6 million square kilometers (3,700,000 sq mi), it is the third-largest country by total land area.[m] The country is divided into 22 provinces,[n] five autonomous regions, four municipalities, and two semi-autonomous special administrative regions. Beijing is the national capital, while Shanghai is the most populous city and largest financial center.
"""

In [5]:
# Encode the passage as token ids: truncation is enabled, padding is set to the
# longest sequence in the batch, and the result is returned as PyTorch tensors ("pt").
tokens = tokenizer(
    text,
    truncation=True,
    padding="longest",
    return_tensors="pt",
)

In [6]:
# Inspect the encoded input (a mapping that includes the `input_ids` tensor)
tokens

{'input_ids': tensor([[ 1224,   143, 21848,   151,   110,   105,   206,  4713, 46727,   151,
          2464,  1467,   105,  4652, 16200,   105,   312,  4551,   109,  2184,
           131,   116,  4498,   113,  1224,   143, 61788,   312,  4101,  1052,
          1100,   117,   114,   531,   115,  1445,  2661,   107,   168,   117,
           109,   278,   131,   116,   453,   121,  7952,   121, 11327, 34874,
           531,   108,   122,   114,  1948, 13972, 13602,  1722,   107,  1224,
         14916,   109,  4526,   113,   668,   166,  8141,   111,  8257, 19734,
          1105,   141,  1241,   108,  4101,  1191,  1100,  5483,   122,  3260,
           130,   458,   109,   205,   113,   189,   531,   107,   441,   142,
           345,   113,  1517, 56492,   604,  2151, 11294,  7567, 23285, 15702,
          7349,  8194,   312,   126,   117,   109,   776,   121, 22504,   531,
           141,   916,  1241,   345,   107,  4101,   208,  1100,   139,   531,
           117,  5215,   190,  2168, 1

In [7]:
# Summarize: unpack the encoded input into `model.generate`, which returns the
# generated summary token ids. No extra generation arguments are passed.
summary = model.generate(**tokens)

In [8]:
# Token ids of the first generated sequence (the batch holds a single input text)
summary[0]

tensor([    0,  1224,   117,   109,   278,   131,   116,   453,   121,  7952,
          121, 11327, 34874,   531,   108,   122,   114,  1948, 13972, 13602,
         1722,   107,     1])

In [9]:
# Decode the summary ids back into text. Special tokens are kept in this raw
# decode, so the string is wrapped in "<pad>" and "</s>"; passing
# `skip_special_tokens=True` to `decode` omits them.
tokenizer.decode(summary[0])

"<pad>China is the world's second-most-populous country, with a population exceeding 1.4 billion.</s>"

In [10]:
# Decode summary as clean text: `skip_special_tokens=True` drops the "<pad>" and
# "</s>" markers so only the human-readable sentence remains.
tokenizer.decode(summary[0], skip_special_tokens=True)

"China is the world's second-most-populous country, with a population exceeding 1.4 billion."

In [11]:
# Number of generated sequences (one, since a single text was encoded)
len(summary)

1