In [1]:
from bs4 import BeautifulSoup
import requests

# User agents (don't be identified as a bot!)
headers = {'User-Agent': 'Mozilla/5.0'}

# Fetching the content from the URL
url = 'https://www.washingtonpost.com/world/2023/01/30/china-sichuan-single-parents-birth-rate/'
req = requests.get(url, headers=headers)

print("Request status code: ",req.status_code) # 200 means you're successfully connected

# Parsing the URL content and storing in a variable
soup = BeautifulSoup(req.text,'html.parser')

# Returning article containter, <p> tags and <h1> (commonly the article title)
article_div = soup.find('article')

title = soup.find('h1').text
paragraphs = soup.find_all('p')

article_content = title + '.'

# Looping through the paragraphs and adding them to the variable
for p in paragraphs:  
    article_content += p.text + '. '

print("Title: ", title)
print("Content :", article_content[len(title)+1:len(title)+100], "...")

Request status code:  200
Title:  China province to offer benefits to single parents as birthrate drops
Content : Sign in. Amid fears of a demographic crisis in China after the country’s birthrate reached its lowe ...


In [2]:
# Import spacy and english core vocabulary:
import spacy
nlp = spacy.load('en_core_web_sm')

# Convert raw string to analyzable nlp object
article_content=nlp(article_content)

#Print parts of speach and lemmatized word
for token in article_content[:10]:
    print(f"{token.text:{15}}{token.pos_:{7}}{token.lemma_:{15}}")

China          PROPN  China          
province       NOUN   province       
to             PART   to             
offer          VERB   offer          
benefits       NOUN   benefit        
to             ADP    to             
single         ADJ    single         
parents        NOUN   parent         
as             ADP    as             
birthrate      NOUN   birthrate      


In [3]:
from string import punctuation

punctuations = list(punctuation) + ['', '”', '"', '...', '. .']

stop_words = nlp.Defaults.stop_words

print(punctuations)

def tokenize(sentence):

    # lemmatizing
    sentence = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in sentence ]
    # removing stop words and punctuation
    sentence = [ word for word in sentence if word not in stop_words and word not in punctuations ]        
    return sentence

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '', '”', '"', '...', '. .']


In [4]:
frequency_table = dict()

def analyze_frequency(sentence):
    list_words = tokenize(sentence)
    
    for word in list_words:
        if word in frequency_table:
            frequency_table[word] += 1
        else:
            frequency_table[word] = 1
        

In [5]:
for sentence in nlp(article_content).sents:
    analyze_frequency(sentence)
    
frequency_table

{'china': 10,
 'province': 6,
 'offer': 1,
 'benefit': 4,
 'single': 6,
 'parent': 12,
 'birthrate': 4,
 'drop': 1,
 'sign': 1,
 'amid': 2,
 'fear': 1,
 'demographic': 5,
 'crisis': 3,
 'country': 4,
 'reach': 1,
 'low': 1,
 'level': 1,
 'record': 2,
 'year': 8,
 'official': 2,
 'populous': 1,
 'launch': 1,
 'policy': 13,
 'allow': 5,
 'unmarried': 6,
 'people': 10,
 'register': 6,
 'birth': 12,
 'unlimited': 1,
 'number': 2,
 'child': 16,
 '..': 14,
 'health': 4,
 'commission': 4,
 'sichuan': 7,
 'home': 1,
 'nearly': 1,
 '84': 1,
 'million': 1,
 'eligible': 1,
 'previously': 3,
 'reserve': 1,
 'married': 4,
 'couple': 4,
 'come': 1,
 'population': 5,
 'shrink': 2,
 'time': 4,
 'decade': 2,
 'decline': 3,
 '6.77': 1,
 '1,000': 1,
 '7.52': 1,
 'previous': 1,
 'nationwide': 1,
 'registry': 1,
 'system': 2,
 'recently': 1,
 'qualify': 1,
 'include': 2,
 'insurance': 2,
 'pay': 1,
 'leave': 1,
 'new': 3,
 'begin': 1,
 'feb.': 1,
 '15': 1,
 'cap': 1,
 '40': 1,
 'lift': 1,
 'quota': 1,
 'wa

In [6]:
def hash(sentence) -> list:
    
    hash = ''
    for value in sentence[:10]:
        try:
            hash += value[0]
        except:
            print(value, ' ', sentence)
        
    return hash

In [7]:
sentence_weight = dict()

def sentence_score(raw_sentence):
    sentence=tokenize(raw_sentence)
    sent_count = 0
    
    word_count = len(sentence)
    
    for key_word, weight in frequency_table.items():
        if key_word in sentence:
            sent_count += 1
            
            if hash(sentence) in sentence_weight:
                sentence_weight[hash(sentence)] += weight
            elif word_count<5:
                continue
            else:
                sentence_weight[hash(sentence)] = weight
    try:         
        sentence_weight[hash(sentence)] = sentence_weight[hash(sentence)] / sent_count
    except:
        pass
      
        

In [9]:
# Let's run it

for sentence in article_content.sents:
    
    analyze_frequency(sentence)
    sentence_score(sentence)
    

sorted(sentence_weight.items(), key=lambda x: x[1], reverse=True)


[('onpibmpcrb', 16.639053254437872),
 ('npbf1upsrc', 15.497041420118343),
 ('laesbspfui', 14.390532544378699),
 ('dlcp…tpgfd', 14.071005917159763),
 ('ddcypcchcc', 13.831632653061224),
 ('cpd6ysda', 13.53125),
 ('mpposhmrmc', 13.5),
 ('cbfadc', 13.25),
 ('pssfhcccas', 13.159722222222221),
 ('npsfaablgm', 13.1328),
 ('cpobspbd', 12.8125),
 ('oshctlmpis', 12.55363321799308),
 ('zsibpupiwg', 12.083044982698961),
 ('afdcccbrll', 11.899408284023668),
 ('psssyclfsc', 11.785123966942148),
 ('ccpmdfascu', 11.458333333333334),
 ('rehcepuwsm', 11.19047619047619),
 ('smdpses', 11.183673469387754),
 ('pauprbcgch', 10.911989795918368),
 ('nrsmcsprar', 10.438271604938272),
 ('ccpsytdbd6', 10.04),
 ('hcsphn8mpu', 9.910034602076124),
 ('cappltbpd', 9.629629629629628),
 ('ruac2clpst', 9.617728531855954),
 ('pwcwaerfcp', 9.5),
 ('lcafemw', 9.36734693877551),
 ('t4yplbqwfp', 8.938775510204081),
 ('illflr—sqq', 8.7724609375),
 ('lrssarmow.', 8.690000000000001),
 ('asdgwnapdi', 8.42361111111111),
 ('fdwcpf

In [10]:
# Calculate score avarage
def avg_score():
    total = 0
    for weight in sentence_weight.values():
        total += weight
        
    return total/len(sentence_weight)

limit_score = avg_score()

In [11]:

def summary(n_sents = 3):
    sentence_counter = 0
    article_summary = 'Title:'+'\n'+title +'\n'*2 + 'Summary:'+'\n'
    
    # Skip first sentence since the title will always be shown
    for raw_sentence in list(article_content.sents)[1:]:
        if hash(tokenize(raw_sentence)) in sentence_weight and sentence_weight[hash(tokenize(raw_sentence))] >= (limit_score):
            article_summary += raw_sentence.text + '\n'*2
            sentence_counter += 1
            if sentence_counter == n_sents: break
            
    return nlp(article_summary)

summary()
    

Title:
China province to offer benefits to single parents as birthrate drops

Summary:
Amid fears of a demographic crisis in China after the country’s birthrate reached its lowest level on record last year, officials in one of its most populous provinces have launched a policy that allows unmarried people to register the births of an unlimited number of children..

Under the new policy, beginning on Feb. 15, unmarried parents in Sichuan can register with no cap on the number of children..

“There was so much discussion around the lifting of the one-child policy … but this is the first time, with this province, that the government finally gave up this desire to tell people how many births they could have.”.
