# Get the focus words of a random bbc.com/sport article

In [1]:
# Import modules
import requests, random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from collections import Counter
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the BBC Sport front page and parse it with BeautifulSoup's built-in
# "html.parser" backend.
# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on an HTTP error instead of silently
# parsing an error page.
response = requests.get("https://www.bbc.com/sport", timeout=10)
response.raise_for_status()
page = response.content
soupy = BeautifulSoup(page, "html.parser")

In [3]:
# Drop every <script> and <style> element from the parsed front page
for tag in soupy.find_all(['script', 'style']):
    tag.extract()

In [4]:
# Collect headline URLs from the front page.
# Headline links carry the CSS class "gs-c-promo-heading".
anchor = soupy.find_all("a")
urllist = []
for link in anchor:
    href = link.get("href")
    # Keep only site-relative links such as /sport/3882920.
    # startswith() behind a truthiness check is safe for anchors whose href is
    # missing or empty, where indexing href[0] would raise.
    if "gs-c-promo-heading" in link.get("class", []) and href and href.startswith("/"):
        urllist.append("https://bbc.co.uk" + href)

In [5]:
# print(urllist)

In [6]:
# Pick one headline link at random and parse its article page.
# random.choice replaces the manual randint index arithmetic; the explicit
# check gives a readable error when no headline links were collected.
if not urllist:
    raise ValueError("No headline URLs were found on the front page")
article_response = requests.get(random.choice(urllist), timeout=10)
article_response.raise_for_status()
page = article_response.text
soup = BeautifulSoup(page, "html.parser")

# Remove <script> and <style> elements before extracting any text
for chunk in soup(['script', 'style']):
    chunk.extract()

print(soup.title.get_text())  # Just for visual purposes, print the page title

Wilder v Fury rematch? Wilder v Joshua? Joshua v Fury? What next for heavyweight division? - BBC Sport


In [7]:
# Keep the non-empty text lines of the page body.
# NOTE: the [3:-5] slice hard-codes the (dangerous) assumption that the main
# body of the article starts at line index 3 and ends at index -5.
body_lines = soup.body.get_text().splitlines()
article = [text for text in body_lines[3:-5] if text]
# article[:10]

In [8]:
# Tokenize the article:
#   - lowercase each line and split it into word tokens
#   - keep purely alphabetic tokens that are not English stopwords
# The stopword list is loaded once into a set; calling stopwords.words()
# inside the comprehension rebuilds and linearly scans the list for every token.
english_stopwords = set(stopwords.words('english'))
tokenized = [
    word
    for sentence in article
    for word in word_tokenize(sentence.lower())
    if word.isalpha() and word not in english_stopwords
]
# tokenized[:10]

In [9]:
# Five most frequent tokens, with their counts
token_counts = Counter(tokenized)
token_counts.most_common(5)

[('fury', 21), ('wilder', 16), ('boxing', 9), ('joshua', 8), ('rematch', 8)]

In [10]:
# Headline guess: the two most frequent tokens, capitalized
top_two = Counter(tokenized).most_common(2)
print(f"The article is about {top_two[0][0].capitalize()} and {top_two[1][0].capitalize()}")

The article is about Fury and Wilder


In [11]:
# Let's see if lemmatization changes things.
# Lemmatize with the part-of-speech argument 'v' so verbs are reduced to
# their root form, e.g. loving -> love, sharing -> share.
# The lemmatizer and the stopword set are built once here rather than once
# per token inside the comprehension.
lemmatizer = WordNetLemmatizer()
english_stopwords = set(stopwords.words('english'))
lemmatized = [
    lemmatizer.lemmatize(word, 'v')
    for sentence in article
    for word in word_tokenize(sentence.lower())
    if word.isalpha() and word not in english_stopwords
]

In [12]:
# Five most frequent lemmas, with their counts
lemma_counts = Counter(lemmatized)
lemma_counts.most_common(5)

[('fury', 21), ('wilder', 16), ('get', 11), ('box', 9), ('joshua', 8)]

In [13]:
# Final guess, taken from the non-lemmatized token counts, shown next to the
# page title for comparison
top_tokens = Counter(tokenized).most_common(5)
first_word = top_tokens[0][0].capitalize()
second_word = top_tokens[1][0].capitalize()
print(f"The guessed focus words of the article are '{first_word}' and '{second_word}'")
print(f"To confirm, the title is \n{soup.title.get_text()}")

The guessed focus words of the article are 'Fury' and 'Wilder'
To confirm, the title is 
Wilder v Fury rematch? Wilder v Joshua? Joshua v Fury? What next for heavyweight division? - BBC Sport
