# Scrape Headlines from FT.com

## 1. Setup Environment

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

## 2. Scrape FT.com

In [None]:
def scrape_ft_headlines(*, min_length=20, timeout=10):
    """
    Scrape candidate headlines from the Financial Times homepage.

    The text of every ``<a>`` and ``<h3>`` element is collected and kept when
    it is longer than ``min_length`` characters. Adjust the selectors if the
    site structure changes.

    Args:
        min_length: A text must be strictly longer than this many characters
            to be treated as a headline.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.

    Returns:
        A list of unique headline strings in the order they first appear on
        the page. An empty list is returned (after printing a message) when
        the request fails or the server does not answer with HTTP 200.
    """
    url = "https://www.ft.com/"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/90.0.4430.93 Safari/537.36"
        )
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    # Connection problems are reported the same way as a bad status code
    # instead of surfacing as a traceback.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to retrieve the page:", exc)
        return []

    if response.status_code != 200:
        print("Failed to retrieve the page. HTTP Status Code:", response.status_code)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Look for common headline tags; you might need to adjust these if FT changes their layout.
    candidates = (tag.get_text(strip=True) for tag in soup.find_all(['a', 'h3']))
    long_texts = (text for text in candidates if len(text) > min_length)

    # A headline link nested inside an <h3> is matched once per tag, so the
    # same text can show up repeatedly. dict.fromkeys drops the repeats while
    # keeping first-seen order.
    return list(dict.fromkeys(long_texts))

# Fetch the headlines once; later cells reuse the module-level `headlines` list.
print("Scraping the Financial Times homepage for headlines...")
headlines = scrape_ft_headlines()

if headlines:
    # Show the results as a numbered list, counting from 1.
    print("\nScraped Headlines:")
    for position, title in enumerate(headlines, start=1):
        print(f"{position}. {title}")
else:
    print("No headlines found. The page structure may have changed or scraping might be blocked.")

## 3. Plot Word Cloud of Headlines

In [None]:
def generate_wordcloud(headlines):
    """
    Generate and display a word cloud from the provided headlines.

    The headlines are joined, lowercased and reduced to letters only before
    being handed to ``WordCloud``. Punctuation and digits are replaced by a
    space rather than deleted so that words joined by a hyphen, dash or slash
    stay separate tokens.

    Args:
        headlines: Iterable of headline strings. ``None`` or an empty
            iterable is accepted.

    Returns:
        None. The figure is shown with ``plt.show()``. If the headlines
        contain no letters at all, a message is printed and nothing is
        plotted.
    """
    # Combine all headlines into one lowercase text string
    text = " ".join(headlines or []).lower()

    # Drop apostrophes so contractions and possessives remain a single token,
    # then turn every other run of non-letter characters into one space.
    # The character classes are Unicode-aware, so accented letters survive.
    text = re.sub(r"['’]", "", text)
    text = re.sub(r"[\W\d_]+", " ", text).strip()

    # WordCloud.generate raises ValueError when there is nothing to plot.
    if not text:
        print("No words available to build a word cloud.")
        return

    # Define additional stopwords if needed
    custom_stopwords = {"ft", "financial", "times"}
    stopwords = STOPWORDS.union(custom_stopwords)

    # Generate the word cloud
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        stopwords=stopwords,
        collocations=False  # Prevents joining words that appear together often
    ).generate(text)

    # Display the generated word cloud using matplotlib
    plt.figure(figsize=(15, 7.5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Word Cloud of FT Headlines", fontsize=20)
    plt.show()

# Only build the cloud when scraping produced something; WordCloud raises
# ValueError when it is given no words to plot.
if headlines:
    print("\nGenerating word cloud...")
    generate_wordcloud(headlines)
else:
    print("\nSkipping word cloud: no headlines were scraped.")