SCRAPING NEWS USING PYTHON BEAUTIFUL SOUP

In [None]:

import sqlite3
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from urllib.parse import urljoin, urlparse

SCRIPT

In [None]:



# Define the URLs and blocks to scrape
urls = [
    {"url": "https://www.bramptonguardian.com/news/", "block": "2886699"},
    {"url": "https://www.mississauga.com/news/", "block": "2901984"},
    {"url": "https://www.durhamregion.com/news/", "block": "2891595"},
    {"url": "https://www.insidehalton.com/news/", "block": "2901111"},
    {"url": "https://www.yorkregion.com/news/", "block": "2858511"},
    {"url": "https://www.toronto.com/news/", "block": "1202226"}
]

def ensure_valid_url(url):
    """Ensure the URL has a scheme and is fully qualified."""
    parsed_url = urlparse(url)
    if not parsed_url.scheme:
        url = "https://" + url
    return url

def scrape_page(datau):
    url = ensure_valid_url(datau["url"])
    block_id = datau["block"]
    data = []

    # Send request to the URL
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return []

    # Parse the page content
    soup = BeautifulSoup(response.content, "html.parser")
    articles = soup.select(f"section #block-{block_id} article")

    # Extract data from each article
    for article in articles:
        headline_element = article.select_one(".card-headline h4 a")
        if not headline_element:
            continue

        body = headline_element.get_text()
        # Ensure the article URL is fully qualified
        article_url = urljoin(url, headline_element["href"])
        img_element = article.select_one(".image .tnt-blurred-image img")
        img_url = img_element["src"] if img_element else "alt"

        # Get article description from the article page
        description = scrape_article_description(article_url)

        data.append({
            "body": body,
            "img": img_url,
            "url": article_url,
            "description": description
        })

    return data

def scrape_article_description(article_url):
    article_url = ensure_valid_url(article_url)

    # Send request to the article URL
    response = requests.get(article_url)
    if response.status_code != 200:
        print(f"Failed to retrieve {article_url}")
        return ""

    # Parse the article content
    soup = BeautifulSoup(response.content, "html.parser")
    des_elements = soup.select(".asset-body p")
    description = " ".join([p.get_text() for p in des_elements])
    return description

# Scrape data from all URLs
all_data = []
for datau in urls:
    print(f"Scraping {datau['url']}...")
    page_data = scrape_page(datau)
    all_data.extend(page_data)

# Save the data to a JSON file
with open("news_data.json", "w", encoding="utf-8") as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)

print("News data saved!")


df=pd.DataFrame(all_data)
print(df)

Scraping https://www.bramptonguardian.com/news/...
Scraping https://www.mississauga.com/news/...
Scraping https://www.durhamregion.com/news/...
Scraping https://www.insidehalton.com/news/...
Scraping https://www.yorkregion.com/news/...
Scraping https://www.toronto.com/news/...
News data saved!
                                                 body  \
0   \n        \n\n        Man rushed to hospital a...   
1   \n        \n\n        Ready for some winter fu...   
2   \n        \n\n        ‘Honour the incredible s...   
3   \n        \n\n        Got mail? Here are other...   
4   \n        \n\n        Ontario, Canada recalls:...   
5   \n        \n\n        ‘LANES CLOSED’: Major la...   
6   \n        \n\n        Still want to see Taylor...   
7   \n        \n\n        New Canada Post stamps p...   
8   \n        \n\n        ‘We’re liable’: Mississa...   
9   \n        \n\n        ‘LANES CLOSED’: Major la...   
10  \n        \n\n        Man with ties to Peel Re...   
11  \n        \n\n   

PERFORMING DATA ANALYSIS ON DATA USING PANDAS

NEWS BY REGION

In [None]:
region = "brampton"

# Filter rows where 'body' contains any of the keywords
brampton_news = df[df["url"].str.contains(region, case=False, na=False)]
print(brampton_news)

region = "halton"

# Filter rows where 'body' contains any of the keywords
halton_news = df[df["url"].str.contains(region, case=False, na=False)]
print(halton_news)




NEWS BY KEYWORD

In [51]:
keywords = "crime"

# Filter rows where 'body' contains any of the keywords
crime_rows = df[df["description"].str.contains(keywords, case=False, na=False)]
print(crime_rows)

                                                 body  \
15  \n        \n\n        Mississauga man charged ...   
28  \n        \n\n        2 charged with vehicle t...   
29  \n        \n\n        2 arrested following vio...   

                                                  img  \
15  https://bloximages.chicago2.vip.townnews.com/m...   
28  https://bloximages.chicago2.vip.townnews.com/i...   
29  https://bloximages.chicago2.vip.townnews.com/i...   

                                                  url  \
15  https://www.mississauga.com/news/crime/mississ...   
28  https://www.insidehalton.com/news/crime/2-char...   
29  https://www.insidehalton.com/news/crime/2-arre...   

                                          description  
15  A Mississauga man is one of two people charged...  
28  Two people have been arrested following an att...  
29  Two arrests have been made following a home in...  


SAVING IT IN SQLITE DATABASE AND QUERYING THE DATA FOR DISPLAY USING PANDAS READ SQL

In [None]:


# Connect to SQLite and create a database file
conn = sqlite3.connect('news.db')
 
df.to_sql("Gtanews", conn, if_exists="replace", index=True)

query_statement = f"SELECT * FROM Gtanews"
query_output = pd.read_sql(query_statement, conn)
print(query_statement)
query_output



SELECT * FROM Gtanews


Unnamed: 0,index,body,img,url,description
0,0,\n \n\n Man rushed to hospital a...,https://bloximages.chicago2.vip.townnews.com/b...,https://www.bramptonguardian.com/news/man-rush...,A man is in serious condition after a shooting...
1,1,\n \n\n Ready for some winter fu...,https://bloximages.chicago2.vip.townnews.com/b...,https://www.bramptonguardian.com/news/ready-fo...,The holiday magic of WinterFest returns to Can...
2,2,\n \n\n ‘Honour the incredible s...,https://bloximages.chicago2.vip.townnews.com/b...,https://www.bramptonguardian.com/news/honour-t...,The City of Brampton is hosting Remembrance Da...
3,3,\n \n\n Got mail? Here are other...,https://bloximages.chicago2.vip.townnews.com/b...,https://www.bramptonguardian.com/news/got-mail...,As the threat of a possible work stoppage at C...
4,4,"\n \n\n Ontario, Canada recalls:...",https://bloximages.chicago2.vip.townnews.com/b...,https://www.bramptonguardian.com/news/ontario-...,"Thousands of space heaters sold on Amazon, bat..."
5,5,\n \n\n ‘LANES CLOSED’: Major la...,https://bloximages.chicago2.vip.townnews.com/b...,https://www.bramptonguardian.com/news/lanes-cl...,Drivers in Peel Region should expect delays or...
6,6,\n \n\n Still want to see Taylor...,https://bloximages.chicago2.vip.townnews.com/b...,https://www.bramptonguardian.com/news/still-wa...,You still have a chance to win some Taylor Swi...
7,7,\n \n\n New Canada Post stamps p...,https://bloximages.chicago2.vip.townnews.com/b...,https://www.bramptonguardian.com/news/new-cana...,"Ahead of Remembrance Day, Canada Post has issu..."
8,8,\n \n\n ‘We’re liable’: Mississa...,https://bloximages.chicago2.vip.townnews.com/m...,https://www.mississauga.com/news/council/we-re...,It’s game over for a bid to allow sports on Mi...
9,9,\n \n\n ‘LANES CLOSED’: Major la...,https://bloximages.chicago2.vip.townnews.com/m...,https://www.mississauga.com/news/lanes-closed-...,Drivers in Peel Region should expect delays or...
