# Data Scraping ⛏️

Let's begin by importing the libraries and checking their versions.

In [9]:
import sklearn
import pandas
import seaborn
import requests
from bs4 import BeautifulSoup

# Report the installed version of every library this notebook relies on.
for label, module in (
    ("scikit-learn version:", sklearn),     # 1.6.1
    ("pandas version:", pandas),            # 2.2.3
    ("seaborn version:", seaborn),          # 0.13.2
    ("requests version:", requests),        # 2.31.0
):
    print(label, module.__version__)

scikit-learn version: 1.6.1
pandas version: 2.2.3
seaborn version: 0.13.2
requests version: 2.31.0


## Article Details 🔎
For each article, I will collect the following:
- Title 
- Link 
- Author (if available)
- Publication Date
- Content/text
- Classification: "Fake" (0) or "Real" (1).

## Real News ✅ 
### BBC 🌐

I believe that starting with the BBC is the most straightforward option, as my research shows that it is ranked as the most trusted news source by Americans. Even though we live in Europe, I choose to trust this statistic.

In [None]:
# Base URL of the BBC site; used to build the news index URL and to turn
# relative article links into absolute ones.
bbc_url = 'https://www.bbc.com'

def scrape_article_details(link, timeout=10):
    """Fetch one BBC article page and extract its text, author and date.

    Args:
        link: Absolute URL of the article page.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests.get`` can block forever on a stalled connection and
            freeze the whole scrape.

    Returns:
        A tuple ``(article_text, author, publication_date)``. Any element
        that cannot be found on the page is returned as an empty string.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Article text: every <p> inside the first <article> element, each one
    # followed by a blank line.
    article_body = soup.find('article')
    paragraphs = article_body.find_all('p') if article_body else []
    article_text = ''.join(p.get_text(strip=True) + '\n\n' for p in paragraphs)

    # Author.
    # NOTE(review): these look like build-generated class names and may change
    # when the site is redeployed - confirm the selector still matches.
    author_tag = soup.find('span', class_='sc-b42e7a8f-7 kItaYD')
    author = author_tag.get_text(strip=True) if author_tag else ''

    # Publication date: value of the first <time> tag carrying a datetime attribute.
    time_tag = soup.find('time', {'datetime': True})
    publication_date = time_tag['datetime'] if time_tag else ''

    return article_text, author, publication_date

def scrape_news(timeout=10):
    """Scrape the BBC news index and every article it links to.

    Args:
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        A list of dicts, one per article, with the keys ``title``, ``link``,
        ``source``, ``journalist``, ``date``, ``content`` and
        ``classification`` (always 1, i.e. real news).

    Raises:
        requests.RequestException: If the index page itself cannot be
            fetched. Failures on individual articles are reported and
            skipped so that one bad link does not discard the whole run.
    """
    from urllib.parse import urljoin

    url = f'{bbc_url}/news'

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []
    for anchor in soup.find_all('a', class_='sc-2e6baa30-0 gILusN'):
        title_element = anchor.find('h2', class_='sc-87075214-3 eywmDE')
        href = anchor.get('href')
        # Ignore cards without a headline or without a usable link.
        if title_element is None or not href:
            continue

        title = title_element.get_text(strip=True)
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # (including protocol-relative and slash-less hrefs) against the site.
        link = urljoin(f'{bbc_url}/', href)

        # additional details
        try:
            article_text, author, publication_date = scrape_article_details(link)
        except requests.RequestException as exc:
            print(f"Skipping {link}: {exc}")
            continue

        articles.append({
            'title': title,
            'link': link,
            'source': 'BBC',
            'journalist': author,
            'date': publication_date,
            'content': article_text,
            'classification': 1,  # 1 for real news
        })

    return articles

# Run the BBC scraper.
news_articles = scrape_news()

# Persist one row per article; the DataFrame index is left out of the CSV.
df = pandas.DataFrame(data=news_articles)
df.to_csv(path_or_buf="bbc_news_articles_v6.csv", index=False)

print("Saved.")

Saved.


## Fake News ❌
### The Onion 🧅

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL of The Onion; used to build the paginated news listing URLs and to
# turn relative article links into absolute ones.
onion_url = 'https://theonion.com'

def scrape_onion_article_details(link, timeout=10):
    """Fetch one Onion article page and extract its text and publication date.

    Args:
        link: Absolute URL of the article page.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests.get`` can block forever on a stalled connection.

    Returns:
        A tuple ``(article_text, publication_date)``. Either element is an
        empty string when it cannot be found on the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Article text: every <p> inside the post-content container, each one
    # followed by a blank line.
    article_body = soup.find('div', class_='entry-content single-post-content single-post-content--has-watermark wp-block-post-content has-echo-font-size is-layout-flow wp-block-post-content-is-layout-flow')
    paragraphs = article_body.find_all('p') if article_body else []
    article_text = ''.join(p.get_text(strip=True) + '\n\n' for p in paragraphs)

    # Publication date: datetime attribute of the first <time> tag, if any.
    time_tag = soup.find('time')
    publication_date = time_tag.get('datetime', '') if time_tag else ''

    return article_text, publication_date

def scrape_onion_news(pages=10, timeout=10):
    """Scrape The Onion's paginated news listing and every article on it.

    Pagination stops early when a listing page does not return HTTP 200 or
    contains no article headings.

    Args:
        pages: Maximum number of listing pages to visit, starting at page 1.
        timeout: Seconds to wait for each listing page before giving up.

    Returns:
        A list of dicts, one per article, with the keys ``title``, ``link``,
        ``source``, ``journalist`` (always empty), ``date``, ``content`` and
        ``classification`` (always 0, i.e. fake news).
    """
    from urllib.parse import urljoin

    heading_class = 'has-link-color wp-elements-aad9b6425bfbbfe81f8771ca6f420d00 wp-block-post-title has-text-color has-primary-2-color has-rocky-condensed-font-family'

    articles = []
    for page in range(1, pages + 1):  # Limit to the specified number of pages
        url = f'{onion_url}/news/page/{page}/'
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        # Stop paginating once a listing page has no article headings.
        headings = soup.find_all('h3', class_=heading_class)
        if not headings:
            break

        for heading in headings:
            link_tag = heading.find('a')
            href = link_tag.get('href') if link_tag else None
            # A heading without a link would otherwise resolve to the site
            # homepage and be stored as if it were an article.
            if not href:
                continue

            title = heading.get_text(strip=True)
            # Ensure the link is a full URL
            link = urljoin(f'{onion_url}/', href)

            # Scrape article details; one broken article must not discard
            # everything collected so far.
            try:
                article_text, publication_date = scrape_onion_article_details(link)
            except requests.RequestException as exc:
                print(f"Skipping {link}: {exc}")
                continue

            articles.append({
                'title': title,
                'link': link,
                'source': 'The Onion',
                'journalist': '',  # Authors are typically not listed
                'date': publication_date,
                'content': article_text,
                'classification': 0,  # Mark as fake news
            })

    return articles

# Scrape up to 10 listing pages of The Onion
onion_articles = scrape_onion_news(pages=10)

# Persist one row per article; the DataFrame index is left out of the CSV
df = pd.DataFrame(data=onion_articles)
df.to_csv(path_or_buf="onion_news_articles.csv", index=False)

print("Scraping completed and saved to 'onion_news_articles.csv'.")

Scraping completed and saved to 'onion_news_articles.csv'.


### The People's Voice 🗣

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL of The People's Voice; used to build the paginated category
# listing URLs and to turn relative article links into absolute ones.
base_url = 'https://thepeoplesvoice.tv'

def scrape_article_details(link, timeout=10):
    """Fetch one People's Voice article and extract its text, date and author.

    NOTE: this redefines the BBC helper of the same name from earlier in the
    notebook, and returns its values in a different order
    (text, date, author).

    Args:
        link: Absolute URL of the article page.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests.get`` can block forever on a stalled connection.

    Returns:
        A tuple ``(article_text, publication_date, author)``. Any element
        that cannot be found on the page is returned as an empty string. The
        date is the visible link text of the date element, not a
        machine-readable attribute.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Article text: every <p> inside the entry-content container, each one
    # followed by a blank line.
    article_body = soup.find('div', class_='entry-content clearfix')
    paragraphs = article_body.find_all('p') if article_body else []
    article_text = ''.join(p.get_text(strip=True) + '\n\n' for p in paragraphs)

    # Publication date: text of the link inside the date span.
    date_span = soup.find('span', class_='entry-meta-date updated')
    date_link = date_span.find('a') if date_span else None
    publication_date = date_link.get_text(strip=True) if date_link else ''

    # Author: text of the "fn" link inside the author span.
    author_span = soup.find('span', class_='entry-meta-author author vcard')
    author_link = author_span.find('a', class_='fn') if author_span else None
    author = author_link.get_text(strip=True) if author_link else ''

    return article_text, publication_date, author

def scrape_news(pages=10, timeout=10):
    """Scrape The People's Voice news category and every article listed on it.

    Pagination stops early when a listing page does not return HTTP 200 or
    contains no article headings.

    NOTE: this redefines the BBC ``scrape_news`` from earlier in the notebook.

    Args:
        pages: Maximum number of listing pages to visit, starting at page 1.
        timeout: Seconds to wait for each listing page before giving up.

    Returns:
        A list of dicts, one per article, with the keys ``title``, ``link``,
        ``source``, ``journalist``, ``date``, ``content`` and
        ``classification`` (always 0, i.e. fake news).
    """
    from urllib.parse import urljoin

    articles = []
    for page in range(1, pages + 1):
        url = f'{base_url}/category/news/page/{page}/'
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        # Stop paginating once a listing page has no article headings.
        headings = soup.find_all('h3', class_='entry-title mh-posts-list-title')
        if not headings:
            break

        for heading in headings:
            link_tag = heading.find('a')
            href = link_tag.get('href') if link_tag else None
            # A heading without a link would otherwise resolve to the site
            # homepage and be stored as if it were an article.
            if not href:
                continue

            title = heading.get_text(strip=True)
            # Ensure the link is a full URL
            link = urljoin(f'{base_url}/', href)

            # Scrape article details; one broken article must not discard
            # everything collected so far.
            try:
                article_text, publication_date, author = scrape_article_details(link)
            except requests.RequestException as exc:
                print(f"Skipping {link}: {exc}")
                continue

            articles.append({
                'title': title,
                'link': link,
                'source': "The People's Voice",
                'journalist': author,
                'date': publication_date,
                'content': article_text,
                'classification': 0,  # Mark as fake news
            })

    return articles

# Scrape up to 10 listing pages of The People's Voice
news_articles = scrape_news(pages=10)

# Persist one row per article; the DataFrame index is left out of the CSV
df = pd.DataFrame(data=news_articles)
df.to_csv(path_or_buf="peoples_voice_articles.csv", index=False)

print("Scraping completed and saved to 'peoples_voice_articles.csv'.")

Scraping completed and saved to 'peoples_voice_articles.csv'.
