In [2]:
import logging
import os
import shutil
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from newspaper import Article

In [3]:
# Link endings the crawler checks a listed site against before fetching it.
suffixes = [
    '.com', '.org', '.edu', '.gov', '.int', '.co',
    '.net', '.au', '.us', '.uk', '.ne', 'news',
]

# Substrings that flag an anchor's text as COVID-related. Matching is a
# case-sensitive `in` test, which is why capitalisation variants are listed.
covid_keywords = [
    'COVID', 'COVID-19', 'covid',
    'pandemic', 'Pandemic',
    'virus',
    'Omicron', 'omicron',
    'Delta', 'delta',
    'variant', 'outbreak',
    'mask', 'N95', 'KN95',
    'wave', 'symptoms',
    'testing', 'rapid test', 'pcr', 'PCR',
    'social distancing', 'Social distancing', 'Social Distancing',
    'epidemic', 'Epidemic',
    'fatality rate', 'Fatality rate', 'Fatality Rate',
    'flattening the curve', 'Flattening the Curve',
]

# Tag names that are searched (when they carry a class attribute) for article anchors.
where_to_look = ['div', 'section', 'span']

# Sites the crawler never attempts to download.
dysfunctional_pages = [
    'ieee.org',
    'www.naturalawakeningsmag.com',
    'libertyvideos.org',
]

In [4]:
def __crawler(url: str) -> None:
    """Download the front page of every news site listed on an index page.

    The index page at ``url`` is scanned for ``<span style="font-size: 12pt;">``
    elements; the text after the last ``(`` in each span (minus its final
    character) is taken as the site address. Addresses ending in one of
    ``suffixes`` are fetched over HTTPS and cached as
    ``news_channels/<site>html_page.txt``. Any previous ``news_channels``
    directory is discarded first.

    Args:
        url: Address of the index page that lists the news sites.

    Raises:
        requests.RequestException: If the index page itself cannot be
            fetched or answers with an HTTP error status. Failures on
            individual sites are logged and skipped instead.
    """
    # Without a timeout a single unresponsive server blocks the crawl forever.
    request_timeout = 30  # seconds

    index_page = requests.get(url, timeout=request_timeout)
    index_page.raise_for_status()
    soup = BeautifulSoup(index_page.content, 'lxml')
    news_sites = soup.find_all('span', {'style': 'font-size: 12pt;'})

    # str.endswith accepts a tuple, so endings of any length are honoured;
    # comparing a fixed four-character slice silently ignored '.co', '.uk', ...
    accepted_endings = tuple(suffixes)

    webpages = []
    for news_channel in news_sites:
        label = news_channel.text.strip()
        link = label[label.rfind('(') + 1:-1].strip()
        if not link.endswith(accepted_endings):
            continue
        # Store the bare host: 'https://' is re-added when fetching, and the
        # host is also used as part of the cache file name.
        for scheme in ('https://', 'http://'):
            if link.startswith(scheme):
                link = link[len(scheme):]
                break
        webpages.append(link)

    if os.path.isdir('news_channels'):
        shutil.rmtree('news_channels/')
    os.makedirs('news_channels', exist_ok=True)

    for website in webpages:
        if website in dysfunctional_pages:
            continue
        try:
            response = requests.get('https://' + website, timeout=request_timeout)
            response.raise_for_status()
            # With no charset in the header requests assumes ISO-8859-1 for
            # text; fall back to the encoding detected from the body instead.
            if 'charset' not in response.headers.get('Content-Type', '').lower():
                response.encoding = response.apparent_encoding
            # Write the decoded markup, not str(bytes) (which is the
            # "b'...'" repr full of literal \n and \x escapes). Non-ASCII
            # characters become HTML character references, so the cache file
            # is plain ASCII and reads back under any default text encoding.
            with open('news_channels/' + website + 'html_page.txt', 'w',
                      encoding='ascii', errors='xmlcharrefreplace') as rn:
                rn.write(response.text)
        except Exception as error:
            # Deliberately broad: one broken site must not abort the crawl,
            # but the failure should at least be visible.
            logging.getLogger(__name__).warning('Skipping %s: %s', website, error)

In [5]:
def _resolve_link(domain: str, href: str) -> str:
    """Turn an anchor's href into an absolute http(s) URL; return '' if unusable."""
    href = href.strip()
    if not href:
        return ''
    if href.startswith('www.'):
        # Scheme-less absolute address; urljoin would treat it as a relative path.
        href = 'https://' + href
    try:
        absolute = urljoin('https://' + domain + '/', href)
        scheme = urlparse(absolute).scheme
    except ValueError:
        # Malformed address (e.g. unbalanced IPv6 brackets).
        return ''
    # Drops javascript:, mailto:, tel: and similar non-article targets.
    return absolute if scheme in ('http', 'https') else ''


def __finder(url: str) -> list:
    """Collect COVID-related article links from the cached front pages.

    Every ``news_channels/<domain>html_page.txt`` file is parsed. Anchors
    that have an ``href`` and sit inside a ``where_to_look`` tag carrying a
    ``class`` attribute are candidates; a candidate is kept when its
    whitespace-normalised text contains one of ``covid_keywords`` and does
    not contain ``'css'``.

    Args:
        url: Unused; kept so existing callers keep working.

    Returns:
        A list of ``{'title': <anchor text>, 'link': <absolute URL>}`` dicts,
        ordered by cache file name and then by position in the page.
    """
    base_path = "news_channels/"
    page_suffix = 'html_page.txt'
    overall = []

    # Sorted so repeated runs over the same cache give the same order.
    for filename in sorted(os.listdir(base_path)):
        if not filename.endswith(page_suffix):
            # Not a cached page (e.g. a stray OS metadata file).
            continue
        domain = filename[:-len(page_suffix)]

        with open(base_path + filename, 'r', encoding='utf-8', errors='replace') as cached_page:
            soup = BeautifulSoup(cached_page.read(), 'lxml')

        # An anchor nested in several qualifying containers must be reported
        # once. Dict keys de-duplicate like a set but keep insertion order.
        candidates = {}
        for anchor in soup.find_all('a'):
            if not anchor.has_attr('href'):
                continue
            inside_classed_container = any(
                parent.name in where_to_look and parent.has_attr('class')
                for parent in anchor.parents
            )
            if inside_classed_container:
                candidates[anchor] = None

        for anchor in candidates:
            title = ' '.join(anchor.text.split())

            if 'css' in title:
                continue
            if not any(keyword in title for keyword in covid_keywords):
                continue

            link = _resolve_link(domain, anchor['href'])
            if not link:
                # Empty href or a non-http(s) target: nothing to download later.
                continue

            overall.append({
                'title': title,
                'link': link,
            })

    return overall

In [6]:
def find_articles(url: str) -> list:
    """Crawl the sites listed at ``url`` and return the articles found on them.

    Runs the crawler first so the ``news_channels`` cache is populated, then
    hands back whatever the finder extracts from that cache.
    """
    __crawler(url)
    articles = __finder(url)
    return articles

In [7]:
# Index pages on mediabiasfactcheck.com that are passed to find_articles:
# one for the "pro-science" listing and one for the "conspiracy" listing.
science_url = 'https://mediabiasfactcheck.com/pro-science/'
conspiracy_url = 'https://mediabiasfactcheck.com/conspiracy/'

In [8]:
# Crawl the sites listed on the pro-science index and collect their COVID-related links.
science_articles = find_articles(science_url)

In [9]:
# Same for the conspiracy index. find_articles rebuilds the news_channels cache on every call.
conspiracy_articles = find_articles(conspiracy_url)

In [10]:
import json

def write_to_storage(name: str, articles: list, directory: str = "../Data/extract") -> None:
    """Write ``articles`` as indented JSON to ``<directory>/<name>.json``.

    Args:
        name: File name without the ``.json`` extension.
        articles: JSON-serialisable list to store (the article dicts
            produced by ``find_articles``).
        directory: Folder that receives the file; created if missing.
            Defaults to the original hard-coded ``../Data/extract``.

    Raises:
        TypeError: If ``articles`` contains values ``json`` cannot serialise.
        OSError: If the directory or file cannot be created.
    """
    if directory:
        # A fresh checkout may not have the output folder yet.
        os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, f"{name}.json")
    with open(filename, 'w', encoding='utf-8') as storage:
        storage.write(json.dumps(articles, indent = 4))

In [11]:
# Persist the pro-science results as science.json in the extract folder.
write_to_storage('science', science_articles)

In [12]:
# Persist the conspiracy results as conspiracy.json in the extract folder.
write_to_storage('conspiracy', conspiracy_articles)

In [13]:
import shutil

# Remove the crawler's scratch directory. ignore_errors keeps this cell
# re-runnable: without it a second run fails because the folder is already gone.
shutil.rmtree('news_channels/', ignore_errors=True)