accessing appropriate pages on MediaBiasFactCheck to collect sites to visit

In [5]:
science_url = 'https://mediabiasfactcheck.com/pro-science/'
conspiracy_url = 'https://mediabiasfactcheck.com/conspiracy/'

In [6]:
import requests

science_page = requests.get(science_url, timeout=5)
conspiracy_page = requests.get(conspiracy_url, timeout=5)

parsing through both pages and finding all sites that can be visited

In [7]:
from bs4 import BeautifulSoup

science_soup = BeautifulSoup(science_page.content, 'lxml')
science_sites = science_soup.find_all('span', {'style': 'font-size: 12pt;'})

In [8]:
conspiracy_soup = BeautifulSoup(conspiracy_page.content, 'lxml')
conspiracy_sites = conspiracy_soup.find_all('span', {'style': 'font-size: 12pt;'})

extracting all science and conspiracy pages that can be visited

In [9]:
def create_db(sites):
    # will ensure request can deal with all links
    def format_links(link):
        if not link.startswith(('https://www', "http://www.", "http://", "https://")):
            if not link.startswith(('www.')):
                link = 'www.' + link
            if not link.startswith(('http://', 'https://')):
                link = 'https://' + link
        return link

    db = []
    for site in sites:
        link = format_links(site.text[site.text.rfind('(')+1:-1])
        source = site.text[:site.text.rfind('(')-1]

        if link.count(' ') == 0 and len(source) >= 2:
            db.append({"name": source, "url": link})    
    return db

In [10]:
science_db = create_db(science_sites)
conspiracy_db = create_db(conspiracy_sites)

In [11]:
science_db[:10]

[{'name': 'Acoustics Journal',
  'url': 'https://www.mdpi.com/journal/acoustics'},
 {'name': 'Acta Neuropathologica Communications',
  'url': 'https://www.actaneurocomms.biomedcentral.com'},
 {'name': 'Actuators Journal',
  'url': 'https://www.mdpi.com/journal/actuators'},
 {'name': 'Administrative Sciences Journal',
  'url': 'https://www.mdpi.com/journal/admsci'},
 {'name': 'Adolescents Journal',
  'url': 'https://www.mdpi.com/journal/adolescents'},
 {'name': 'Advances in Respiratory Medicine',
  'url': 'https://www.mdpi.com/journal/arm'},
 {'name': 'Aerospace Journal',
  'url': 'https://www.mdpi.com/journal/aerospace'},
 {'name': 'Agriculture Journal',
  'url': 'https://www.mdpi.com/journal/agriculture'},
 {'name': 'AgriEngineering Journal',
  'url': 'https://www.mdpi.com/journal/agriengineering'},
 {'name': 'Agrochemicals Journal',
  'url': 'https://www.mdpi.com/journal/agrochemicals'}]

In [12]:
conspiracy_db[:10]

[{'name': '2020ElectionCenter.c', 'url': 'https://www.2020ElectionCenter.co'},
 {'name': '21st Century Wire', 'url': 'https://www.21stcenturywire.com'},
 {'name': '79Days.Ne', 'url': 'https://www.79Days.New'},
 {'name': '369 News', 'url': 'https://www.369news.net'},
 {'name': '911Truth.org', 'url': 'https://www.911truth.org'},
 {'name': 'Above Top Secret', 'url': 'https://www.abovetopsecret.com'},
 {'name': 'A Call for an Uprising',
  'url': 'https://www.acallforanuprising.com'},
 {'name': 'ACNLatitudes', 'url': 'https://www.latitudes.org'},
 {'name': 'Activist Post', 'url': 'https://www.activistpost.com'},
 {'name': 'Actualized.o', 'url': 'https://www.Actualized.or'}]

validation to ensure all links have valid headers

In [13]:
from urllib.parse import urlparse

def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme in ['http', 'https'], result.netloc])
    except ValueError:
        return False

In [14]:
assert [is_valid_url(page['url']) for page in science_db].count(False) == 0

In [15]:
assert [is_valid_url(page['url']) for page in conspiracy_db].count(False) == 0

saving both lists to ensure we don't have to scrape MediaBiasFactCheck every single time we want to find new articles from these websites for the database

In [16]:
import json

science_json = json.dumps(science_db, indent=4)
conspiracy_json = json.dumps(conspiracy_db, indent=4)

In [17]:
with open('data/website_metadata/science.json', 'w') as file:
    file.write(science_json)

In [18]:
with open('data/website_metadata/conspiracy.json', 'w') as file:
    file.write(conspiracy_json)

collecting articles related to COVID-19 using the ArticleFinder class

In [19]:
from src.model.article_finder import ArticleFinder

# finding the scientific articles
science_articles = ArticleFinder.find_articles(article_type="science")

In [None]:
science_articles[:10]

[{'title': 'Sound Environment during Dental Treatment in Relation to COVID-19 Pandemic',
  'link': 'https://www.mdpi.com/journal/acoustics/2624-599X/5/4/56'},
 {'title': 'The Role of Agile Values in Enhancing Good Governance in Public Administration during the COVID-19 Crisis: An International Survey',
  'link': 'https://www.mdpi.com/journal/admsci/2076-3387/13/12/248'},
 {'title': '\\n\\n\\nA Comparative Study of Communication Management Strategies on Social Media in the Hotel Industry in Spain in Times of COVID-19\\n\\n',
  'link': 'https://www.mdpi.com/journal/admsci/2076-3387/13/11/240'},
 {'title': 'How Can Digitalization Mitigate Pandemic-Induced Demand Shocks? A Case Study from the Apparel Industry',
  'link': 'https://www.mdpi.com/journal/admsci/2076-3387/13/12/257'},
 {'title': 'Factors Associated with the Prevalence and Treatment of Depression in Adolescent Males in the US during the Period of the COVID-19 Pandemic',
  'link': 'https://www.mdpi.com/journal/adolescents/2673-70

In [None]:
len(science_articles)

162

In [None]:
# finding the conspiracy articles
conspiracy_articles = ArticleFinder.find_articles(article_type="conspiracy")

In [None]:
conspiracy_articles[:10]

[{'title': 'Italian Health Minister Under Investigation for Murder for Concealing COVID-19 Vaccine Deaths',
  'link': 'https://www.abovetopsecret.com/forum/thread1342167/pg1'},
 {'title': '25% of COVID Vaxxed Now Have VAIDS, Cambridge Scientists Warn',
  'link': 'https://www.abovetopsecret.com/forum/thread1341936/pg1'},
 {'title': 'Diseases and Pandemics',
  'link': 'https://www.abovetopsecret.com/forum/121/pg1/srtpgs'},
 {'title': 'COVID vaccination rates \\xe2\\x80\\x98alarmingly\\xe2\\x80\\x99 low among nursing home staff',
  'link': 'https://www.abovetopsecret.com/forum/thread1341843/pg1'},
 {'title': 'Part 3. How Would You Create a Biological Warfare Agent? (From \\xe2\\x80\\x9cThe WHO\\xe2\\x80\\x99s Proposed Treaty Will Increase Man-Made Pandemics\\xe2\\x80\\x9d)',
  'link': 'https://ahrp.org/part-3-how-would-you-create-a-biological-warfare-agent/'},
 {'title': 'The WHO\\xe2\\x80\\x99s Proposed Treaty Will Increase Man-Made Pandemics – Part 1. Weapons of Mass Destruction: Chem/B

In [None]:
len(conspiracy_articles)

351

adding new articles found to the existing database

In [None]:
# adds new articles discovered to existing database (extending the database)
def consolidate_data(existing_data, new_data):
    def remove_dup(curr):
        titles = set()
        rev = []
        for item in curr:
            if item['title'] in titles:
                continue
            rev.append(item)
            titles.add(item['title'])
        return rev

    print(f"Before combining: {len(existing_data)}")
    existing_data += new_data
    existing_data = remove_dup(existing_data)
    print(f"After combining and removing duplicates: {len(existing_data)}")
    
    return remove_dup(existing_data)

saving JSON results obtained from scraping

In [None]:
with open(f"data/article_metadata/science.json") as file:
    existing_science_articles = json.load(file)

In [None]:
with open(f"data/article_metadata/conspiracy.json") as file:
    existing_conspiracy_articles = json.load(file)

In [None]:
updated_science_articles = consolidate_data(existing_data=existing_science_articles, 
                                            new_data=science_articles)

Before combining: 689
After removing duplicates: 705


In [None]:
updated_conspiracy_articles = consolidate_data(existing_data=existing_conspiracy_articles,
                                               new_data=conspiracy_articles)

Before combining: 1134
After removing duplicates: 1143


In [None]:
with open(f"data/article_metadata/science.json", 'w') as file:
    json.dump(updated_science_articles, file, indent = 4)

In [None]:
with open(f"data/article_metadata/conspiracy.json", 'w') as file:
    json.dump(updated_conspiracy_articles, file, indent = 4)