In [31]:
import requests

In [32]:
science_url = 'https://mediabiasfactcheck.com/pro-science/'
conspiracy_url = 'https://mediabiasfactcheck.com/conspiracy/'

In [33]:
# access the corresponding page on MediaBiasFactCheck
science_page = requests.get(science_url, timeout=5)
conspiracy_page = requests.get(conspiracy_url, timeout=5)

In [34]:
from bs4 import BeautifulSoup

In [35]:
# parse through that particular page and find all of the news sites
science_soup = BeautifulSoup(science_page.content, 'lxml')
science_sites = science_soup.find_all('span', {'style': 'font-size: 12pt;'})

In [36]:
# repeat for conspiracy page
conspiracy_soup = BeautifulSoup(conspiracy_page.content, 'lxml')
conspiracy_sites = conspiracy_soup.find_all('span', {'style': 'font-size: 12pt;'})

In [37]:
# creating storage for all of the links to be obtained off the MBFC page
science_db = []
conspiracy_db = []

In [38]:
# will ensure valid url
def format_url(link):
    if not link.startswith(('https://www', "http://www.", "http://", "https://")):
        if not link.startswith(('www.')):
            link = 'www.' + link
        if not link.startswith(('http://', 'https://')):
            link = 'https://' + link
    return link

In [39]:
# extract all science articles
for sci_page in science_sites:
    link = format_url(sci_page.text[sci_page.text.rfind('(')+1:-1])
    source = sci_page.text[:sci_page.text.rfind('(')-1]

    if link.count(' ') == 0 and len(source) >= 2:
        science_db.append({"name": source, "url": link})

In [40]:
# repeating the previous process, now extracting all conspiracy articles
for con_page in conspiracy_sites:
    link = format_url(con_page.text[con_page.text.rfind('(')+1:-1])
    source = con_page.text[:con_page.text.rfind('(')-1]

    if link.count(' ') == 0 and len(source) >= 2:
        conspiracy_db.append({"name": source, "url": link})

In [43]:
science_db

[{'name': 'Acoustics Journal',
  'url': 'https://www.mdpi.com/journal/acoustics'},
 {'name': 'Acta Neuropathologica Communications',
  'url': 'https://www.actaneurocomms.biomedcentral.com'},
 {'name': 'Actuators Journal',
  'url': 'https://www.mdpi.com/journal/actuators'},
 {'name': 'Administrative Sciences Journal',
  'url': 'https://www.mdpi.com/journal/admsci'},
 {'name': 'Adolescents Journal',
  'url': 'https://www.mdpi.com/journal/adolescents'},
 {'name': 'Advances in Respiratory Medicine',
  'url': 'https://www.mdpi.com/journal/arm'},
 {'name': 'Aerospace Journal',
  'url': 'https://www.mdpi.com/journal/aerospace'},
 {'name': 'Agriculture Journal',
  'url': 'https://www.mdpi.com/journal/agriculture'},
 {'name': 'AgriEngineering Journal',
  'url': 'https://www.mdpi.com/journal/agriengineering'},
 {'name': 'Agrochemicals Journal',
  'url': 'https://www.mdpi.com/journal/agrochemicals'},
 {'name': 'Agronomy Journal', 'url': 'https://www.mdpi.com/journal/agronomy'},
 {'name': 'AI Jour

In [42]:
conspiracy_db

[{'name': '2020ElectionCenter.c', 'url': 'https://www.2020ElectionCenter.co'},
 {'name': '21st Century Wire', 'url': 'https://www.21stcenturywire.com'},
 {'name': '79Days.Ne', 'url': 'https://www.79Days.New'},
 {'name': '369 News', 'url': 'https://www.369news.net'},
 {'name': '911Truth.org', 'url': 'https://www.911truth.org'},
 {'name': 'Above Top Secret', 'url': 'https://www.abovetopsecret.com'},
 {'name': 'A Call for an Uprising',
  'url': 'https://www.acallforanuprising.com'},
 {'name': 'ACNLatitudes', 'url': 'https://www.latitudes.org'},
 {'name': 'Activist Post', 'url': 'https://www.activistpost.com'},
 {'name': 'Actualized.o', 'url': 'https://www.Actualized.or'},
 {'name': 'Adams.Ne', 'url': 'https://www.Adams.New'},
 {'name': 'Addiction.Ne', 'url': 'https://www.Addiction.New'},
 {'name': 'ADDitude Magazine', 'url': 'https://www.additudemag.com'},
 {'name': 'Age of Autism', 'url': 'https://www.ageofautism.com'},
 {'name': 'Alien News', 'url': 'https://www.newsinstact.com'},
 {'na

check if all urls are valid

In [44]:
from urllib.parse import urlparse

def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme in ['http', 'https'], result.netloc])
    except ValueError:
        return False

In [46]:
# checks if any are false
assert [is_valid_url(page['url']) for page in science_db].count(False) == 0

In [47]:
assert [is_valid_url(page['url']) for page in conspiracy_db].count(False) == 0

convert to JSON and send to storage

In [48]:
import json

In [49]:
science_json = json.dumps(science_db, indent=4)
conspiracy_json = json.dumps(conspiracy_db, indent=4)

In [50]:
with open('data/science.json', 'w') as file:
    file.write(science_json)

In [51]:
with open('data/conspiracy.json', 'w') as file:
    file.write(conspiracy_json)