accessing appropriate pages on MediaBiasFactCheck to collect sites to visit

In [31]:
import requests

In [32]:
science_url = 'https://mediabiasfactcheck.com/pro-science/'
conspiracy_url = 'https://mediabiasfactcheck.com/conspiracy/'

In [33]:
science_page = requests.get(science_url, timeout=5)
conspiracy_page = requests.get(conspiracy_url, timeout=5)

In [34]:
from bs4 import BeautifulSoup

parsing through both pages and finding all sites that can be visited

In [35]:
science_soup = BeautifulSoup(science_page.content, 'lxml')
science_sites = science_soup.find_all('span', {'style': 'font-size: 12pt;'})

In [36]:
conspiracy_soup = BeautifulSoup(conspiracy_page.content, 'lxml')
conspiracy_sites = conspiracy_soup.find_all('span', {'style': 'font-size: 12pt;'})

extracting all science and conspiracy pages that can be visited

In [52]:
# will ensure request can deal with all links
def format_links(link):
    if not link.startswith(('https://www', "http://www.", "http://", "https://")):
        if not link.startswith(('www.')):
            link = 'www.' + link
        if not link.startswith(('http://', 'https://')):
            link = 'https://' + link
    return link

In [61]:
def create_db(sites):
    db = []
    for site in sites:
        link = format_links(site.text[site.text.rfind('(')+1:-1])
        source = site.text[:site.text.rfind('(')-1]

        if link.count(' ') == 0 and len(source) >= 2:
            db.append({"name": source, "url": link})    
    return db

In [62]:
science_db = create_db(science_sites)
conspiracy_db = create_db(conspiracy_sites)

In [63]:
science_db[:10]

[{'name': 'Acoustics Journal',
  'url': 'https://www.mdpi.com/journal/acoustics'},
 {'name': 'Acta Neuropathologica Communications',
  'url': 'https://www.actaneurocomms.biomedcentral.com'},
 {'name': 'Actuators Journal',
  'url': 'https://www.mdpi.com/journal/actuators'},
 {'name': 'Administrative Sciences Journal',
  'url': 'https://www.mdpi.com/journal/admsci'},
 {'name': 'Adolescents Journal',
  'url': 'https://www.mdpi.com/journal/adolescents'},
 {'name': 'Advances in Respiratory Medicine',
  'url': 'https://www.mdpi.com/journal/arm'},
 {'name': 'Aerospace Journal',
  'url': 'https://www.mdpi.com/journal/aerospace'},
 {'name': 'Agriculture Journal',
  'url': 'https://www.mdpi.com/journal/agriculture'},
 {'name': 'AgriEngineering Journal',
  'url': 'https://www.mdpi.com/journal/agriengineering'},
 {'name': 'Agrochemicals Journal',
  'url': 'https://www.mdpi.com/journal/agrochemicals'}]

In [64]:
conspiracy_db[:10]

[{'name': '2020ElectionCenter.c', 'url': 'https://www.2020ElectionCenter.co'},
 {'name': '21st Century Wire', 'url': 'https://www.21stcenturywire.com'},
 {'name': '79Days.Ne', 'url': 'https://www.79Days.New'},
 {'name': '369 News', 'url': 'https://www.369news.net'},
 {'name': '911Truth.org', 'url': 'https://www.911truth.org'},
 {'name': 'Above Top Secret', 'url': 'https://www.abovetopsecret.com'},
 {'name': 'A Call for an Uprising',
  'url': 'https://www.acallforanuprising.com'},
 {'name': 'ACNLatitudes', 'url': 'https://www.latitudes.org'},
 {'name': 'Activist Post', 'url': 'https://www.activistpost.com'},
 {'name': 'Actualized.o', 'url': 'https://www.Actualized.or'}]

validation to ensure all links have valid headers

In [44]:
from urllib.parse import urlparse

def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme in ['http', 'https'], result.netloc])
    except ValueError:
        return False

checking if any links are not valid using the function above

In [46]:
assert [is_valid_url(page['url']) for page in science_db].count(False) == 0

In [47]:
assert [is_valid_url(page['url']) for page in conspiracy_db].count(False) == 0

converting both dbs to JSON and sending them to storage in the data/ directory

In [48]:
import json

In [49]:
science_json = json.dumps(science_db, indent=4)
conspiracy_json = json.dumps(conspiracy_db, indent=4)

In [50]:
with open('data/science.json', 'w') as file:
    file.write(science_json)

In [51]:
with open('data/conspiracy.json', 'w') as file:
    file.write(conspiracy_json)