accessing appropriate pages on MediaBiasFactCheck to collect sites to visit

In [1]:
import requests

In [2]:
science_url = 'https://mediabiasfactcheck.com/pro-science/'
conspiracy_url = 'https://mediabiasfactcheck.com/conspiracy/'

In [3]:
science_page = requests.get(science_url, timeout=5)
conspiracy_page = requests.get(conspiracy_url, timeout=5)

In [4]:
from bs4 import BeautifulSoup

parsing through both pages and finding all sites that can be visited

In [5]:
science_soup = BeautifulSoup(science_page.content, 'lxml')
science_sites = science_soup.find_all('span', {'style': 'font-size: 12pt;'})

In [6]:
conspiracy_soup = BeautifulSoup(conspiracy_page.content, 'lxml')
conspiracy_sites = conspiracy_soup.find_all('span', {'style': 'font-size: 12pt;'})

extracting all science and conspiracy pages that can be visited

In [7]:
# will ensure request can deal with all links
def format_links(link):
    if not link.startswith(('https://www', "http://www.", "http://", "https://")):
        if not link.startswith(('www.')):
            link = 'www.' + link
        if not link.startswith(('http://', 'https://')):
            link = 'https://' + link
    return link

In [8]:
def create_db(sites):
    db = []
    for site in sites:
        link = format_links(site.text[site.text.rfind('(')+1:-1])
        source = site.text[:site.text.rfind('(')-1]

        if link.count(' ') == 0 and len(source) >= 2:
            db.append({"name": source, "url": link})    
    return db

In [None]:
science_db = create_db(science_sites)
conspiracy_db = create_db(conspiracy_sites)

In [10]:
science_db[:10]

[{'name': 'Acoustics Journal',
  'url': 'https://www.mdpi.com/journal/acoustics'},
 {'name': 'Acta Neuropathologica Communications',
  'url': 'https://www.actaneurocomms.biomedcentral.com'},
 {'name': 'Actuators Journal',
  'url': 'https://www.mdpi.com/journal/actuators'},
 {'name': 'Administrative Sciences Journal',
  'url': 'https://www.mdpi.com/journal/admsci'},
 {'name': 'Adolescents Journal',
  'url': 'https://www.mdpi.com/journal/adolescents'},
 {'name': 'Advances in Respiratory Medicine',
  'url': 'https://www.mdpi.com/journal/arm'},
 {'name': 'Aerospace Journal',
  'url': 'https://www.mdpi.com/journal/aerospace'},
 {'name': 'Agriculture Journal',
  'url': 'https://www.mdpi.com/journal/agriculture'},
 {'name': 'AgriEngineering Journal',
  'url': 'https://www.mdpi.com/journal/agriengineering'},
 {'name': 'Agrochemicals Journal',
  'url': 'https://www.mdpi.com/journal/agrochemicals'}]

In [11]:
conspiracy_db[:10]

[{'name': '2020ElectionCenter.c', 'url': 'https://www.2020ElectionCenter.co'},
 {'name': '21st Century Wire', 'url': 'https://www.21stcenturywire.com'},
 {'name': '79Days.Ne', 'url': 'https://www.79Days.New'},
 {'name': '369 News', 'url': 'https://www.369news.net'},
 {'name': '911Truth.org', 'url': 'https://www.911truth.org'},
 {'name': 'Above Top Secret', 'url': 'https://www.abovetopsecret.com'},
 {'name': 'A Call for an Uprising',
  'url': 'https://www.acallforanuprising.com'},
 {'name': 'ACNLatitudes', 'url': 'https://www.latitudes.org'},
 {'name': 'Activist Post', 'url': 'https://www.activistpost.com'},
 {'name': 'Actualized.o', 'url': 'https://www.Actualized.or'}]

validation to ensure all links have valid headers

In [12]:
from urllib.parse import urlparse

def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme in ['http', 'https'], result.netloc])
    except ValueError:
        return False

checking if any links are not valid using the function above

In [13]:
assert [is_valid_url(page['url']) for page in science_db].count(False) == 0

In [14]:
assert [is_valid_url(page['url']) for page in conspiracy_db].count(False) == 0

converting both dbs to JSON and sending them to storage in the data/ directory

In [15]:
import json

In [16]:
science_json = json.dumps(science_db, indent=4)
conspiracy_json = json.dumps(conspiracy_db, indent=4)

In [17]:
with open('data/extract_websites/science.json', 'w') as file:
    file.write(science_json)

In [18]:
with open('data/extract_websites/conspiracy.json', 'w') as file:
    file.write(conspiracy_json)

data_acquisition code

In [19]:
from article_finder import ArticleFinder

In [20]:
# finding the scientific articles
science_articles = ArticleFinder.find_articles(article_type="science")

  soup = BeautifulSoup(page['html'], 'lxml')


In [21]:
science_articles[:10]

[{'title': 'Sound Environment during Dental Treatment in Relation to COVID-19 Pandemic',
  'link': 'https://www.mdpi.com/journal/acoustics/2624-599X/5/4/56'},
 {'title': 'Factors Associated with the Prevalence and Treatment of Depression in Adolescent Males in the US during the Period of the COVID-19 Pandemic',
  'link': 'https://www.mdpi.com/journal/adolescents/2673-7051/3/4/45'},
 {'title': 'The Impact of Comprehensive Rehabilitation on the Exercise Capacity of Patients after COVID-19',
  'link': 'https://www.mdpi.com/journal/arm/2543-6031/91/6/37'},
 {'title': 'COVID-19 Acute Respiratory Distress Syndrome: Treatment with Helmet CPAP in Respiratory Intermediate Care Unit by Pulmonologists in the Three Italian Pandemic Waves',
  'link': 'https://www.mdpi.com/journal/arm/2543-6031/91/5/30'},
 {'title': 'The Clinical Significance of Aspergillus Detected in Lower-Respiratory-Tract Samples of Critically Ill COVID-19-Positive Patients',
  'link': 'https://www.mdpi.com/journal/arm/2543-6031

In [22]:
len(science_articles)

172

In [23]:
# finding the conspiracy articles
conspiracy_articles = ArticleFinder.find_articles(article_type="conspiracy")

In [24]:
conspiracy_articles[:10]

[{'title': 'Italian Health Minister Under Investigation for Murder for Concealing COVID-19 Vaccine Deaths',
  'link': 'https://www.abovetopsecret.com/forum/thread1342167/pg1'},
 {'title': 'COVID vaccination rates \\xe2\\x80\\x98alarmingly\\xe2\\x80\\x99 low among nursing home staff',
  'link': 'https://www.abovetopsecret.com/forum/thread1341843/pg1'},
 {'title': '25% of COVID Vaxxed Now Have VAIDS, Cambridge Scientists Warn',
  'link': 'https://www.abovetopsecret.com/forum/thread1341936/pg1'},
 {'title': "54% of US Youth are Chronically Ill* America's children are facing unprecedented epidemics! We are in a crisis",
  'link': 'https://childrenshealthdefense.org/follow-the-science/54-of-us-youth-are-chronically-ill/'},
 {'title': 'lobbied for COVID-19',
  'link': 'https://www.leefang.com/p/pfizer-quietly-financed-groups-lobbying'},
 {'title': 'COVID-19',
  'link': 'https://childrenshealthdefense.org/defender_category/covid/'},
 {'title': 'Part 6. A Pandemic Treaty and Amendments (From “

In [25]:
len(conspiracy_articles)

369

saving JSON results obtained from scraping

In [1]:
# adds new articles discovered to existing database (extending the database)
def update_data(new_data, article_type):
    def remove_dup(curr):
        titles = set()
        rev = []
        for item in curr:
            if item['title'] in titles:
                continue
            rev.append(item)
            titles.add(item['title'])
        return rev

    if not (article_type == "science" or article_type == "conspiracy"):
        raise ValueError("article_type param must either be 'science' or 'conspiracy'")

    with open(f"../data/extract_articles/{article_type}.json") as file:
        existing_data = json.loads(file.read())

    # print(len(existing_data))
    existing_data += new_data
    # print(len(existing_data))

    with open(f"../data/extract_articles/{article_type}.json", 'w') as file:
        file.write(json.dumps(remove_dup(existing_data), indent = 4))

In [2]:
# untested after addition of update_data(), make sure to test it out
import json

# creating a function to store JSON files
def write_articles_to_storage(article_type: str, articles: list[str]) -> None:
    # specifying a filename where to create a new file
    filename = f"Data/extract_articles/{article_type}.json"
    update_data(articles, article_type=article_type)

    # creating a new file located at filename and writing JSON-ified articles into that file
    # with open(filename, 'w') as storage:
    #     storage.write(json.dumps(articles, indent = 4))

In [27]:
# writing the scientific and conspiracy articles found to storage for further use
write_articles_to_storage('science', science_articles)
write_articles_to_storage('conspiracy', conspiracy_articles)