# Scrape the Media Bias/Fact Check (MBFC) Dataset, Including Granular Bias Ratings

In [6]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_fixed
import tldextract
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import re
from string import punctuation

# URL slugs of the MBFC category pages to scrape; each is interpolated into
# "https://mediabiasfactcheck.com/{category}/".
# NOTE(review): the slugs are not uniformly hyphenated ("leftcenter" vs
# "right-center") - presumably this mirrors the site's URLs; verify before
# "normalising" them.
categories = [
    # political-bias categories
    "center", "left", "leftcenter", "right-center", "right",
    # remaining categories
    "conspiracy", "fake-news", "pro-science",
]

def get_all_links(link, timeout=30):
    """Return the ``href`` of every anchor inside the page's ``#mbfc-table``.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A list of href strings in document order (empty when the page has no
        ``#mbfc-table`` anchors).

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status.
    """
    page = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # returning an empty link list.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    # ``a[href]`` skips anchors that carry no href attribute, which would
    # otherwise raise KeyError on the subscript below.
    return [anchor["href"] for anchor in soup.select("#mbfc-table a[href]")]

@retry(stop=stop_after_attempt(5), wait=wait_fixed(1))
def get_granular_ratings(link, timeout=30):
    """Scrape one MBFC source page into a flat dict of ratings.

    The decorator retries the whole function up to 5 attempts, 1 second apart,
    whenever it raises.

    Args:
        link: URL of the MBFC page describing a single news source.
        timeout: Seconds to wait for the server; a hung connection raises (and
            is retried) instead of blocking the worker thread indefinitely.

    Returns:
        A dict holding one entry per ``key: value`` line of the first paragraph
        that contains "Bias Rating:", plus:
          - "source_link": first whitespace-delimited token after "Source:",
          - "source_domain": the registered domain name of that link,
          - "granular_rating": ``src`` of the first image in the main content.

    Raises:
        requests.RequestException: On network failure, timeout, or HTTP error.
        IndexError: If the main content block or its image is missing.
        ValueError: If no paragraph provides a "Source:" link.
    """
    page = requests.get(link, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # The rating graphic is the first image of the main content block; its
    # file name encodes the granular rating.
    main = soup.select("#main-content .clearfix")
    granular_rating = main[0].select("img")[0]["src"]

    paragraphs = soup.find_all("p")

    # Only the first paragraph mentioning "Bias Rating:" is parsed; each of
    # its lines of the form "key: value" becomes one dict entry.
    result = {}
    for paragraph in paragraphs:
        if "Bias Rating:" not in paragraph.get_text():
            continue
        for line in paragraph.get_text(separator=" ").strip().split("\n"):
            if ":" in line:
                key, value = line.split(":", 1)
                result[key.strip()] = value.strip()
        break

    # ``\s`` also matches the non-breaking space some pages put after
    # "Source:". The parsed URL lives in its own variable so that a page
    # without a parsable source can never fall back to the MBFC page URL.
    source_url = None
    for paragraph in paragraphs:
        found = re.search(r"Source:\s*(\S+)", paragraph.get_text())
        if found:
            source_url = found.group(1)
            break
    if source_url is None:
        raise ValueError(f"No 'Source:' link found on {link}")

    result["source_link"] = source_url
    result["source_domain"] = tldextract.extract(source_url).domain
    result["granular_rating"] = granular_rating
    return result


# Fetch the listing page of every category once and keep its source-page
# links, keyed by category slug.
links = {
    category: get_all_links(f"https://mediabiasfactcheck.com/{category}/")
    for category in tqdm(categories)
}

# Shared accumulators filled by process_link():
#   data        - one result dict per successfully scraped page
#   error_count - category -> list of [link, error message] for failures
data, error_count = [], {}

def process_link(link, category):
    """Scrape one source page and record the outcome in the shared globals.

    On success the result dict, tagged with ``category``, is appended to the
    module-level ``data`` list. On any failure ``[link, error message]`` is
    appended to ``error_count[category]``; the exception is not re-raised so
    a single bad page cannot abort the batch.

    Args:
        link: URL of the MBFC source page.
        category: Category slug the link was listed under.
    """
    try:
        result = get_granular_ratings(link)
        result["category"] = category
        data.append(result)
    except Exception as e:
        # This function runs on many worker threads at once. A separate
        # "check, then assign []" lets two threads both create the list and
        # one overwrite the other's entry; setdefault() performs the
        # lookup-or-insert as one dict operation (atomic in CPython for str
        # keys), so no recorded error is lost.
        error_count.setdefault(category, []).append([link, str(e)])

In [None]:

# First pass: scrape every source page of every category concurrently.
with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [
        executor.submit(process_link, link, category)
        for category, link_list in links.items()
        for link in link_list
    ]

    # Draining as_completed() only drives the progress bar; results and
    # errors are recorded by process_link itself.
    for future in tqdm(as_completed(futures), total=len(futures)):
        pass

# Report a count per category (previously the raw error list was printed
# whenever a category had failures).
for category in categories:
    print(f"Errors in {category}: {len(error_count.get(category, []))}")

# Collect each failed (link, category) pair once, in first-failure order, so
# a link is not retried several times just because its messages differed.
error_links = list(
    dict.fromkeys(
        (error[0], category)
        for category, errors in error_count.items()
        for error in errors
    )
)

# Forget the first-pass failures (in place, so process_link keeps writing to
# the same dict). Otherwise links that succeed on retry would still be
# counted as "remaining" below.
error_count.clear()

print("Running error links again...")
with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [
        executor.submit(process_link, link, category)
        for link, category in error_links
    ]

    for future in tqdm(as_completed(futures), total=len(futures)):
        pass

for category in categories:
    print(f"Remaining errors in {category}: {len(error_count.get(category, []))}")


# Keep only the first record seen for each source_link; later records with
# the same link are dropped.
unique_links = set()
unique_data = []

for item in data:
    if item["source_link"] in unique_links:
        continue
    unique_links.add(item["source_link"])
    unique_data.append(item)

data = unique_data


def clean_name(x):
    """Normalise a scraped field label into a snake_case column name.

    Steps, in order: lowercase; delete every ASCII punctuation character
    (this includes "_" and "'"); rewrite "rank" as "rating"; delete the
    typographic apostrophe; trim surrounding whitespace; turn each run of
    spaces into a single underscore.
    """
    ascii_punctuation = str.maketrans("", "", punctuation)
    name = x.lower().translate(ascii_punctuation)
    name = name.replace("rank", "rating")
    name = name.replace("’", "")
    name = name.strip()
    # Replacing each run of spaces directly with "_" gives the same result as
    # collapsing the run to one space and then swapping spaces for "_".
    return re.sub(" +", "_", name)


# Maps label variants (as produced by clean_name, including misspellings
# such as "mbrfcs" or "county") onto one canonical column name each.
# A dict literal keeps only the last value of a repeated key, so the earlier
# duplicate "press_freedom_rating" entry (whose target was itself a
# non-canonical name) was dead and has been removed.
column_mapping = {
    "mbfcscountry_freedom_rating": "mbfc_country_freedom_rating",
    "mbfcs_countryfreedom_rating": "mbfc_country_freedom_rating",
    "mbfcs_press_freedom_rating": "mbfc_country_freedom_rating",
    "mbrfcs_country_freedom_rating": "mbfc_country_freedom_rating",
    "world_press_freedom_rating": "mbfc_country_freedom_rating",
    "mbfcs_freedom_rating": "mbfc_country_freedom_rating",
    "press_freedom_rating": "mbfc_country_freedom_rating",
    "mbfcs_country_freedom_rating": "mbfc_country_freedom_rating",
    "credibility": "mbfc_credibility_rating",
    "mbfcs_county_freedom_rating": "mbfc_country_freedom_rating",
    "mbfcs_country_freedom_profile": "mbfc_country_freedom_rating",
    "mbgcs_country_freedom_rating": "mbfc_country_freedom_rating",
    "country_press_freedom_rating": "mbfc_country_freedom_rating",
    "mbfcs_countyry_freedom_rating": "mbfc_country_freedom_rating",
}

# Two passes per record, in this order: first normalise every scraped label
# with clean_name, then collapse known label variants onto their canonical
# column name via column_mapping (unmapped names pass through unchanged).
data_clean = []
for entry in data:
    normalised = {clean_name(key): value for key, value in entry.items()}
    data_clean.append(
        {column_mapping.get(key, key): value for key, value in normalised.items()}
    )

df = pd.DataFrame(data_clean)
# Export to CSV
df.to_csv("mediabiasfactcheck_fulldataset.csv", index=False)

def _strip_digits(name):
    """Return *name* with every run of digits removed."""
    return re.sub(r"\d+", "", name)


def _level_digits(name):
    """Return *name* with its digit-free form cut out, capped at 2 characters.

    For a name such as "left3" this leaves "3". Slicing with [:2] is a no-op
    for strings of length <= 2, so no explicit length check is needed.
    """
    return name.replace(_strip_digits(name), "")[:2]


# File stem of the rating image URL: the last path segment up to its first ".".
df["granularrating_cat"] = df["granularrating"].apply(
    lambda src: src.split("/")[-1].split(".")[0]
)

# Exploratory: frequency of each digit-free rating name (the value is only
# shown when this is the last expression of a notebook cell).
df["granularrating_cat"].apply(_strip_digits).value_counts()

political_categories = [
    "extremeright",
    "right",
    "rightcenter",
    "leastbiased",
    "leftcenter",
    "left",
    "extremeleft",
]
# True when the image name contains any of the political category names.
df["political"] = df["granularrating_cat"].str.contains("|".join(political_categories))
df["level"] = df["granularrating_cat"].apply(_level_digits)
df["granularrating_nolevel"] = df["granularrating_cat"].apply(_strip_digits)

# Exploratory: which rating images occur at each level of "extremeleft".
df[df.granularrating_nolevel == "extremeleft"].groupby(
    "level"
).granularrating.unique().values

# Map the political levels from -36 to +36: each category has a base value
# and the numeric level taken from the rating image name shifts it by up to
# 6 points in either direction.
political_mapping = {
    "extremeleft": -30,
    "left": -20,
    "leftcenter": -10,
    "leastbiased": 0,
    "rightcenter": 10,
    "right": 20,
    "extremeright": 30,
}


def map_political_level(row):
    """Convert a row's category and level into a score on the -36..+36 scale.

    Args:
        row: Mapping (e.g. a DataFrame row) with "granularrating_nolevel"
            (category name) and "level" (digits taken from the image name).

    Returns:
        The integer score, or None when the category is not a political one
        or the level is missing, non-numeric, or outside the category's range
        (1-6 for the two extremes, 1-12 otherwise). Returning None keeps one
        malformed row from aborting the whole ``df.apply``.
    """
    category = row["granularrating_nolevel"]
    if category not in political_mapping:
        return None

    try:
        level = int(row["level"])
    except (TypeError, ValueError):
        return None

    if category in ("extremeleft", "extremeright"):
        if not 1 <= level <= 6:
            return None
        # Level 1 is treated as the most extreme (magnitude 6) and level 6 as
        # the least (magnitude 1), as in the original table.
        # NOTE(review): assumes both extremes number their images this way -
        # confirm against the actual rating images.
        magnitude = 7 - level
        # The shift must point away from the centre. Adding a positive
        # magnitude to the negative extremeleft base produced -29..-24, which
        # never reached the documented -36.
        offset = -magnitude if category == "extremeleft" else magnitude
    else:
        if not 1 <= level <= 12:
            return None
        # Levels 1-6 map to -6..-1 and levels 7-12 to +1..+6 (no zero step).
        offset = level - 7 if level <= 6 else level - 6

    return political_mapping[category] + offset


# Score every row; rows map_political_level cannot score get None.
df["political_level"] = df.apply(map_political_level, axis=1)

# Columns removed before the final export: three intermediate helper columns
# followed by three scraped columns.
columns_to_drop = [
    "granularrating_nolevel",
    "level",
    "granularrating_cat",
    "questionable_reasoning",
    "bias",
    "reasoning",
]
df = df.drop(columns=columns_to_drop)

# Overwrites the CSV written right after the DataFrame was first built.
df.to_csv("mediabiasfactcheck_fulldataset.csv", index=False)

## Repeat for the Misinformation / Fake-News Category

In [16]:
# Scrape only the fake-news category, starting from empty accumulators.
links = get_all_links("https://mediabiasfactcheck.com/fake-news/")
links = [(link, "fake-news") for link in links]
data = []
error_count = {}

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [
        executor.submit(process_link, link, category) for link, category in links
    ]
    # Draining as_completed() only drives the progress bar; results and
    # errors are recorded by process_link itself.
    for future in tqdm(as_completed(futures), total=len(futures)):
        pass

for category in categories:
    print(f"Remaining errors in {category}: {len(error_count.get(category, []))}")

# Repeat for error links: retry each failed (link, category) pair once.
error_links = list(
    dict.fromkeys(
        (error[0], category)
        for category, errors in error_count.items()
        for error in errors
    )
)

# Drop the first-pass failures (in place, so process_link keeps writing to
# the same dict). Otherwise every link that failed once would still be
# reported as "remaining" even after a successful retry.
error_count.clear()

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [
        executor.submit(process_link, link, category)
        for link, category in error_links
    ]
    for future in tqdm(as_completed(futures), total=len(futures)):
        pass

for category in categories:
    print(f"Remaining errors in {category}: {len(error_count.get(category, []))}")

# Keep the first record per source_link and export.
d = pd.DataFrame(data)
d[~d.source_link.duplicated()].to_csv("../Data/mediabiasfactcheck_fakenews.csv", index=False)

100%|██████████| 1523/1523 [02:16<00:00, 11.15it/s]

Remaining errors in center: 0
Remaining errors in left: 0
Remaining errors in leftcenter: 0
Remaining errors in right-center: 0
Remaining errors in right: 0
Remaining errors in conspiracy: 0
Remaining errors in fake-news: 593
Remaining errors in pro-science: 0



