In [1]:
import os 
import csv
import pandas  as pd
import numpy as np
from datasets import load_dataset
import requests 
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException
from huggingface_hub import hf_hub_download
import pandas as pd

In [6]:
# get political bias dataset - for categorizing publishers
REPO_ID = "valurank/PoliticalBias"

df1 = pd.read_csv(
    hf_hub_download(repo_id=REPO_ID, filename='Part1.csv', repo_type="dataset")
)
df1 = pd.DataFrame(df1)
df2 = pd.read_csv(
    hf_hub_download(repo_id=REPO_ID, filename='Part2.csv', repo_type="dataset")
)
df2 = pd.DataFrame(df2)
df = df1._append(df2)
df = df.drop(columns='Main URL')
df.to_csv('politicalBias_urls.csv') 


In [3]:
# scrape data to get (url,label,content)
# send to file
# will take hours
from tqdm import tqdm

df = pd.read_csv('politicalBias_urls.csv')

PoliticalBias_scraped = pd.DataFrame(columns=['url','label', 'text'])
success, miss = 0, 0
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Scraping Progress"):

    # Iterate over items in the row
    for column, value in row.items():
        if value != np.nan:
            try:
                url = str(value)
                article = Article(url)
                article.download()
                article.parse()
                PoliticalBias_scraped = PoliticalBias_scraped._append({'url':url ,'label': column, 'text': article.text}, ignore_index=True)
                success += 1
            except ArticleException:
                #print('***FAILED TO DOWNLOAD***', url)
                miss += 1
                pass

rate = success/(miss + success)
print(rate)
PoliticalBias_scraped.to_csv('./scraped_data/politicalBias_scraped.csv')

Scraping Progress:   0%|          | 1/6253 [00:01<1:47:43,  1.03s/it]WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 2/6253 [00:01<1:29:37,  1.16it/s]WWWWWWW
WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 3/6253 [00:02<1:19:52,  1.30it/s]WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 4/6253 [00:04<2:28:29,  1.43s/it]WWWWWWW
Scraping Progress:   0%|          | 5/6253 [00:05<1:58:56,  1.14s/it]WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 6/6253 [00:06<1:44:04,  1.00it/s]WWWWWWW
Scraping Progress:   0%|          | 7/6253 [00:06<1:35:20,  1.09it/s]WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 8/6253 [00:07<1:23:11,  1.25it/s]WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 9/6253 [00:07<1:11:37,  1.45it/s]WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 10/6253 [00:09<1:27:06,  1.19it/s]WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 11/6253 [00:09<1:10:31,  1.48it/s]WWWWWWW
WWWWWWW
WWWWWWW
Scraping Progress:   0%|          | 12/6253 [00:10<1:13:38,  1.

In [4]:
# get BABE dataset - used to train and test models
# BABE changed its file storage type to a zip for storage so this is feature no longer works. Content is stored in urls_BABE.csv
REPO_ID = "mediabiasgroup/BABE"

df1 = pd.read_csv(
    hf_hub_download(repo_id=REPO_ID, filename='BABE.csv', repo_type="dataset")
)
df1 = pd.DataFrame(df1)

df1 = df1.dropna()

df1.to_csv('urls_BABE.csv')


EntryNotFoundError: 404 Client Error. (Request ID: Root=1-6614557f-4caebee276ec657433dc8a58;9c284105-7685-4e14-8e49-ff114554f8c3)

Entry Not Found for url: https://huggingface.co/datasets/mediabiasgroup/BABE/resolve/main/BABE.csv.

In [2]:
def relable_class(row):
    if row['type_class'] == 'left':
        return 0
    elif row['type_class'] == 'center':
        return 1
    elif row['type_class'] == 'right':
        return 2
    else:
        return row['type_class']  # Return the original value if not one of the specified types

In [21]:
# Scrape data using URLs
# will take hours
df1 = pd.read_csv('urls_BABE.csv')
df_scraped = pd.DataFrame(columns=['url', 'content', 'type_class'])
print(len(df1))
x, hit, miss = 0, 0, 0

for index, row in df1.iterrows():
    if(x == len(df1)-1):
        break
    # Iterate over items in the row
    url = row['news_link']
    url = str(url)
    type_class = row['type']
    if url != np.nan:
        try:
            article = Article(url)
            article.download()
            article.parse()
            df_scraped = df_scraped._append({'url': url, 'content': article.text, 'type_class': type_class}, ignore_index=True)
            hit += 1
            #print('Succesfully grabbed article #' + str(x))
            
            print(df_scraped.head())
        except ArticleException:
            miss += 1
            #print('***FAILED TO DOWNLOAD***')
    x +=1
print('\nSuccesfully scraped ' + str(hit) + ' articles. Missed ' + str(miss))
#print(df_scraped.head(20))

# Output scraped data csv
df_scraped['type_class'] = df_scraped.apply(relable_class, axis=1)
#df_scraped.to_csv('scraped_data/BABE_scraped.csv')
type_counts = df_scraped['type_class'].value_counts() # eval number of each type_class
print(type_counts)

2644
                                                 url  \
0  https://www.foxnews.com/entertainment/australi...   

                                             content type_class  
0  "Orange Is the New Black" star Yael Stone is r...      right  
                                                 url  \
0  https://www.foxnews.com/entertainment/australi...   
1  https://www.alternet.org/2020/06/law-and-order...   

                                             content type_class  
0  "Orange Is the New Black" star Yael Stone is r...      right  
1  Mark Twain's instruction to curious residents ...       left  
                                                 url  \
0  https://www.foxnews.com/entertainment/australi...   
1  https://www.alternet.org/2020/06/law-and-order...   
2  https://www.nbcnews.com/news/latino/after-step...   

                                             content type_class  
0  "Orange Is the New Black" star Yael Stone is r...      right  
1  Mark Twain's instructio

KeyboardInterrupt: 