exclude:
- reddit.com
- redd.it
- x.com
- wsj.com

In [46]:
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from ftfy import fix_text

In [47]:
# Domains that are never scraped; their subdomains (e.g. old.reddit.com) are
# skipped as well.
_SKIP_DOMAINS = ('reddit.com', 'redd.it', 'x.com', 'wsj.com')
# Words that suggest a short page is an anti-bot / access-denied notice.
_BLOCK_WORDS = frozenset({'robot', 'robots', 'block', 'blocked'})
# A page needs MORE than this many whitespace-separated words to be 'OK'.
_MIN_WORDS = 100
# Seconds to wait on a server before giving up on a URL.
_REQUEST_TIMEOUT = 10


def _is_skipped(url):
    """Return True if `url` is empty/None or its host is in `_SKIP_DOMAINS`."""
    if not url:
        return True
    try:
        # Prefix scheme-less URLs with '//' so urlparse still finds the host
        host = urlparse(url if '//' in url else '//' + url).hostname or ''
    except ValueError:
        # Malformed network location: cannot belong to a known domain
        return False
    return any(host == d or host.endswith('.' + d) for d in _SKIP_DOMAINS)


def _classify(text):
    """Return the status label ('OK', 'BLOCKED', 'EMPTY', 'SHORT') for `text`."""
    if len(text.split()) > _MIN_WORDS:
        return 'OK'
    # Case-insensitive match; '.' and ',' are stripped so "robot." still counts
    words = set(text.lower().replace('.', ' ').replace(',', ' ').split())
    if words & _BLOCK_WORDS:
        return 'BLOCKED'
    if not text:
        return 'EMPTY'
    return 'SHORT'


def scrape_paragraphs_text(urls):
    """
    Download each URL and collect the text of its <p> tags.

    Empty/None URLs and URLs whose host is one of `_SKIP_DOMAINS` (or a
    subdomain of one) are skipped and produce no output entry, so the
    result may be shorter than `urls`.

    Args:
        urls: iterable of URL strings.

    Returns:
        A list of dicts, one per scraped URL in input order, with keys:
          'text'   - concatenated, ftfy-fixed paragraph text, or None when
                     the request itself failed;
          'status' - 'OK'      more than 100 words of text,
                     'BLOCKED' short text containing a block word,
                     'EMPTY'   no paragraph text at all,
                     'SHORT'   any other short text,
                     'ERROR'   the request raised (timeout, bad URL, ...).
    """
    scraped_all = []  # output storage
    for url in urls:
        if _is_skipped(url):
            continue

        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            # One unreachable site must not abort the whole batch
            scraped_all.append({'text': None, 'status': 'ERROR'})
            continue

        # HTTP error pages are parsed too: their body is what lets
        # _classify label them as 'BLOCKED'.
        parsed_contents = BeautifulSoup(response.content, 'html.parser')
        paragraphs = parsed_contents.find_all('p')

        # Concatenate text from <p> tags and repair mojibake
        p_text = fix_text(' '.join(p.get_text() for p in paragraphs))
        scraped_all.append({'text': p_text, 'status': _classify(p_text)})

    return scraped_all


In [66]:
extracted_texts = scrape_paragraphs_text(urls)
extracted_texts

[{'text': "To continue, please click the box below to let us know you're not a robot. Please make sure your browser supports JavaScript and cookies and that you are not\n            blocking them from loading.\n            For more information you can review our Terms of\n                Service and Cookie Policy. For inquiries related to this message please contact\n            our support team and provide the reference ID below.",
  'status': 'BLOCKED'},
 {'text': 'A bipartisan group of senators is expressing concern over the USDA\'s decision to use a single contractor to distribute food to Indian Reservations A bipartisan group of senators is demanding immediate action from USDA Secretary Thomas Vilsack after several tribal nations reported that a federal food distribution program they rely on has not fulfilled orders for months, and in some cases has delivered expired food. Last spring, the USDA consolidated from two contractors to one for deliveries of its Food Distribution Progra

---

In [70]:
from transformers import pipeline

# One checkpoint name is used for both the model and its tokenizer, so keep it
# in a single place.
SENTIMENT_CHECKPOINT = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

# Sentiment classifier used on the scraped article texts.
# NOTE(review): truncation/max_length are presumably forwarded to the
# tokenizer so that long articles are cut to 512 tokens - confirm.
analyzer = pipeline(
    task="sentiment-analysis",
    model=SENTIMENT_CHECKPOINT,
    tokenizer=SENTIMENT_CHECKPOINT,
    truncation=True,
    max_length=512,
)



In [73]:
# Attach a sentiment label and score to every successfully scraped article.
# Entries whose status is not 'OK' are left untouched.
for entry in extracted_texts:
    if entry['status'] != 'OK':
        continue
    # Run the model once per text and reuse its top prediction for both
    # fields instead of paying for inference twice.
    prediction = analyzer(entry['text'])[0]
    entry['sentiment_label'] = prediction['label']
    entry['sentiment_score'] = prediction['score']

In [69]:
extracted_texts

[{'text': "To continue, please click the box below to let us know you're not a robot. Please make sure your browser supports JavaScript and cookies and that you are not\n            blocking them from loading.\n            For more information you can review our Terms of\n                Service and Cookie Policy. For inquiries related to this message please contact\n            our support team and provide the reference ID below.",
  'status': 'BLOCKED'},
 {'text': 'A bipartisan group of senators is expressing concern over the USDA\'s decision to use a single contractor to distribute food to Indian Reservations A bipartisan group of senators is demanding immediate action from USDA Secretary Thomas Vilsack after several tribal nations reported that a federal food distribution program they rely on has not fulfilled orders for months, and in some cases has delivered expired food. Last spring, the USDA consolidated from two contractors to one for deliveries of its Food Distribution Progra