# Testing Retrieving Articles

## Cleaning - GOOD!

In [None]:
def clean_malformed_escaped_url(url: str) -> str:
    """Return *url* with surrounding whitespace, a literal ``\\u003d`` and spaces repaired.

    The URL is processed in three steps, and its ``repr`` is printed before
    each of them so the effect of every step can be followed in the cell
    output:

    1. leading/trailing whitespace is stripped,
    2. the literal six-character sequence ``\\u003d`` is replaced by ``=``,
    3. every remaining space is percent-encoded as ``%20``.

    Args:
        url: The raw URL string to repair.

    Returns:
        The repaired URL.
    """
    print(repr(url))
    trimmed = url.strip()
    print(repr(trimmed))
    # "\\u003d" is a backslash followed by "u003d", i.e. an escape sequence
    # that was never decoded, not an actual "=" character.
    unescaped = trimmed.replace("\\u003d", "=")
    print(repr(unescaped))
    return unescaped.replace(" ", "%20")

# Sample URL whose query string still carries a literal "\u003d" where "="
# should be; run it through the cleaner and show the result.
url = (
    "https://abcnews.go.com/US/"
    "bird-flu-found-sample-california-raw-milk-officials/story"
    "?id\\u003d116185808"
)
print(clean_malformed_escaped_url(url))

## Testing with the Newspaper4k Fork

In [None]:
from playwright.sync_api import sync_playwright
import time
import newspaper

def scrape_with_playwright(url, *, timeout_ms=60000, render_wait_ms=3000):
    """Render *url* in Chromium via Playwright and parse the HTML with newspaper.

    The page is loaded until ``domcontentloaded``, given a fixed pause so
    client-side JavaScript can populate the DOM, and the resulting HTML is
    handed to ``newspaper.article`` instead of letting newspaper download
    the page itself.

    Args:
        url: Address of the article to scrape.
        timeout_ms: Timeout in milliseconds, used both as the browser
            context's default timeout and for the navigation itself.
        render_wait_ms: Pause in milliseconds after navigation, before the
            page HTML is captured.

    Returns:
        The object returned by ``newspaper.article`` for the rendered HTML.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            context = browser.new_context()
            context.set_default_timeout(timeout_ms)
            page = context.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
            # Use Playwright's own wait rather than time.sleep(): the sync API
            # needs its event loop to keep running while the page renders.
            page.wait_for_timeout(render_wait_ms)
            content = page.content()
        finally:
            # Close the browser even when navigation or rendering fails.
            browser.close()
    article = newspaper.article(url, input_html=content, language='en')
    return article

# Try the Playwright-backed scraper on a sample article and print what it returns.
url = (
    "https://www.sfgate.com/bayarea/article/"
    "hannah-kobayashi-father-found-dead-19941244.php"
)
print(scrape_with_playwright(url))

## newspaper3k

In [None]:
from newspaper import Article, Config
# Browser-like user agent for the download request.
config = Config()
config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"

url = "https://www.espn.com/fantasy/football/insider/story/_/id/42525671/2024-fantasy-football-advice-questions-answers-espn-experts-arena-week-12"
try:
    # The config must be passed explicitly; otherwise the custom user agent
    # set above is never used by this Article.
    article = Article(url, config=config)
    article.download()
    article.parse()
    content = article.text
except Exception as e:
    # Deliberately broad: any download/parse failure falls back to the
    # placeholder text, but the reason is reported instead of being dropped.
    content = "Paywalled or Blocked"
    print(f"Failed to retrieve article: {e!r}")
print(content)

## newspaper3k with Redirects

In [None]:
import requests

# Follow redirects from a Google News RSS article link and print the final URL
# that requests ends up on.
url = "https://news.google.com/rss/articles/CBMiWkFVX3lxTFBWU2c3ZFJsQ0Z1MXgxZVFXYXF2VDMxbWIzREhIU004dUlaMGRvcE1tci1sVVRPNmpjNXdmSWFiTVZoR1FlMDVyVE1mVy1hVHRqcVR2QzBmbGdIUdIBX0FVX3lxTE83RlFOamNfRFBqN29HcnN3R3NpNUtUbS1CanJJTmk5eXdNcVZ2MTdvSmxLVTRtZkdCN0ktSmhfUDFaQUo0eC1Pb2RabVVkOHZQMW1GZzRyUGExMFVyLXpn?oc=5"
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, allow_redirects=True, timeout=30)
print(response.url)

# Same article link with the extra locale query parameters appended.
url2 = "https://news.google.com/rss/articles/CBMiWkFVX3lxTFBWU2c3ZFJsQ0Z1MXgxZVFXYXF2VDMxbWIzREhIU004dUlaMGRvcE1tci1sVVRPNmpjNXdmSWFiTVZoR1FlMDVyVE1mVy1hVHRqcVR2QzBmbGdIUdIBX0FVX3lxTE83RlFOamNfRFBqN29HcnN3R3NpNUtUbS1CanJJTmk5eXdNcVZ2MTdvSmxLVTRtZkdCN0ktSmhfUDFaQUo0eC1Pb2RabVVkOHZQMW1GZzRyUGExMFVyLXpn?oc=5&hl=en-US&gl=US&ceid=US:en"

response2 = requests.get(url2, allow_redirects=True, timeout=30)
# Report the second response's final URL (not the first one again).
print(response2.url)


## Selenium for JavaScript-Heavy Pages

In [None]:
from selenium import webdriver

driver = webdriver.Chrome()  # Install appropriate WebDriver for your browser
try:
    driver.get("https://news.google.com/rss/articles/CBMiWkFVX3lxTFBWU2c3ZFJsQ0Z1MXgxZVFXYXF2VDMxbWIzREhIU004dUlaMGRvcE1tci1sVVRPNmpjNXdmSWFiTVZoR1FlMDVyVE1mVy1hVHRqcVR2QzBmbGdIUdIBX0FVX3lxTE83RlFOamNfRFBqN29HcnN3R3NpNUtUbS1CanJJTmk5eXdNcVZ2MTdvSmxLVTRtZkdCN0ktSmhfUDFaQUo0eC1Pb2RabVVkOHZQMW1GZzRyUGExMFVyLXpn?oc=5")
    print(driver.page_source)  # Get the rendered page source
finally:
    # Always shut the browser down, even if loading the page fails, so no
    # stray browser/driver process is left behind.
    driver.quit()

## Testing Using feedparser

In [None]:
import feedparser

# Parse the URL with feedparser, dump the raw result, then collect the link of
# every entry that has one.
# NOTE(review): the path is /rss/articles/..., i.e. a single article link
# rather than a feed listing; confirm feedparser actually yields entries here.
url = "https://news.google.com/rss/articles/CBMiWkFVX3lxTFBWU2c3ZFJsQ0Z1MXgxZVFXYXF2VDMxbWIzREhIU004dUlaMGRvcE1tci1sVVRPNmpjNXdmSWFiTVZoR1FlMDVyVE1mVy1hVHRqcVR2QzBmbGdIUdIBX0FVX3lxTE83RlFOamNfRFBqN29HcnN3R3NpNUtUbS1CanJJTmk5eXdNcVZ2MTdvSmxLVTRtZkdCN0ktSmhfUDFaQUo0eC1Pb2RabVVkOHZQMW1GZzRyUGExMFVyLXpn?oc=5"
feed = feedparser.parse(url)
print(feed)
articles = [item.link for item in feed.entries if 'link' in item]

print(articles)

