In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

def setup_selenium():
    options = Options()
    options.headless = True
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def fetch_article_content(soup):
    '''Get text present from first <p> tag up to the first <h2> tag of each article'''
    content = []
    current_tag = soup.find('p')  

    while current_tag and current_tag.name != 'h2':
        if current_tag.name == 'p':
            content.append(current_tag.get_text(strip=True))
        current_tag = current_tag.find_next()  # Move to next tag
    
    return ' '.join(content)

def fetch_article(link):
    driver = setup_selenium()
    driver.get(link)
    article_soup = article_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Get date and time of published article
    time_tag = article_soup.find('time')
    article_info = {
        'datetime': time_tag['datetime'] if time_tag else ''
    }

    # Get stocks affected


    # Get content
    content = fetch_article_content(article_soup)
    article_info['content'] = content
    
    driver.quit()
    return article_info

def scrape_yfinance_news():
    driver = setup_selenium()
    driver.get('https://finance.yahoo.com/news')
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    class_name = "clamp tw-line-clamp-3 sm:tw-line-clamp-2 svelte-13zydns"

    links = [h3.find_parent('a')['href'] for h3 in soup.find_all('h3', class_=class_name)]
    driver.quit()

    news = []
    completed_articles = 0
    total_articles = len(links)

    # Lock for safely updating progress from multiple threads
    lock = threading.Lock()

    # Use ThreadPoolExecutor to fetch details concurrently
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_link = {executor.submit(fetch_article, link): link for link in links}
        for future in as_completed(future_to_link):
            link = future_to_link[future]
            try:
                additional_details = future.result()
                article_details = {
                    'title': additional_details.get('title', ''),
                    'link': link
                }
                article_details.update(additional_details)  # Merge fetched details
                news.append(article_details)
                
                with lock:
                    completed_articles += 1
                    print(f"Completed {completed_articles}/{total_articles} articles ({completed_articles/total_articles*100:.2f}%)")

            except Exception as e:
                print(f"Failed to fetch details for {link}: {str(e)}")
    
    return news

In [2]:
data = scrape_yfinance_news()
print(data)

Completed 1/24 articles (4.17%)
Completed 2/24 articles (8.33%)
Completed 3/24 articles (12.50%)
Completed 4/24 articles (16.67%)
Completed 5/24 articles (20.83%)
Completed 6/24 articles (25.00%)
Completed 7/24 articles (29.17%)
Completed 8/24 articles (33.33%)
Completed 9/24 articles (37.50%)
Completed 10/24 articles (41.67%)
Completed 11/24 articles (45.83%)
Completed 12/24 articles (50.00%)
Completed 13/24 articles (54.17%)
Completed 14/24 articles (58.33%)
Completed 15/24 articles (62.50%)
Completed 16/24 articles (66.67%)
Completed 17/24 articles (70.83%)
Completed 18/24 articles (75.00%)
Completed 19/24 articles (79.17%)
Completed 20/24 articles (83.33%)
Completed 21/24 articles (87.50%)
Completed 22/24 articles (91.67%)
Completed 23/24 articles (95.83%)
Completed 24/24 articles (100.00%)
[{'title': '', 'link': 'https://finance.yahoo.com/news/mexico-sheinbaum-wants-debt-laden-140000390.html', 'datetime': '2024-04-20T17:35:36.000Z', 'content': 'Tip: Try a valid symbol or a specifi

In [29]:
def transform(data:dict) -> pd.DataFrame:
    df = pd.DataFrame(data, columns=['title', 'link', 'stock', 'stock-effect'])
    return df

df = transform(data)
print(df.head())

                                               title  \
0   Tesla cuts US prices of Models Y, X, S by $2,000   
1  Musk Postpones India Visit, Citing Heavy Tesla...   
2  Senate passes reauthorization of key US survei...   
3  The drug war devastated Black and other minori...   
4  Record Store Day celebrates indie retail music...   

                                                link  stock  stock-effect  
0  https://finance.yahoo.com/news/tesla-cuts-us-p...    NaN           NaN  
1  https://finance.yahoo.com/news/musk-postpones-...    NaN           NaN  
2  https://finance.yahoo.com/news/senate-passes-r...    NaN           NaN  
3  https://finance.yahoo.com/news/drug-war-devast...    NaN           NaN  
4  https://finance.yahoo.com/news/record-store-da...    NaN           NaN  


In [28]:
def load(df:pd.DataFrame) -> None:
    '''loads data into PostgreSQL database'''
    

                                               title  \
0   Tesla cuts US prices of Models Y, X, S by $2,000   
1  Musk Postpones India Visit, Citing Heavy Tesla...   
2  Senate passes reauthorization of key US survei...   
3  The drug war devastated Black and other minori...   
4  Record Store Day celebrates indie retail music...   

                                                link  stock  stock-effect  
0  https://finance.yahoo.com/news/tesla-cuts-us-p...    NaN           NaN  
1  https://finance.yahoo.com/news/musk-postpones-...    NaN           NaN  
2  https://finance.yahoo.com/news/senate-passes-r...    NaN           NaN  
3  https://finance.yahoo.com/news/drug-war-devast...    NaN           NaN  
4  https://finance.yahoo.com/news/record-store-da...    NaN           NaN  
