### Attempt 1: Beautiful Soup

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
# URL of the Dow Jones SSO sign-in page; `login` POSTs the credentials to it.
# NOTE(review): the query string embeds `nonce` and `state` values that look
# session-specific -- confirm whether a hard-coded copy can still be valid.
wsj_login_page = 'https://sso.accounts.dowjones.com/login-page?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO&redirect_uri=https%3A%2F%2Fwww.wsj.com%2Fclient%2Fauth&response_type=code&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at%20created_at%20offline_access&ui_locales=en-us-x-wsj-223-2&nonce=1b34552c-ac89-4605-81d9-5b552a7452a0&state=nmfLpXnMOWJLlEjK.ifKgeElwMAfxoWUUCsEZVu2q-WgzWtwQKEy_EyvQbTs&resource=https%253A%252F%252Fwww.wsj.com%252F&protocol=oauth2&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO#!/signin'

In [5]:
def login(username, password):
    """POST the credentials to the WSJ login page and return the session.

    Parameters
    ----------
    username, password : str
        Sent as the ``username`` / ``password`` form fields.

    Returns
    -------
    requests.Session or None
        The still-open session when the server answers with HTTP 200,
        otherwise ``None`` (the status code is printed). The caller owns the
        returned session and should ``close()`` it when finished.
    """
    login_url = wsj_login_page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
    }
    login_data = {
        "username": username,
        "password": password
    }

    # Deliberately NOT a `with` block: the context manager would close the
    # session on exit, i.e. before the caller ever gets to use it.
    session = requests.Session()
    try:
        # Send a POST request with login credentials
        response = session.post(login_url, headers=headers, data=login_data)
    except Exception:
        # Do not leak the session if the request itself fails
        session.close()
        raise

    # Check if login was successful (you might need to customize this condition)
    if response.status_code == 200:
        return session

    # Nobody will use the session after a failed login, so release it here
    session.close()
    print("Login failed:", response.status_code)
    return None

In [6]:
def scrape_headlines(session, technology_url):
    """Fetch a section page through ``session`` and return its headline texts.

    The page is requested with a desktop-browser User-Agent. On HTTP 200 the
    HTML is parsed and the stripped text of every ``<a>`` element carrying
    the class ``WSJTheme--headline--unZqjb45`` is returned as a list. On any
    other status the code is printed and an empty list is returned.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
    }
    page = session.get(technology_url, headers=request_headers)

    # Guard clause: report the failure and hand back nothing
    if page.status_code != 200:
        print("Failed to access technology section page:", page.status_code)
        return []

    parsed = BeautifulSoup(page.text, "html.parser")
    headline_links = parsed.find_all("a", class_="WSJTheme--headline--unZqjb45")
    return [link.text.strip() for link in headline_links]

In [7]:
import os
from getpass import getpass

# Credentials are read from the environment (falling back to an interactive
# prompt) so that they are never stored in the notebook or its outputs.
username = os.environ.get('WSJ_USERNAME') or input('WSJ username: ')
password = os.environ.get('WSJ_PASSWORD') or getpass('WSJ password: ')
session = login(username, password)

Login failed: 404


In [None]:
# Only scrape when the login step actually produced a session
if session:
    # URL of the technology section
    technology_url = "https://www.wsj.com/news/technology"
    headlines = scrape_headlines(session, technology_url)
    print(headlines)

### Attempt 2: Selenium

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import datetime

In [20]:
# Day whose archive listing will be scraped
date = datetime.date(2023, 3, 19)

# URL path components; month and day are zero-padded to two digits
year = f"{date.year}"
month = f"{date.month:02d}"
day = f"{date.day:02d}"

# Archive pages are numbered starting at 1
page_num = 1

# Parallel result lists: one entry per scraped article
topics, headlines, published_times, article_links = [], [], [], []

In [21]:
# Start a WebDriver (this launches Microsoft Edge, so the Edge WebDriver must be
# available on this machine)
driver = webdriver.Edge()

# try/finally guarantees the browser is closed even if a page fails to load or parse
try:
    while True:
        # URL of the archive page to scrape
        url = f"https://www.wsj.com/news/archive/{year}/{month}/{day}?page={page_num}"

        # Open the webpage
        driver.get(url)

        # Parse the HTML content
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find all articles on the page
        articles = soup.find_all("article")

        # No more articles on this page
        if not articles:
            break

        # Extract data from each article. `find` returns None when an element
        # is missing, so each field falls back to None instead of crashing;
        # this also keeps the four lists the same length.
        for article in articles:
            # Extract topic
            topic_tag = article.find("div", class_="WSJTheme--articleType--34Gt-vdG")
            topics.append(topic_tag.text.strip() if topic_tag is not None else None)

            # Extract headline
            headline_tag = article.find("span", class_="WSJTheme--headlineText--He1ANr9C")
            headlines.append(headline_tag.text.strip() if headline_tag is not None else None)

            # Extract published time
            time_tag = article.find("p", class_="WSJTheme--timestamp--22sfkNDv")
            published_times.append(time_tag.text.strip() if time_tag is not None else None)

            # Extract article link (first anchor that carries an href)
            link_tag = article.find("a", href=True)
            article_links.append(link_tag["href"] if link_tag is not None else None)

        # Increment page
        page_num += 1
finally:
    # Close the WebDriver
    driver.quit()

# Create a Pandas DataFrame
data = {
    "Topic": topics,
    "Headline": headlines,
    "Published Time": published_times,
    "URL": article_links
}

df = pd.DataFrame(data)

In [23]:
# Number of articles scraped (row count of the DataFrame)
df.shape[0]

58

In [None]:
# Preview the first ten scraped rows
df.iloc[:10]

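Testing speed: the scraper wrapped in functions so that a range of dates can be scraped with a single shared WebDriver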

In [47]:
# Function to scrape all headlines on WSJ on a given day
def scrape_wsj_date(driver, date):
    """Scrape every page of the WSJ archive for one day into a DataFrame.

    Parameters
    ----------
    driver
        An already-started Selenium WebDriver; it is used to load each page
        and is not closed here.
    date
        Any object exposing integer ``year``, ``month`` and ``day``
        attributes (e.g. ``datetime.date`` or ``pandas.Timestamp``).

    Returns
    -------
    pandas.DataFrame
        One row per ``<article>`` element with the columns ``Topic``,
        ``Headline``, ``Published Time`` and ``URL``. A field is ``None``
        when the matching element is absent from the article. The columns
        are present even when no article was found.

    Notes
    -----
    Pages are requested as ``?page=1, 2, ...`` until a page contains no
    ``<article>`` element.
    NOTE(review): a later cell in this notebook defines another function with
    the same name that returns raw tags; whichever cell ran last is the one
    in effect.
    """
    columns = ['Topic', 'Headline', 'Published Time', 'URL']

    # Base URL
    base_url = f'https://www.wsj.com/news/archive/{str(date.year)}/{str(date.month).zfill(2)}/{str(date.day).zfill(2)}?page='

    def text_of(article, tag, css_class):
        # `find` returns None when the element is missing; propagate that
        # instead of failing on `.text`.
        node = article.find(tag, class_=css_class)
        return node.text.strip() if node is not None else None

    # One dict per article, converted to a DataFrame once at the end
    records = []

    # Create page counter
    page_num = 1

    # Loop through each page
    while True:
        # Open the webpage
        driver.get(base_url + str(page_num))

        # Parse the HTML content
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all articles on the page
        articles = soup.find_all('article')

        # Exit loop if no more articles on page
        if not articles:
            break

        # Extract data from each article
        for article in articles:
            link = article.find('a', href=True)
            records.append({
                'Topic': text_of(article, 'div', 'WSJTheme--articleType--34Gt-vdG'),
                'Headline': text_of(article, 'span', 'WSJTheme--headlineText--He1ANr9C'),
                'Published Time': text_of(article, 'p', 'WSJTheme--timestamp--22sfkNDv'),
                'URL': link['href'] if link is not None else None,
            })

        # Increment page
        page_num += 1

    # Passing `columns` keeps the schema stable even when `records` is empty
    return pd.DataFrame(records, columns=columns)

def scrape_wsj(start_date, end_date):
    """Scrape the WSJ archive for every day produced by ``pd.date_range``.

    Parameters
    ----------
    start_date, end_date
        Bounds passed straight to ``pd.date_range(start_date, end_date)``.

    Returns
    -------
    pandas.DataFrame
        The per-day frames returned by ``scrape_wsj_date`` concatenated with
        a fresh 0..n-1 index.

    Notes
    -----
    One Edge WebDriver is started for the whole run and is always closed,
    even when scraping a day raises. ``pd.concat`` is still called with
    whatever list was built, so an empty date range raises as before.
    """
    # Start a WebDriver (this launches Microsoft Edge, so the Edge WebDriver
    # must be available on this machine)
    driver = webdriver.Edge()

    try:
        # Generate a list of DataFrames, one per day
        dfs = [scrape_wsj_date(driver, current_date) for current_date in pd.date_range(start_date, end_date)]
    finally:
        # Close the WebDriver no matter how the scraping ended
        driver.quit()

    # Concatenate all DataFrames in the list into a single DataFrame
    return pd.concat(dfs, ignore_index=True)

Testing: a variant that only collects the raw `<article>` tags from each page, without parsing any fields

In [62]:
# Function to collect the raw article tags of every WSJ archive page for a given day
def scrape_wsj_date(driver, date):
    """Return the ``<article>`` tags of each archive page for ``date``.

    Pages are loaded through ``driver`` as ``?page=1, 2, ...``. The result
    is a list with one entry per page, each entry being what ``find_all``
    returned for that page. Iteration stops at the first page that has no
    ``<article>`` element; that page is not included.
    """
    archive_prefix = f'https://www.wsj.com/news/archive/{str(date.year)}/{str(date.month).zfill(2)}/{str(date.day).zfill(2)}?page='

    pages = []
    page_num = 1

    while True:
        # Load the next page and pull out its article tags
        driver.get(archive_prefix + str(page_num))
        found = BeautifulSoup(driver.page_source, 'html.parser').find_all('article')

        # An empty page marks the end of the archive for this day
        if not found:
            return pages

        pages.append(found)
        page_num += 1

def scrape_wsj(start_date, end_date):
    """Collect raw article tags for every day produced by ``pd.date_range``.

    Parameters
    ----------
    start_date, end_date
        Bounds passed straight to ``pd.date_range(start_date, end_date)``.

    Returns
    -------
    list
        One entry per day, each being the value returned by
        ``scrape_wsj_date`` for that day.

    Notes
    -----
    One Edge WebDriver is started for the whole run and is always closed,
    even when scraping a day raises.
    """
    # Start a WebDriver (this launches Microsoft Edge, so the Edge WebDriver
    # must be available on this machine)
    driver = webdriver.Edge()

    try:
        # Generate one result per day (these are not DataFrames)
        all_articles = [scrape_wsj_date(driver, current_date) for current_date in pd.date_range(start_date, end_date)]
    finally:
        # Close the WebDriver no matter how the scraping ended
        driver.quit()

    return all_articles

In [63]:
# Two-day window used for the trial run
start_date, end_date = datetime.date(2023, 3, 19), datetime.date(2023, 3, 20)
wsj = scrape_wsj(start_date, end_date)