In [1]:
!pip install requests beautifulsoup4 pandas

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import requests
from bs4 import BeautifulSoup

# Archive page to scrape
url = "https://www.wsj.com/news/archive/2018/01/01?page=1"

# Send a browser-like User-Agent header with the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Send the GET request. requests applies no timeout unless one is given, so
# without it a stalled connection would block this cell indefinitely; with it
# the call raises requests.exceptions.Timeout instead.
response = requests.get(url, headers=headers, timeout=30)

# Report whether the server answered with HTTP 200
if response.status_code == 200:
    print("Page successfully fetched")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Page successfully fetched


In [8]:
# Build the parse tree from the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(response.content, "html.parser")

# Preview only the first 1000 characters of the pretty-printed markup
# to get a feel for the page structure
html_preview = soup.prettify()[:1000]
print(html_preview)


<!DOCTYPE html>
<!--GRAND CANYON PREBID -->
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="Search WSJ's digital archive of news articles and top headlines from January 1, 2018" name="description"/>
  <meta content="News, WSJ, Archives, News Archives, business news, news articles, markets news, world business, newspaper archives, headlines, today's news, yesterday's news" name="keywords"/>
  <meta content="Archive" name="page.section"/>
  <meta content="NewsArchive" name="page.subsection"/>
  <meta content="wsj_newsarchive" name="page.id"/>
  <meta content="wsj_newsarchive" name="ad.id"/>
  <meta content="summary" name="twitter:card"/>
  <meta content="The Wall Street Journals' News Archive for January 1, 2018" name="twitter:title"/>
  <meta content="Search WSJ's digital archive of news articles and top headlines from January 1, 2018" name="twitter:description"/>
  <meta content="https://s.wsj.net/img/meta/wsj-social-share.png" name="twitter:image"/>
  <meta content

In [9]:
# For every headline <h2>, look up the <span> that carries the headline text
headline_spans = (
    h2.find('span', class_='WSJTheme--headlineText--He1ANr9C')
    for h2 in soup.find_all('h2', class_='WSJTheme--headline--unZqjb45')
)

# Keep the stripped text of each span that was actually found
headlines = [span.text.strip() for span in headline_spans if span]

# Show the headlines as a numbered list, counting from 1
for position, title in enumerate(headlines, start=1):
    print(f"{position}. {title}")


1. Georgia Outguns Oklahoma in Rose Bowl Epic
2. Pension Funds Ask: What to Buy When Nothing Is Cheap?
3. In Cities With Low Unemployment, Wages Finally Start to Get Bigger
4. A Browser You’ve Never Heard of Is Dethroning Google in Asia
5. 2017 Marked Safest Year in Commercial Aviation History
6. China Private Factory Gauge at Odds With Official PMI
7. The Limits of Amazon
8. When to Worry About a Bitcoin Bubble
9. Regulatory Monitors Save Shareholders Money
10. Menlo Therapeutics Files for $98 Million IPO
11. Smaller Chinese Banks Brace for Tighter Oversight
12. Europe Readies for Brexit, Mifid, Perhaps the Banking Union
13. Congress to Tackle Dodd-Frank Rollback, Fannie-Freddie Overhaul
14. Trump Backs Protesters in Iran
15. Bitcoin a New Kind of Test for CFTC
16. How the SEC Might Spur More IPOs
17. Why 2018 Could Be a Good Year for Fintech in Washington
18. What’s in Store for the CFPB Under Republican Leadership?
19. Where’s the Line on Bank Deregulation?
20. Costa Rica Probes Cau

In [11]:
import csv
from datetime import datetime, timedelta
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Function to fetch and parse the headlines for a given date
def fetch_headlines_for_date(date, page=1, timeout=30):
    """Fetch one WSJ news-archive page and return the articles listed on it.

    Args:
        date: Archive date as it appears in the URL path; the caller in this
            notebook passes 'YYYY/MM/DD' strings.
        page: Value for the ``page`` query parameter. Defaults to 1, the only
            page this notebook requests.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of dicts with keys 'date', 'headline' and 'url', one per
        headline found on the page. The list is empty when the request
        raises a ``requests.RequestException`` (connection error, timeout, ...)
        or the response status is not 200.
    """
    url = f"https://www.wsj.com/news/archive/{date}?page={page}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A network error or timeout on one day must not abort a multi-day scrape,
    # so it is reported and handled like any other failed fetch.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {url}: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract headlines and article links; an <h2> is skipped unless it holds
    # both the headline <span> and an <a> with an href.
    articles = []
    for h2_tag in soup.find_all('h2', class_='WSJTheme--headline--unZqjb45'):
        headline = h2_tag.find('span', class_='WSJTheme--headlineText--He1ANr9C')
        article_link_tag = h2_tag.find('a', href=True)
        if headline and article_link_tag:
            article_url = article_link_tag['href']
            # Site-relative links are made absolute
            if article_url.startswith('/'):
                article_url = 'https://www.wsj.com' + article_url
            articles.append({
                'date': date,
                'headline': headline.text.strip(),
                'url': article_url
            })

    return articles

# Function to save the headlines to a CSV file
def save_to_csv(data, start_date, end_date, output_dir=None):
    """Append article rows to the CSV file belonging to a scrape range.

    The file is named ``wsj_headlines_<start>_<end>.csv`` with both dates
    formatted as YYYYMMDD. It is opened in append mode, so calling this again
    for the same range adds rows to the existing file rather than replacing it.

    Args:
        data: Iterable of dicts with the keys 'date', 'headline' and 'url'.
        start_date: First day of the scrape range; must support ``strftime``.
        end_date: Last day of the scrape range; must support ``strftime``.
        output_dir: Directory to write into. Defaults to the current user's
            ``Downloads`` folder instead of a path tied to one machine. The
            directory is created if it does not exist.
    """
    target_dir = Path.home() / "Downloads" if output_dir is None else Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    start_stamp = start_date.strftime("%Y%m%d")
    end_stamp = end_date.strftime("%Y%m%d")
    filename = target_dir / f'wsj_headlines_{start_stamp}_{end_stamp}.csv'

    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['date', 'headline', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if csvfile.tell() == 0:
            writer.writeheader()

        writer.writerows(data)

# Function to scrape headlines for every day from start_date through end_date (inclusive)
def scrape_headlines(start_date, end_date):
    """Scrape and store headlines for each day from start_date through end_date.

    Days are visited in order, one calendar day apart, starting at
    ``start_date`` and continuing while the day is not after ``end_date``.
    Each day's articles are fetched with ``fetch_headlines_for_date`` and,
    when any were found, handed to ``save_to_csv`` together with the range
    bounds. Nothing happens when ``start_date`` is after ``end_date``.
    """
    # Whole days between the bounds, plus one so the last day is included;
    # a reversed range gives a non-positive count and therefore no iterations.
    day_count = (end_date - start_date).days + 1

    for offset in range(day_count):
        day = start_date + timedelta(days=offset)
        date_str = day.strftime('%Y/%m/%d')
        print(f"Scraping headlines for {date_str}")

        articles = fetch_headlines_for_date(date_str)
        if not articles:
            # Nothing was retrieved for this day, so there is nothing to write
            continue

        save_to_csv(articles, start_date, end_date)

# Set the date range to scrape: 2018-02-01 through 2018-02-28 (inclusive)
start_date = datetime(year=2018, month=2, day=1)
end_date = datetime(year=2018, month=2, day=28)

# Walk the range one day at a time; each day's headlines are appended to the CSV
scrape_headlines(start_date=start_date, end_date=end_date)


Scraping headlines for 2018/02/01
Scraping headlines for 2018/02/02
Scraping headlines for 2018/02/03
Scraping headlines for 2018/02/04
Scraping headlines for 2018/02/05
Scraping headlines for 2018/02/06
Scraping headlines for 2018/02/07
Scraping headlines for 2018/02/08
Scraping headlines for 2018/02/09
Scraping headlines for 2018/02/10
Scraping headlines for 2018/02/11
Scraping headlines for 2018/02/12
Scraping headlines for 2018/02/13
Scraping headlines for 2018/02/14
Scraping headlines for 2018/02/15
Scraping headlines for 2018/02/16
Scraping headlines for 2018/02/17
Scraping headlines for 2018/02/18
Scraping headlines for 2018/02/19
Scraping headlines for 2018/02/20
Scraping headlines for 2018/02/21
Scraping headlines for 2018/02/22
Scraping headlines for 2018/02/23
Scraping headlines for 2018/02/24
Scraping headlines for 2018/02/25
Scraping headlines for 2018/02/26
Scraping headlines for 2018/02/27
Scraping headlines for 2018/02/28
