In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd 

In [None]:
"""
def collect_headlines(date):
    headlines = []
    formatted_date = datetime.strptime(date, '%Y-%m-%d').strftime('%Y-%m-%d')

    # Example URL structure; you need to find the correct URL structure for each source

    # Scraping CNBC headlines
    cnbc_url = f"https://www.cnbc.com/us-economy?date={formatted_date}"
    response = requests.get(cnbc_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    cnbc_headlines = soup.find_all('a', class_='Card-title')
    for headline in cnbc_headlines:
      headlines.append({'source': 'CNBC', 'text': headline.get_text(strip=True)})

    # Scraping Guardian headlines
    guardian_url = f"https://www.theguardian.com/money?date={formatted_date}"
    response = requests.get(guardian_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    guardian_headlines = soup.find_all('a', class_='dcr-lv2v9o')
    for headline in guardian_headlines:
        headlines.append({'source': 'Guardian', 'text': headline.get_text(strip=True)})

    # Scraping Reuters headlines
    #reuters_url = f"https://www.reuters.com/markets/us/?date={formatted_date}"
    #response = requests.get(reuters_url)
    #soup = BeautifulSoup(response.content, 'html.parser')
    #reuters_headlines = soup.find_all('a', {'Title': 'headline'})
    #for headline in reuters_headlines:
      #headlines.append({'source': 'Reuters', 'text': headline.get_text(strip=True)})
   
    return headlines
"""

In [2]:
def fetch_spy_data(date):
    # Ensure date is a datetime object
    if isinstance(date, str):
        date = datetime.strptime(date, '%Y-%m-%d')
    spy = yf.Ticker("SPY")

    # Define the start and end dates
    start_date = date.strftime('%Y-%m-%d')
    end_date = (date + timedelta(days=7)).strftime('%Y-%m-%d')

    data = spy.history(start=start_date, end=end_date)  # Get data for the specific date
    if not data.empty:
        # Calculate the difference as a percentage of the 'Open' price and add as a new column
        data['Difference (%)'] = ((data['Close'] - data['Open']) / data['Open']) * 100
        
        # Convert the relevant data to a dictionary and return
        return data[['Open', 'Close', 'Difference (%)']].reset_index().to_dict(orient='records')
    else:
        return []

In [3]:

def collect_data_with_spy(date):
    try:
        # Collect headlines for the given date
        headlines = pd.read_csv("../data/combined_headlines_new.csv")  

        # Convert date string to datetime object
        headline_date = datetime.strptime(date, '%Y-%m-%d')

        # Fetch SPY data for the given date
        spy_data = fetch_spy_data(headline_date)

        # Return collected data as a dictionary
        return {
            'headlines': headlines,
            'spy_data': spy_data
        }
    except Exception as e:
        print(f"An error occurred: {e}")
        return {
            'headlines': pd.DataFrame(),
            'spy_data': []
        }
    
    print(headlines)
    print(spy_data)

In [4]:
# Step 1: Prepare SPY data
spy_data = []
for date in pd.date_range(start='2018-03-20', end='2020-07-17'):
    result = collect_data_with_spy(date.strftime('%Y-%m-%d'))
    spy_data.extend(result['spy_data'])

spy_df = pd.DataFrame(spy_data)
spy_df['date'] = pd.to_datetime(spy_df['Date']).dt.tz_localize(None)  # Remove timezone info
spy_df.rename(columns={'Date': 'date'}, inplace=True)

In [5]:
# Remove any duplicate columns if they exist
spy_df = spy_df.loc[:, ~spy_df.columns.duplicated()]
spy_df['date'] = pd.to_datetime(spy_df['date']).dt.tz_localize(None)

# Print DataFrames for debugging
print("\nSPY DataFrame Columns:")
print(spy_df.columns)
print("SPY DataFrame:")
print(spy_df.head())


SPY DataFrame Columns:
Index(['date', 'Open', 'Close', 'Difference (%)'], dtype='object')
SPY DataFrame:
        date        Open       Close  Difference (%)
0 2018-03-20  244.465486  244.474518        0.003694
1 2018-03-21  244.429416  244.005341       -0.173496
2 2018-03-22  241.731570  237.905884       -1.582617
3 2018-03-23  238.357023  232.835007       -2.316700
4 2018-03-26  236.516313  239.205109        1.136833


In [6]:
# Step 2: Prepare headline data
headline_data = []
for date in pd.date_range(start='2018-03-20', end='2020-07-17'):
    result = collect_data_with_spy(date.strftime('%Y-%m-%d'))
    for headline in result['headlines'].to_dict('records'):
        headline_data.append({
            'date': date,          
            'headline': headline.get('headlines', '')  # Use .get() to handle missing keys
        })
headline_df = pd.DataFrame(headline_data)

In [7]:
# Remove any duplicate columns if they exist
headline_df = headline_df.loc[:, ~headline_df.columns.duplicated()]
headline_df['date'] = pd.to_datetime(headline_df['date']).dt.tz_localize(None)
print(headline_df)

               date                                           headline
0        2018-03-20  Jim Cramer : A better way to invest in the Cov...
1        2018-03-20    Cramer's lightning round : I would own Teradyne
2        2018-03-20  Cramer's week ahead : Big week for earnings , ...
3        2018-03-20  IQ Capital CEO Keith Bliss says tech and healt...
4        2018-03-20  Wall Street delivered the 'kind of pullback I'...
...             ...                                                ...
45383825 2020-07-17  Malaysia says never hired British data firm at...
45383826 2020-07-17  Prosecutors search Volkswagen headquarters in ...
45383827 2020-07-17   McDonald's sets greenhouse gas reduction targets
45383828 2020-07-17  Pratt & Whitney to deliver spare A320neo engin...
45383829 2020-07-17  UK will always consider ways to improve data l...

[45383830 rows x 2 columns]


In [8]:
# Merge headline data with SPY data
merged_df = pd.merge(headline_df, spy_df, on='date', how='left')

In [9]:
# Save final dataset
merged_df.to_csv('final_dataset_with_stock_price.csv', index=False)

In [20]:
# Print the final DataFrame for verification
print(merged_df)

                date                                           headline  \
0         2018-03-20  Jim Cramer : A better way to invest in the Cov...   
1         2018-03-20    Cramer's lightning round : I would own Teradyne   
2         2018-03-20  Cramer's week ahead : Big week for earnings , ...   
3         2018-03-20  IQ Capital CEO Keith Bliss says tech and healt...   
4         2018-03-20  Wall Street delivered the 'kind of pullback I'...   
...              ...                                                ...   
232252145 2020-07-17  UK will always consider ways to improve data l...   
232252146 2020-07-17  UK will always consider ways to improve data l...   
232252147 2020-07-17  UK will always consider ways to improve data l...   
232252148 2020-07-17  UK will always consider ways to improve data l...   
232252149 2020-07-17  UK will always consider ways to improve data l...   

                Close  
0          244.474564  
1          244.474564  
2          244.474564  
3  