In [50]:
# For downloading historical market data:
import yfinance as yf

# For data manipulation:
import pandas as pd

# For working with dates and times:
from datetime import datetime, timedelta

# For timezone support:
import pytz

# <font color='pink'>Utility Functions</font>


Removing seconds from datetime strings in a Dataframe.

In [51]:
def removeSeconds(df):
    modified_dates=[]
    for date in df['Date']:

        format = '%Y-%m-%d %H:%M:%S%z'

        # Converting stirng to datetime object using the above format:
        date = datetime.strptime(date, format)

        # Removing seconds from the datetime object:
        modified_date = date.replace(second=0)
        
        modified_dates.append(modified_date)
    df['Date']=modified_dates
    return df

Converting datetime values in a DataFrame to a specific timezone.

In [52]:
def convertTimezone(timezone,df,name):

    # The target timezone:
    target_tz = pytz.timezone(timezone)

    # If we're using DataFrame index:
    if df.index is not None:
        # Converting the datetime to the target timezone:
        df.index = pd.to_datetime(df.index, utc=True).tz_convert(timezone)

    # If we're not using DataFrame index:
    else:
        # Converting the datetime to the target timezone:
        df[name] = pd.to_datetime(df[name],utc=True).dt.tz_convert(target_tz)
    return df

# NOTES:
    # 'utc=True' for ensuring that all datetime values are in UTC before any operations
    # ... to avoid issues related to timezone conversions

Retrieving stock prices for a specified date range using Yahoo Finance API 'yfinance'.

In [53]:
def getStockPrices(start_date,end_date,ticker):

    # Converting start and end date to datetime objects:
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")

    # Interval for stock price data retrieval (1 minute):
    interval = "1m"                     

    # Assigning start_date to current_date:
    current_date = start_date
    
    # List that will contain all retrieved stock price data:
    data_list = []  

    todays_date = datetime.now()

    while current_date < end_date:

        # Since we're downloading stock price data in chunks of up to 7 days to avoid API limitations,
        # ... we must calculate the remaining days at each iteration:
        remaining_days = (end_date - current_date).days
        
        # If remaining_days < 7, we download the data until end_date
        if remaining_days < 7:
            print(f"getting dates for {current_date}->{next_date}")
            data = yf.download(tickers=ticker, start=current_date, end=end_date, interval=interval)

            # We must convert the data to a specified timezone:
            data = convertTimezone('UTC', data, 'Datetime')
            
            data_list.append(data)

            # Setting current_date to end_date to exit the loop
            current_date = end_date

        # If remaining_days>=7, we download data for next 7 days:
        else:
            next_date = current_date + timedelta(days=7)

            print(f"getting dates for {current_date}->{next_date}")
            data = yf.download(tickers=ticker, start=current_date, end=next_date, interval=interval)

            # We must convert the data to a specified timezone:
            data = convertTimezone('UTC', data, 'Datetime')
            
            data_list.append(data)

            # Updating current_date for the next iteration:
            current_date = next_date

    final_data = pd.concat(data_list)
    final_data.reset_index(inplace=True)
    if 'index' in final_data.columns:  
        final_data.rename(columns={'index': 'Datetime'}, inplace=True)  
    return final_data

# Notes:
    # We must convert the data to a specified timezone, 
    # ... to ensure that all datetime values in the dataset are in the same timezone,
    # ... for consistency and comparison purposes.


Adding stock prices to a DataFrame based on matching dates

In [54]:
def addStockPrices(stockPrices,df):

    # Ensuring that both datasets have their datetime columns properly formatted as strings,
    # ... for accurate matching.
    stockPrices['Datetime'] = pd.to_datetime(stockPrices['Datetime'],utc=True)
    stockPrices['Datetime'] = stockPrices['Datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

    df['Date'] = pd.to_datetime(df['Date'],utc=True)
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # This list will store retrieved stock prices:
    stock_prices = []

    for index, row in df.iterrows():

        publication_date = row['Date']
       
        # If the publication date exists in teh stockPrices DataFrame,
        # ... we retrieve the corresponding stock price from stockPrices DataFrame
        if publication_date in stockPrices['Datetime'].values:
            stock_price = stockPrices.loc[stockPrices['Datetime'] == publication_date, 'Adj Close'].values[0]
            stock_prices.append(stock_price)
        else:
            stock_prices.append(None)  
    df['stock_price'] = stock_prices
    df= df.dropna(subset=['stock_price'])
    return df

Adding stock prices after a specified time period to a DataFrame based on publication dates.

In [55]:
def addStockPricesAfter(stockPrices,df,time):

    # Ensuring that both datasets have their datetime columns properly formatted as strings,
    # ... for accurate matching.
    stockPrices['Datetime'] = pd.to_datetime(stockPrices['Datetime'],utc=True)
    stockPrices['Datetime'] = stockPrices['Datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

    df['Date'] = pd.to_datetime(df['Date'],utc=True)

    # Adding the specified time period to the publication dates
    df['Date']+=timedelta(minutes=time)

    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # This list will store retrieved stock prices:
    stock_prices = []

 
    for index, row in df.iterrows():
        publication_date = row['Date']
              
        # If the publication date exists in teh stockPrices DataFrame,
        # ... we retrieve the corresponding stock price from stockPrices DataFrame
        if publication_date in stockPrices['Datetime'].values:
           
            stock_price = stockPrices.loc[stockPrices['Datetime'] == publication_date, 'Adj Close'].values[0]
            stock_prices.append(stock_price)
        else:
            stock_prices.append(None) 


    df[f'stock_price_after_{time}_mins'] = stock_prices
    df= df.dropna(subset=[f'stock_price_after_{time}_mins'])
    return df

Labeling the Data based on stock price changes after a certain period of time.

In [56]:
def label(df,time,down,up):
    # List to store labels indicating stock price changes
    labels=[]

    down_threshold = down
    up_threshold = up  
    
    # List to store percentage changes in stock prices
    change_percentages=[]

    # column name of stock prices based on the time:
    name=f'stock_price_after_{time}_mins'

    for index, row in df.iterrows():

        change_percentage=(row[name] - row['stock_price'] )/ row['stock_price'] * 100
        # Calculating the percentage change in stock price after a certain time
        change_percentages.append(change_percentage)
        
        # Labeling the data:
        if change_percentage>up_threshold:
            labels.append(1)
        elif change_percentage<down_threshold:
            labels.append(-1)
        else:
            labels.append(0)
    df['change_percentage']=change_percentages
    df['label']=labels
    return df

Creating a CSV file

In [57]:
def turnToCSV(df,name):
    df.to_csv(fr'..\data\{name}.csv',index=False)

# <font color='pink'>Main Program</font>

Reading our News dataset tha contains all of the News articles and their publication dates

In [58]:
News=pd.read_csv(r'..\data\News.csv')
News.head(1)


Unnamed: 0,Date,article_title,article,source_name,source_link,ticker_symbol
0,2024-05-01 10:38:36+00:00,Apple set for big sales decline as investors a...,By Yuvraj Malik(Reuters) - Apple's plan to add...,Yahoo Finance,https://finance.yahoo.com/news/apple-set-big-s...,AAPL


Creating the stockPrices dataset

In [59]:
# Getting the start date from the News dataset and the end date:
start_date = News['Date'].min().split()[0] 
end_date =  News['Date'].max().split()[0]  
print("Starting Date:",start_date)
print("Ending Date:",end_date)

Starting Date: 2024-03-05
Ending Date: 2024-05-01


In [60]:
# Creating a CSV file with all the stock prices corresponding to these dates:
#stockPrices=getStockPrices(start_date,end_date,'AAPL')

try:
    existing_data = pd.read_csv(r"..\data\StockPrices.csv")
except FileNotFoundError:
    existing_data = pd.DataFrame()

new_data=getStockPrices(start_date,end_date,'AAPL')

# Appending new data to existing DataFrame
stockPrices = pd.concat([existing_data, new_data])
stockPrices.drop_duplicates(subset='Datetime', keep='first', inplace=True)

turnToCSV(stockPrices,'stockPrices')
stockPrices

getting dates for 2024-03-05 00:00:00->2024-03-12 00:00:00
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- AAPL: 1m data not available for startTime=1709589600 and endTime=1710194400. The requested range must be within the last 30 days.
getting dates for 2024-03-12 00:00:00->2024-03-19 00:00:00
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- AAPL: 1m data not available for startTime=1710194400 and endTime=1710799200. The requested range must be within the last 30 days.
getting dates for 2024-03-19 00:00:00->2024-03-26 00:00:00
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- AAPL: 1m data not available for startTime=1710799200 and endTime=1711404000. The requested range must be within the last 30 days.
getting dates for 2024-03-26 00:00:00->2024-04-02 00:00:00
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- AAPL: 1m d

Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume
0,2024-03-05 14:30:00+00:00,171.000000,171.000000,170.610001,170.919998,170.919998,5958861.0
1,2024-03-05 14:31:00+00:00,171.020004,171.074997,171.020004,171.059998,171.059998,926341.0
2,2024-03-05 14:32:00+00:00,171.335999,171.339996,171.270004,171.300003,171.300003,703908.0
3,2024-03-05 14:33:00+00:00,171.929993,171.970001,171.875000,171.895004,171.895004,994237.0
4,2024-03-05 14:34:00+00:00,171.625000,171.740005,171.580002,171.610001,171.610001,626087.0
...,...,...,...,...,...,...,...
8172,2024-04-30 19:55:00+00:00,171.309998,171.360001,170.514999,170.759995,170.759995,1272491.0
8173,2024-04-30 19:56:00+00:00,170.744003,170.779999,170.320007,170.419998,170.419998,855457.0
8174,2024-04-30 19:57:00+00:00,170.514999,170.570007,170.020004,170.549194,170.549194,1178925.0
8175,2024-04-30 19:58:00+00:00,170.545105,170.570007,170.300003,170.449997,170.449997,993418.0


Getting the stock prices before and after a certain amount of time corresponding to the publication date.

In [61]:
# First, we need to remove the seconds from the dates since 'yfinance operates on minute-based intervals:
News=removeSeconds(News)

# Adding the stock prices corresponding to the publication date:
News=addStockPrices(stockPrices,News)

# Adding stock prices after 30 minutes of the publication date:
News=addStockPricesAfter(stockPrices,News,30)

News.head(1)

Unnamed: 0,Date,article_title,article,source_name,source_link,ticker_symbol,stock_price,stock_price_after_30_mins
6,2024-04-30 18:37:00,Apple's Secret AI Lab in Zurich Poised to Enha...,Apple Inc (NASDAQ:AAPL) has aggressively recru...,Yahoo Finance,https://finance.yahoo.com/news/apples-secret-a...,AAPL,173.6577,173.251602


Labeling the data.

In [62]:
News=label(News,30,-0.1,0.1)
turnToCSV(News,'NewsWithStockPrice')

News

Unnamed: 0,Date,article_title,article,source_name,source_link,ticker_symbol,stock_price,stock_price_after_30_mins,change_percentage,label
6,2024-04-30 18:37:00,Apple's Secret AI Lab in Zurich Poised to Enha...,Apple Inc (NASDAQ:AAPL) has aggressively recru...,Yahoo Finance,https://finance.yahoo.com/news/apples-secret-a...,AAPL,173.657700,173.251602,-0.233849,-1
7,2024-04-30 18:23:00,"Apple Likely to Meet Q2 Guidance, But Q3 Outlo...","Apple Likely to Meet Q2 Guidance, But Q3 Outlo...",Yahoo Finance,https://finance.yahoo.com/news/apple-likely-me...,AAPL,173.654999,173.339996,-0.181396,-1
8,2024-04-30 17:18:00,Q1 Earnings Season Scorecard and Fresh Researc...,"Tuesday, April 30, 2024Today's Research Daily ...",Yahoo Finance,https://finance.yahoo.com/news/q1-earnings-sea...,AAPL,173.274994,173.296494,0.012408,0
17,2024-04-29 17:19:00,Apple's iPadOS will have to comply with EU's D...,The European Union will apply its flagship mar...,Yahoo Finance,https://finance.yahoo.com/news/apples-ipados-c...,AAPL,174.359894,174.442505,0.047380,0
22,2024-04-29 19:59:00,Apple Earnings: Time to Take a Bite?,"It’s another busy slate of earnings this week,...",Yahoo Finance,https://finance.yahoo.com/news/apple-earnings-...,AAPL,173.824997,173.490005,-0.192718,-1
...,...,...,...,...,...,...,...,...,...,...
982,2024-03-05 17:40:00,"Goodbye, Project Titan","In this podcast, Motley Fool analyst Asit Shar...",Yahoo Finance,https://finance.yahoo.com/quote/AAPL/news/good...,,170.503799,170.317596,-0.109208,-1
983,2024-03-05 17:29:00,Apple Stock Gloom Deepens as Pressure to Show ...,(Bloomberg) -- Apple Inc.’s move to shutter it...,Yahoo Finance,https://finance.yahoo.com/quote/AAPL/news/appl...,,170.259201,170.785004,0.308825,1
984,2024-03-05 17:25:00,What the shakeup in Magnificent 7 stocks means...,Stock market indices have been propped up on t...,Yahoo Finance,https://finance.yahoo.com/quote/AAPL/video/sha...,,170.160004,170.804993,0.379049,1
985,2024-03-05 17:16:00,Alphabet (GOOGL) Enhances Google Maps With New...,Alphabet’s GOOGL Google is gaining strong mome...,Yahoo Finance,https://finance.yahoo.com/quote/AAPL/news/alph...,,169.865005,170.692505,0.487151,1
