In [1]:
# For downloading historical market data:
import yfinance as yf

# For data manipulation:
import pandas as pd

# For working with dates and times:
from datetime import datetime, timedelta

# For timezone support:
import pytz

# <font color='pink'>Utility Functions</font>


Removing seconds from datetime strings in a Dataframe.

In [2]:
def removeSeconds(df):
    modified_dates=[]
    for date in df['Date']:

        format = '%Y-%m-%d %H:%M:%S%z'

        # Converting stirng to datetime object using the above format:
        date = datetime.strptime(date, format)

        # Removing seconds from the datetime object:
        modified_date = date.replace(second=0)
        
        modified_dates.append(modified_date)
    df['Date']=modified_dates
    return df

Converting datetime values in a DataFrame to a specific timezone.

In [3]:
def convertTimezone(timezone,df,name):

    # The target timezone:
    target_tz = pytz.timezone(timezone)

    # If we're using DataFrame index:
    if df.index is not None:
        # Converting the datetime to the target timezone:
        df.index = pd.to_datetime(df.index, utc=True).tz_convert(timezone)

    # If we're not using DataFrame index:
    else:
        # Converting the datetime to the target timezone:
        df[name] = pd.to_datetime(df[name],utc=True).dt.tz_convert(target_tz)
    return df

# NOTES:
    # 'utc=True' for ensuring that all datetime values are in UTC before any operations
    # ... to avoid issues related to timezone conversions

Retrieving stock prices for a specified date range using Yahoo Finance API 'yfinance'.

In [4]:
def getStockPrices(start_date,end_date,ticker):

    # Converting start and end date to datetime objects:
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")

    # Interval for stock price data retrieval (1 minute):
    interval = "1m"                     

    # Assigning start_date to current_date:
    current_date = start_date
    
    # List that will contain all retrieved stock price data:
    data_list = []  

    todays_date = datetime.now()

    while current_date < end_date:

        # Since we're downloading stock price data in chunks of up to 7 days to avoid API limitations,
        # ... we must calculate the remaining days at each iteration:
        remaining_days = (end_date - current_date).days
        
        # If remaining_days < 7, we download the data until end_date
        if remaining_days < 7:
            print(f"getting dates for {current_date}->{next_date}")
            data = yf.download(tickers=ticker, start=current_date, end=end_date, interval=interval)

            # We must convert the data to a specified timezone:
            data = convertTimezone('UTC', data, 'Datetime')
            
            data_list.append(data)

            # Setting current_date to end_date to exit the loop
            current_date = end_date

        # If remaining_days>=7, we download data for next 7 days:
        else:
            next_date = current_date + timedelta(days=7)

            print(f"getting dates for {current_date}->{next_date}")
            data = yf.download(tickers=ticker, start=current_date, end=next_date, interval=interval)

            # We must convert the data to a specified timezone:
            data = convertTimezone('UTC', data, 'Datetime')
            
            data_list.append(data)

            # Updating current_date for the next iteration:
            current_date = next_date

    final_data = pd.concat(data_list)
    final_data.reset_index(inplace=True)
    if 'index' in final_data.columns:  
        final_data.rename(columns={'index': 'Datetime'}, inplace=True)  
    return final_data

# Notes:
    # We must convert the data to a specified timezone, 
    # ... to ensure that all datetime values in the dataset are in the same timezone,
    # ... for consistency and comparison purposes.


Adding stock prices to a DataFrame based on matching dates

In [5]:
def addStockPrices(stockPrices,df):

    # Ensuring that both datasets have their datetime columns properly formatted as strings,
    # ... for accurate matching.
    stockPrices['Datetime'] = pd.to_datetime(stockPrices['Datetime'],utc=True)
    stockPrices['Datetime'] = stockPrices['Datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

    df['Date'] = pd.to_datetime(df['Date'],utc=True)
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # This list will store retrieved stock prices:
    stock_prices = []

    for index, row in df.iterrows():

        publication_date = row['Date']
       
        # If the publication date exists in teh stockPrices DataFrame,
        # ... we retrieve the corresponding stock price from stockPrices DataFrame
        if publication_date in stockPrices['Datetime'].values:
            stock_price = stockPrices.loc[stockPrices['Datetime'] == publication_date, 'Adj Close'].values[0]
            stock_prices.append(stock_price)
        else:
            stock_prices.append(None)  
    df['stock_price'] = stock_prices
    df= df.dropna(subset=['stock_price'])
    return df

Adding stock prices after a specified time period to a DataFrame based on publication dates.

In [6]:
def addStockPricesAfter(stockPrices,df,time):

    # Ensuring that both datasets have their datetime columns properly formatted as strings,
    # ... for accurate matching.
    stockPrices['Datetime'] = pd.to_datetime(stockPrices['Datetime'],utc=True)
    stockPrices['Datetime'] = stockPrices['Datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

    df['Date'] = pd.to_datetime(df['Date'],utc=True)

    # Adding the specified time period to the publication dates
    df['Date']+=timedelta(minutes=time)

    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # This list will store retrieved stock prices:
    stock_prices = []

 
    for index, row in df.iterrows():
        publication_date = row['Date']
              
        # If the publication date exists in teh stockPrices DataFrame,
        # ... we retrieve the corresponding stock price from stockPrices DataFrame
        if publication_date in stockPrices['Datetime'].values:
           
            stock_price = stockPrices.loc[stockPrices['Datetime'] == publication_date, 'Adj Close'].values[0]
            stock_prices.append(stock_price)
        else:
            stock_prices.append(None) 


    df[f'stock_price_after_{time}_mins'] = stock_prices
    df= df.dropna(subset=[f'stock_price_after_{time}_mins'])
    return df

Labeling the Data based on stock price changes after a certain period of time.

In [7]:
def label(df,time):
    # List to store labels indicating stock price changes
    labels=[]

    # List to store percentage changes in stock prices
    change_percentages=[]

    # column name of stock prices based on the time:
    name=f'stock_price_after_{time}_mins'

    for index, row in df.iterrows():

        # Calculating the percentage change in stock price after a certain time
        change_percentages.append((row[name] - row['stock_price'] )/ row['stock_price'] * 100)
        
        # Labeling the data:
        if row[name]>row['stock_price']:
            labels.append(1)
        elif row[name]<row['stock_price']:
            labels.append(-1)
        else:
            labels.append(0)
    df['change_percentage']=change_percentages
    df['label']=labels
    return df

Creating a CSV file

In [8]:
def turnToCSV(df,name):
    df.to_csv(fr'C:\Users\Legion\Desktop\FinalYearProject\data\{name}.csv',index=False)

# <font color='pink'>Main Program</font>

Reading our News dataset tha contains all of the News articles and their publication dates

In [9]:
News=pd.read_csv(r'C:\Users\Legion\Desktop\FinalYearProject\data\News.csv')
News.head(1)


Unnamed: 0,Date,article_title,article,source_name,source_link
0,2024-04-09 00:05:00+00:00,"Meet the Cheapest ""Magnificent Seven"" Stock Ac...","Apple (NASDAQ: AAPL), Microsoft, Nvidia, Amazo...",Yahoo Finance,https://finance.yahoo.com/quote/AAPL/news/meet...


Creating the stockPrices dataset

In [10]:
# Getting the start date from the News dataset and the end date:
start_date = News['Date'].min().split()[0] 
end_date =  News['Date'].max().split()[0]  
print(start_date)
print(end_date)

# Creating a CSV file with all the stock prices corresponding to these dates:
#stockPrices=getStockPrices(start_date,end_date,'AAPL')

try:
    existing_data = pd.read_csv(r"C:\Users\Legion\Desktop\FinalYearProject\data\StockPrices.csv")
except FileNotFoundError:
    existing_data = pd.DataFrame()

new_data=getStockPrices(start_date,end_date,'AAPL')

# Appending new data to existing DataFrame
stockPrices = pd.concat([existing_data, new_data])
stockPrices.drop_duplicates(subset='Datetime', keep='first', inplace=True)

turnToCSV(stockPrices,'stockPrices')
stockPrices

2024-03-05
2024-04-09
getting dates for 2024-03-05 00:00:00->2024-03-12 00:00:00
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- AAPL: 1m data not available for startTime=1709589600 and endTime=1710194400. The requested range must be within the last 30 days.
getting dates for 2024-03-12 00:00:00->2024-03-19 00:00:00
[*********************100%***********************]  1 of 1 completed
getting dates for 2024-03-19 00:00:00->2024-03-26 00:00:00
[*********************100%***********************]  1 of 1 completed
getting dates for 2024-03-26 00:00:00->2024-04-02 00:00:00
[*********************100%***********************]  1 of 1 completed
getting dates for 2024-04-02 00:00:00->2024-04-09 00:00:00
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume
0,2024-03-05 14:30:00+00:00,171.000000,171.000000,170.610001,170.919998,170.919998,5958861.0
1,2024-03-05 14:31:00+00:00,171.020004,171.074997,171.020004,171.059998,171.059998,926341.0
2,2024-03-05 14:32:00+00:00,171.335999,171.339996,171.270004,171.300003,171.300003,703908.0
3,2024-03-05 14:33:00+00:00,171.929993,171.970001,171.875000,171.895004,171.895004,994237.0
4,2024-03-05 14:34:00+00:00,171.625000,171.740005,171.580002,171.610001,171.610001,626087.0
...,...,...,...,...,...,...,...
7399,2024-04-08 19:55:00+00:00,168.490005,168.490005,168.338303,168.369995,168.369995,358263.0
7400,2024-04-08 19:56:00+00:00,168.360001,168.414993,168.324997,168.414993,168.414993,231272.0
7401,2024-04-08 19:57:00+00:00,168.410004,168.410004,168.330002,168.375000,168.375000,291370.0
7402,2024-04-08 19:58:00+00:00,168.379303,168.479996,168.354996,168.470001,168.470001,429591.0


Getting the stock prices before and after a certain amount of time corresponding to the publication date.

In [11]:
# First, we need to remove the seconds from the dates since 'yfinance operates on minute-based intervals:
News=removeSeconds(News)

# Adding the stock prices corresponding to the publication date:
News=addStockPrices(stockPrices,News)

# Adding stock prices after 30 minutes of the publication date:
News=addStockPricesAfter(stockPrices,News,30)

News.head(1)

Unnamed: 0,Date,article_title,article,source_name,source_link,stock_price,stock_price_after_30_mins
2,2024-04-08 18:00:00,Could Apple Help You Retire a Millionaire?,Apple (NASDAQ: AAPL) has made a lot of million...,Yahoo Finance,https://finance.yahoo.com/quote/AAPL/news/coul...,168.625,168.720001


Labeling the data.

In [12]:
News=label(News,30)
turnToCSV(News,'NewsWithStockPrice')

News.head(1)

Unnamed: 0,Date,article_title,article,source_name,source_link,stock_price,stock_price_after_30_mins,change_percentage,label
2,2024-04-08 18:00:00,Could Apple Help You Retire a Millionaire?,Apple (NASDAQ: AAPL) has made a lot of million...,Yahoo Finance,https://finance.yahoo.com/quote/AAPL/news/coul...,168.625,168.720001,0.056339,1


In [13]:
# down_threshold = -1 
# up_threshold = 1    

# labels = []
# for index, row in News.iterrows():
#     change_percentage = ((row['stock_price_after_30_mins'] - row['stock_price']) / row['stock_price']) * 100
#     print(change_percentage)
#     if change_percentage < down_threshold:
#         labels.append(-1)
#     elif change_percentage > up_threshold:
#         labels.append(1)
#     else:
#         labels.append(0)


# News['label'] = labels
