In [77]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import pytz

In [78]:
def removeSeconds(df):
    """Parse the 'Date' strings of ``df`` and truncate them to the minute.

    Every value in ``df['Date']`` must be a string in the layout
    ``YYYY-MM-DD HH:MM:SS<utc offset>``. The column is overwritten in place
    with the parsed, offset-aware datetimes whose seconds are set to zero,
    and the same frame is returned.
    """
    timestamp_pattern = '%Y-%m-%d %H:%M:%S%z'
    df['Date'] = [
        datetime.strptime(raw_value, timestamp_pattern).replace(second=0)
        for raw_value in df['Date']
    ]
    return df

In [79]:
def convertTimezone(timezone, df, name):
    """Convert the timestamps of ``df`` to ``timezone`` and return ``df``.

    Parameters
    ----------
    timezone : str
        Target time-zone name understood by pandas, e.g. ``'UTC'``.
    df : pandas.DataFrame
        Frame to convert; it is modified in place.
    name : str
        Column holding the timestamps when they are not stored in the index.

    Notes
    -----
    The previous guard, ``df.index is not None``, was always true, so the
    column branch could never run and frames that kept their timestamps in
    column ``name`` had their (non-datetime) index overwritten instead.
    The index is now converted when it is a ``DatetimeIndex`` or when no
    column called ``name`` exists (the old behaviour); otherwise column
    ``name`` is converted and the index is left untouched.

    Values are parsed with ``utc=True``, so naive timestamps are interpreted
    as UTC before being converted to ``timezone``.
    """
    index_holds_timestamps = isinstance(df.index, pd.DatetimeIndex)

    if index_holds_timestamps or name not in df.columns:
        df.index = pd.to_datetime(df.index, utc=True).tz_convert(timezone)
    else:
        df[name] = pd.to_datetime(df[name], utc=True).dt.tz_convert(timezone)
    return df

In [80]:
def getStockPrices(start_date, end_date, ticker="AAPL", interval="1m", chunk_days=7):
    """Download intraday prices between two dates and return them as one frame.

    The range ``[start_date, end_date)`` is fetched in consecutive windows of
    at most ``chunk_days`` days (presumably because Yahoo limits how much
    1-minute history a single request may cover -- see the yfinance docs).
    Each window is passed through ``convertTimezone('UTC', ...)`` and the
    pieces are concatenated in chronological order.

    Parameters
    ----------
    start_date, end_date : str
        Dates formatted ``YYYY-MM-DD``. ``end_date`` is handed to
        ``yf.download`` as ``end``; yfinance documents that bound as exclusive.
    ticker : str, default "AAPL"
        Symbol to download.
    interval : str, default "1m"
        Bar size forwarded to ``yf.download``.
    chunk_days : int, default 7
        Maximum number of days requested per call.

    Returns
    -------
    pandas.DataFrame
        All downloaded rows with the timestamp index moved into a regular
        column by ``reset_index()``.

    Raises
    ------
    ValueError
        If ``chunk_days`` is smaller than 1, or if ``start_date`` is not
        strictly earlier than ``end_date`` (previously this surfaced as an
        opaque "No objects to concatenate" ValueError from ``pd.concat``).
    """
    if chunk_days < 1:
        raise ValueError("chunk_days must be at least 1")

    range_start = datetime.strptime(start_date, "%Y-%m-%d")
    range_end = datetime.strptime(end_date, "%Y-%m-%d")
    if range_start >= range_end:
        raise ValueError(
            f"start_date ({start_date}) must be earlier than end_date ({end_date})"
        )

    frames = []
    window_start = range_start
    while window_start < range_end:
        # The last window is clipped so it never runs past the requested end.
        window_end = min(window_start + timedelta(days=chunk_days), range_end)
        chunk = yf.download(ticker, start=window_start, end=window_end, interval=interval)
        frames.append(convertTimezone('UTC', chunk, 'Datetime'))
        window_start = window_end

    return pd.concat(frames).reset_index()


In [81]:
def addStockPrices(stockPrices, df):
    """Attach to each row of ``df`` the price quoted at its 'Date' timestamp.

    Parameters
    ----------
    stockPrices : pandas.DataFrame
        Must contain a 'Datetime' column and an 'Adj Close' column. It is
        only read; earlier versions rewrote its 'Datetime' column to strings.
    df : pandas.DataFrame
        Must contain a 'Date' column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with 'Date' rewritten as ``'%Y-%m-%d %H:%M:%S'``
        strings (in UTC) and a new 'stock_price' column holding the
        'Adj Close' of the first price row with exactly the same timestamp,
        or a missing value when there is none.

    Notes
    -----
    Both timestamp columns are parsed with ``utc=True`` so that values
    carrying a UTC offset are compared as instants rather than as wall-clock
    text; naive values are interpreted as UTC. The lookup is a single
    ``Series.map`` instead of a scan of every price row per article.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    price_keys = pd.to_datetime(stockPrices['Datetime'], utc=True).dt.strftime(timestamp_format)
    price_by_timestamp = pd.Series(
        stockPrices['Adj Close'].to_numpy(), index=price_keys.to_numpy()
    )
    # Keep the first quote per timestamp, matching the former `.values[0]`.
    price_by_timestamp = price_by_timestamp[~price_by_timestamp.index.duplicated(keep='first')]

    df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.strftime(timestamp_format)
    df['stock_price'] = df['Date'].map(price_by_timestamp, na_action='ignore')
    return df

In [82]:
def addStockPricesAfter(stockPrices, df, time):
    """Attach to each row of ``df`` the price quoted ``time`` minutes after 'Date'.

    Parameters
    ----------
    stockPrices : pandas.DataFrame
        Must contain a 'Datetime' column and an 'Adj Close' column. It is
        only read; earlier versions rewrote its 'Datetime' column to strings.
    df : pandas.DataFrame
        Must contain a 'Date' column. Modified in place.
    time : int
        Look-ahead in minutes; also used in the name of the new column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with 'Date' rewritten as ``'%Y-%m-%d %H:%M:%S'``
        strings (in UTC) and a new ``stock_price_after_<time>_mins`` column
        holding the 'Adj Close' of the first price row whose timestamp equals
        'Date' + ``time`` minutes, or a missing value when there is none.

    Notes
    -----
    The previous implementation stored the shifted timestamp back into
    'Date', so the publication time was lost and a second call with another
    horizon measured from the already-shifted value. The shift is now applied
    to a temporary lookup key only. Timestamps are parsed with ``utc=True`` so
    offset-aware values are compared as instants; naive values are treated as
    UTC.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    price_keys = pd.to_datetime(stockPrices['Datetime'], utc=True).dt.strftime(timestamp_format)
    price_by_timestamp = pd.Series(
        stockPrices['Adj Close'].to_numpy(), index=price_keys.to_numpy()
    )
    # Keep the first quote per timestamp, matching the former `.values[0]`.
    price_by_timestamp = price_by_timestamp[~price_by_timestamp.index.duplicated(keep='first')]

    published_at = pd.to_datetime(df['Date'], utc=True)
    lookup_keys = (published_at + timedelta(minutes=time)).dt.strftime(timestamp_format)

    df['Date'] = published_at.dt.strftime(timestamp_format)
    df[f'stock_price_after_{time}_mins'] = lookup_keys.map(price_by_timestamp, na_action='ignore')
    return df

In [83]:
def label(df, time=30):
    """Add the percentage price change and a direction label to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'stock_price' and ``stock_price_after_<time>_mins``.
        Modified in place.
    time : int, default 30
        Horizon in minutes selecting which ``stock_price_after_<time>_mins``
        column to compare against; the default keeps the former hard-coded
        30-minute behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself with two new columns:

        * 'change_percentage' -- ``(after - before) / before * 100``
        * 'label' -- ``1`` when the later price is higher, ``-1`` when it is
          lower, ``0`` otherwise (including rows where a price is missing).
    """
    before = df['stock_price']
    after = df[f'stock_price_after_{time}_mins']

    df['change_percentage'] = (after - before) / before * 100
    # Boolean masks turned into +1 / -1 contributions; equal or missing -> 0.
    df['label'] = (after > before).astype('int64') - (after < before).astype('int64')
    return df

In [84]:
def turnToCSV(df,name):
    """Write ``df`` without its index to ``<name>.csv`` in the project's data folder."""
    destination = fr'C:\Users\Legion\Desktop\FinalYearProject\data\{name}.csv'
    df.to_csv(destination, index=False)

In [85]:
# Load the news articles; the cells below read the 'Date' column of this frame as strings.
News=pd.read_csv(r'C:\Users\Legion\Desktop\FinalYearProject\data\News.csv')


In [86]:
# Download prices covering every day that has news, then cache them as CSV.
# The date part is the text before the first space of each 'Date' string.
start_date = News['Date'].min().split()[0]
# yfinance treats `end` as exclusive, so using the last news day itself would
# skip that day's prices (and make start == end when all news share one day).
# Extend the range by one day so the final day is included.
last_news_day = datetime.strptime(News['Date'].max().split()[0], '%Y-%m-%d')
end_date = (last_news_day + timedelta(days=1)).strftime('%Y-%m-%d')
stockPrices = getStockPrices(start_date, end_date)
turnToCSV(stockPrices, 'stockPrices')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [87]:
# Parse the 'Date' strings into datetimes and zero their seconds field.
News=removeSeconds(News)

In [88]:
# Add the 'stock_price' column, then discard articles for which no price row was matched.
News=addStockPrices(stockPrices,News)
News= News.dropna(subset=['stock_price'])

In [89]:
# Add the price 30 minutes later, then discard articles for which no such price row was matched.
News=addStockPricesAfter(stockPrices,News,30)
News= News.dropna(subset=['stock_price_after_30_mins'])

In [90]:
# Compute 'change_percentage' and the up/down/flat 'label', then save the result as CSV.
News=label(News)
turnToCSV(News,'NewsWithStockPrice')

In [91]:
# down_threshold = -1 
# up_threshold = 1    

# labels = []
# for index, row in News.iterrows():
#     change_percentage = ((row['stock_price_after_30_mins'] - row['stock_price']) / row['stock_price']) * 100
#     print(change_percentage)
#     if change_percentage < down_threshold:
#         labels.append(-1)
#     elif change_percentage > up_threshold:
#         labels.append(1)
#     else:
#         labels.append(0)


# News['label'] = labels
