In [118]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import pytz

In [119]:
def convertTimezone(timezone, df,name):
    """
    Converting timestamps in to a specified timezone.

    Arguments:
        timezone (str): Target timezone to convert timestamps to.
        df : DataFrame containing a column of timestamp strings.
        name: name of the timestamp column

    Returns a dataFrame with timestamps converted to the specified timezone.
    """

    target_tz = pytz.timezone(timezone)

    if df.index is not None:
        # Convert the index to a timezone-aware index
        df.index = pd.to_datetime(df.index, utc=True).tz_convert(timezone)

    else:
        # Convert timestamp column to datetime objects
        df[name] = pd.to_datetime(df[name])
        df[name] = df[name].dt.tz_convert(target_tz)
    return df

In [120]:
def getStockPrices(start_date, end_date, df, ticker="AAPL", interval="1m", chunk_days=7):
    """
    Download intraday prices between two dates, one request per window of days.

    The range is walked in windows of ``chunk_days`` days, with the last
    window shortened so that it stops at ``end_date``. Each window is fetched
    with ``yf.download``, its timestamps are converted to UTC through
    ``convertTimezone``, and the pieces are concatenated.

    Arguments:
        start_date (str): First day to request, formatted 'YYYY-MM-DD'.
        end_date (str): Day at which the range stops, formatted 'YYYY-MM-DD'.
            It is passed unchanged as the ``end`` of the final request.
        df: Not used by this function; kept so existing calls keep working.
        ticker (str): Symbol passed to ``yf.download``. Defaults to "AAPL".
        interval (str): Bar interval passed to ``yf.download``. Defaults to "1m".
        chunk_days (int): Number of days covered by each request. Defaults to 7.
            NOTE(review): presumably chosen to satisfy a provider limit on
            1-minute data; confirm before raising it.

    Returns a single DataFrame with all downloaded rows; the timestamp index
    of the downloads is moved into a regular column by ``reset_index``.

    Raises:
        ValueError: if ``chunk_days`` is smaller than 1, or if ``start_date``
            is not earlier than ``end_date`` (there would be nothing to fetch).
    """
    if chunk_days < 1:
        raise ValueError("chunk_days must be at least 1")

    window_start = datetime.strptime(start_date, "%Y-%m-%d")
    range_end = datetime.strptime(end_date, "%Y-%m-%d")
    if window_start >= range_end:
        # Without this guard pd.concat would fail on an empty list with a
        # message that says nothing about the dates.
        raise ValueError(
            f"start_date ({start_date}) must be earlier than end_date ({end_date})"
        )

    step = timedelta(days=chunk_days)
    chunks = []
    while window_start < range_end:
        # Clamp the final window so it never runs past the requested range.
        window_end = min(window_start + step, range_end)
        chunk = yf.download(ticker, start=window_start, end=window_end, interval=interval)
        chunks.append(convertTimezone('UTC', chunk, 'Datetime'))
        window_start = window_end

    # Concatenate the list of dataframes into a single dataframe
    return pd.concat(chunks).reset_index()


In [121]:
def removeSeconds(df):
    """
    Truncate every value of the 'Date' column to the whole minute.

    String values are parsed with the format '%Y-%m-%d %H:%M:%S%z'. Values
    that are already datetime objects are used as they are, so calling the
    function again on its own output (for example when the notebook cell is
    re-run) does not fail.

    Arguments:
        df : DataFrame with a 'Date' column holding timestamp strings in the
            format above, or datetime objects.

    Returns the same DataFrame object; its 'Date' column is replaced in place
    by datetime values whose seconds (and microseconds) are zero.
    """
    # Deliberately not called `format`, which would shadow the builtin.
    timestamp_format = '%Y-%m-%d %H:%M:%S%z'

    def _to_whole_minute(value):
        # Parse only when needed; strptime rejects non-string input.
        if isinstance(value, datetime):
            moment = value
        else:
            moment = datetime.strptime(value, timestamp_format)
        return moment.replace(second=0, microsecond=0)

    df['Date'] = [_to_whole_minute(value) for value in df['Date']]
    return df

In [122]:
# Load the news dataset that the following cells work on.
df = pd.read_csv(filepath_or_buffer=r'C:\Users\Legion\Desktop\FinalYearProject\data\News.csv')

In [123]:
# The first whitespace-separated token of each 'Date' string is its calendar day.
start_date = df['Date'].min().split()[0]
last_news_day = df['Date'].max().split()[0]
# yfinance treats `end` as exclusive, so ask for the range up to the day AFTER
# the last news item. Otherwise that day's prices are never downloaded, and a
# dataset covering a single day produces an empty range.
end_date = (datetime.strptime(last_news_day, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
stockPrices=getStockPrices(start_date,end_date,df)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [124]:
# Show the combined price table returned by getStockPrices.
stockPrices

Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume
0,2024-03-05 14:30:00+00:00,171.000000,171.000000,170.610001,170.919998,170.919998,5958861
1,2024-03-05 14:31:00+00:00,171.020004,171.074997,171.020004,171.059998,171.059998,926341
2,2024-03-05 14:32:00+00:00,171.335999,171.339996,171.270004,171.300003,171.300003,703908
3,2024-03-05 14:33:00+00:00,171.929993,171.970001,171.875000,171.895004,171.895004,994237
4,2024-03-05 14:34:00+00:00,171.625000,171.740005,171.580002,171.610001,171.610001,626087
...,...,...,...,...,...,...,...
6981,2024-03-28 19:56:00+00:00,171.774994,171.860001,171.699997,171.835007,171.835007,257474
6982,2024-03-28 19:57:00+00:00,171.845001,171.990005,171.800003,171.945007,171.945007,255920
6983,2024-03-28 19:58:00+00:00,171.940002,171.949997,171.860001,171.860001,171.860001,292888
6984,2024-03-28 19:59:00+00:00,171.860001,171.899994,171.259995,171.520004,171.520004,1669914


In [125]:
# Save the downloaded prices; index=False keeps the row index out of the file.
stockPrices.to_csv(
    path_or_buf=r'C:\Users\Legion\Desktop\FinalYearProject\data\StockPrices.csv',
    index=False,
)

In [126]:
# Zero the seconds of every news timestamp in the 'Date' column.
df = removeSeconds(df=df)

In [127]:
# Bring both timestamp columns to the same representation: UTC wall-clock text
# in the form 'YYYY-MM-DD HH:MM:SS'. The price timestamps are UTC, so the news
# timestamps are converted to UTC as well. Formatting them without the
# conversion would compare an article's local time against UTC price bars
# whenever the article carries a non-UTC offset.
stockPrices['Datetime'] = pd.to_datetime(stockPrices['Datetime'], utc=True).dt.strftime('%Y-%m-%d %H:%M:%S')
df['Date'] = pd.to_datetime(df['Date'], utc=True).dt.strftime('%Y-%m-%d %H:%M:%S')

# One 'Adj Close' per timestamp; if a timestamp repeats, the first row wins.
adj_close_by_timestamp = (
    stockPrices
    .drop_duplicates(subset='Datetime', keep='first')
    .set_index('Datetime')['Adj Close']
)

# Look up every publication timestamp in a single pass instead of scanning the
# whole price table once per news row. Timestamps without a matching price bar
# end up as missing values.
stock_prices = df['Date'].map(adj_close_by_timestamp)

# Add the stock prices to the News dataset as a new column
df['price_on_publication_date'] = stock_prices



In [128]:
# Save the news rows together with the looked-up prices (rows without a price included).
df.to_csv(
    path_or_buf=r'C:\Users\Legion\Desktop\FinalYearProject\data\test2.csv',
    index=False,
)

In [129]:
# Keep only the news rows for which a price was found.
has_price = df['price_on_publication_date'].notna()
df = df.loc[has_price]

In [130]:
# Save the filtered news rows, again without the row index.
df.to_csv(
    path_or_buf=r'C:\Users\Legion\Desktop\FinalYearProject\data\test3.csv',
    index=False,
)