In [None]:
import pandas as pd
import os
import glob
import yfinance as yf
import numpy as np

Importing the collected news data from Google Drive


In [None]:
# Merge every per-download news CSV into one file on Drive.

# Directory containing the individual news CSV files
directory = '/content/drive/MyDrive/Data_msc/news/'

# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# merged row order (and any later order-sensitive text concatenation) is reproducible.
csv_files = sorted(glob.glob(os.path.join(directory, '*.csv')))

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {directory}")

# Read each CSV file and collect the resulting dataframes
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Stack all dataframes; ignore_index avoids duplicated row labels in merged_df
merged_df = pd.concat(dfs, ignore_index=True)

# Path and filename for the merged CSV file (outside `directory`, so it is
# never picked up by the glob above on a re-run)
merged_file = '/content/drive/MyDrive/Data_msc/merged_file.csv'

# Save the merged dataframe without the row index
merged_df.to_csv(merged_file, index=False)
print("CSV files merged successfully!")


In [None]:
# Load the merged news file written by the merge step
data = pd.read_csv(
    '/content/drive/MyDrive/Data_msc/merged_file.csv'
)

In [None]:
# Row count per distinct value of the 'ticker' column
data['ticker'].value_counts()

Removing unnecessary data by filtering on topic

In [None]:
# Topics to keep; rows with any other 'topic' value are discarded
topics = ['Economy - Fiscal', 'Economy - Monetary', 'IPO',
          'Real Estate & Construction', 'Mergers & Acquisitions',
          'Energy & Transportation', 'Finance','Financial Markets']

# .copy() makes the filtered frame independent of the original, so the column
# assignments performed on `data` afterwards neither trigger
# SettingWithCopyWarning nor depend on view-vs-copy semantics.
data = data[data['topic'].isin(topics)].copy()


In [None]:
# Parse the compact YYYYMMDD values, then store them back as 'YYYY-MM-DD' strings
data['Date'] = (
    pd.to_datetime(data['Date'], format='%Y%m%d')
    .dt.strftime('%Y-%m-%d')
)


In [None]:
# Prepend the title to the description so both end up in one text field.
# Missing values are replaced with '' first: str + NaN yields NaN, and a NaN
# (float) entry would make the later ' '.join(...) aggregation raise TypeError.
data['Description'] = data['Title'].fillna('') + ' ' + data['Description'].fillna('')
# The title now lives inside 'Description', so the separate column is dropped
data=data.drop('Title',axis=1)


In [None]:
# One row per (Date, ticker): all 'Description' texts of that day and ticker
# joined with single spaces. groupby already returns the result indexed by
# ('Date', 'ticker'), so no reset_index/set_index round-trip is needed.
data_n = data.groupby(['Date', 'ticker']).agg({'Description': ' '.join})


In [None]:
# --- Historical price download (yfinance) ---

# Symbols and sampling window for the request
ticker_symbols = [
    "AAPL", "GOOG", "AMZN", "TSLA", "MSFT",
    "JNJ", "AMD", "IVZ", "PFE",
]
start_date = "2022-01-01"
end_date = "2023-06-01"
interval = "1d"

# Ask for prices adjusted for stock splits and dividend payments
auto_adjust = True

# A single history() call on the multi-ticker object covers every symbol
tickers = yf.Tickers(ticker_symbols)
tickers_df = tickers.history(
    start=start_date,
    end=end_date,
    interval=interval,
    auto_adjust=auto_adjust,
)

# Quick look at what came back
print(tickers_df.shape)
tickers_df.head(20)

In [None]:
# Move column level 1 into the row index and name the two row levels,
# giving one row per (Date, Ticker) pair
transformed_df = (
    tickers_df
    .stack(level=1)
    .rename_axis(['Date', 'Ticker'])
)
transformed_df.head(10)

In [None]:
# Price/volume fields carried forward
cols = ['Close', 'Open', 'Volume']

# Independent copy restricted to those fields, so columns added to stock_df
# later do not touch transformed_df
stock_df = transformed_df.loc[:, cols].copy()


def calculate_log_change(x):
    """Return the change in log(x) from each element's predecessor.

    Element i of the result is log(x[i]) - log(x[i-1]); the first element
    has no predecessor and is therefore NaN.
    """
    return np.log(x).diff()

def create_binary_variable(x):
    """Flag non-negative values: 1 where x >= 0, otherwise 0.

    Works on scalars and array-likes via np.where. NaN maps to 0 because
    the comparison NaN >= 0 is False.
    """
    return np.where(x >= 0, 1, 0)

# Maps a single return value to the Up (2), Stable (1) or Down (0) class
def create_multiclass(x, threshold=0.005):
    """Classify a scalar return as Up (2), Stable (1) or Down (0).

    Parameters
    ----------
    x : float
        The value to classify (a log return in this notebook).
    threshold : float, default 0.005
        Half-width of the "Stable" band. Values >= threshold are Up,
        values <= -threshold are Down, everything strictly in between is Stable.

    Returns
    -------
    int
        2, 1 or 0.

    Notes
    -----
    NaN fails both comparisons and is therefore returned as 1 (Stable).
    """
    if x >= threshold:
        return 2
    if x <= -threshold:
        return 0
    return 1

# Log returns and log volume changes, computed separately for each ticker.
# transform() is used instead of apply(): it always returns a result indexed
# exactly like stock_df, whereas apply() with a same-length result prepends the
# group key to the index on pandas >= 2.0 and the column assignment then fails.
stock_df['log_ret'] = stock_df.groupby(level='Ticker')['Close'].transform(calculate_log_change)
stock_df['log_volume_change'] = stock_df.groupby(level='Ticker')['Volume'].transform(calculate_log_change)

# Binary direction flags: 1 if the change is >= 0, 0 otherwise.
# create_binary_variable is built on np.where, so it is called once on the whole
# column rather than element by element.
# NOTE(review): the first row of each ticker has a NaN change and is flagged 0.
stock_df['log_ret_binary'] = create_binary_variable(stock_df['log_ret'])
stock_df['log_volume_change_binary'] = create_binary_variable(stock_df['log_volume_change'])

# Multiclass target from the log return: Up (2), Stable (1), Down (0).
# NOTE(review): NaN log returns (first row of each ticker) fall into class 1.
stock_df['target'] = stock_df['log_ret'].apply(create_multiclass)

stock_df.head(20)

In [None]:
# Row count per value of the 'target' column
stock_df['target'].value_counts()

In [None]:
# Rebuild the (Date, Ticker) index with 'Date' passed through pd.to_datetime
stock_df = (
    stock_df
    .reset_index()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index(['Date', 'Ticker'])
)
stock_df