# Package Installation Section #

In [2]:
'''
pip install yfinance  # Install the yfinance library
pip install openpyxl
pip install pandas
pip install re
'''

'\npip install yfinance  # Install the yfinance library\npip install openpyxl\npip install pandas\npip install re\n'

# Package Import Section #

In [1]:
# import yfinance as yf
import pandas as pd
from openpyxl import load_workbook
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Download S&P500 Return Index and Volatility Index (VIX) from Yahoo Finance #

In [5]:
# Define the tickers for S&P 500 (^GSPC) and VIX (^VIX)
tickers = ["^GSPC", "^VIX"]

# Download weekly data from Yahoo Finance
data = yf.download(tickers, start="2014-12-01", end="2024-11-30", interval="1wk", group_by="ticker")

# Extract closing prices
sp500_close = data["^GSPC"]["Close"]
vix_close = data["^VIX"]["Close"]

# Calculate weekly returns for S&P 500 (percentage change)
sp500_return = sp500_close.pct_change() * 100

# Combine data into a single DataFrame
sp500_vix = pd.DataFrame({
    "Date": sp500_close.index,
    "SP500_Return (%)": sp500_return,
    "VIX_Close": vix_close
}).set_index("Date")

# Drop rows with NaN values (e.g., first row for returns calculation)
sp500_vix.dropna(inplace=True)

# Save to a CSV file
#combined_data.to_csv("weekly_sp500_vix.csv")

sp500_vix

[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,SP500_Return (%),VIX_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-12-08,-3.519380,21.080000
2014-12-15,3.412022,16.490000
2014-12-22,0.875093,14.500000
2014-12-29,-1.463544,17.790001
2015-01-05,-0.650563,17.549999
...,...,...
2024-10-28,-1.365680,21.879999
2024-11-04,4.656128,14.940000
2024-11-11,-2.083547,16.139999
2024-11-18,1.681589,15.240000


# Import Downloaded Dataset from Refinitiv (weekly from 01/12/2014 to 30/11/2024) #

In [2]:
# Load the workbook
file_path = "Request finish.xlsm"
load_file = load_workbook(file_path, keep_vba=True)

# Access the worksheet
stock_data = load_file['full']

# Extract data from the worksheet into a pandas DataFrame
data = stock_data.values  # Extract the data as a generator of rows
columns = next(data)  # Get the first row as column names
df_stock = pd.DataFrame(data, columns=columns)  # Create DataFrame

# Display the DataFrame
df_stock.head()

Unnamed: 0,Name,Code,2014-01-10 00:00:00,2014-01-17 00:00:00,2014-01-24 00:00:00,2014-01-31 00:00:00,2014-02-07 00:00:00,2014-02-14 00:00:00,2014-02-21 00:00:00,2014-02-28 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,,,,...,36.84,40.01,43.51,42.97,44.86,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,,,,...,36.86,40.01,43.5,42.96,44.86,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,,,,...,36.85,40.0,43.49,42.95,44.85,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,,,,...,37.47,40.29,44.38,42.99,45.07,42.57,58.48,66.0,64.44,67.16
4,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,,,,...,36.59,39.4,42.62,41.65,43.645,41.59,55.3,60.91,61.37,65.47


In [3]:
# Make a copy of df_stock
df = df_stock.copy()

# Transpose the Dataset to Long Format #

In [4]:
# Step 1: delete data before December 2024 to align with the index data

# Define the date range
start_date = "2014-01-10 00:00:00"
end_date = "2014-11-28 00:00:00"

# Separate datetime columns
datetime_cols = pd.to_datetime(df.columns[2:], format="%Y-%m-%d %H:%M:%S", errors='coerce')

# Filter columns outside the specified date range
columns_to_keep = ~datetime_cols.to_series().between(start_date, end_date)

# Keep only the non-datetime columns and filtered datetime columns
df = df.iloc[:, :2].join(df.iloc[:, 2:].loc[:, columns_to_keep])

# Reconstruct the column names: combine non-datetime and filtered datetime columns
#df.columns = list(df.columns[:2]) + list(datetime_cols[columns_to_keep])

# Display the resulting DataFrame
df.head()

Unnamed: 0,Name,Code,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,2015-01-16 00:00:00,2015-01-23 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,,,,...,36.84,40.01,43.51,42.97,44.86,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,,,,...,36.86,40.01,43.5,42.96,44.86,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,,,,...,36.85,40.0,43.49,42.95,44.85,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,,,,...,37.47,40.29,44.38,42.99,45.07,42.57,58.48,66.0,64.44,67.16
4,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,,,,...,36.59,39.4,42.62,41.65,43.645,41.59,55.3,60.91,61.37,65.47


In [5]:
# Step 2: create a new column with only company names and a new column name with only variable names

# Create a new column 'company_name' and set it to None
df['company_name'] = None

# Iterate every 35 rows to extract the company name and forward-fill
for start in range(0, len(df), 35):
    # Extract the company name at the start of each 35-row block
    company_name = df.loc[start, 'Name']

    # Assign this company name to all rows in the current block
    df.loc[start:start+34, 'company_name'] = company_name

# Forward-fill the remaining rows
df['company_name'] = df['company_name'].ffill()

# Create a new column 'var_name' and set it to None
df['var_name'] = None

# Create a list of 35 different variables
variables = ["close_price", "ask_price", "bid_price", "price_high", "price_low", "price_open",
             "turnover_value", "turnover_volume", "number_trades", "vwap", "trading_volume_wa",
             "total_return", "pe", "ptbv", "dividend_yield", "operating_pm", "gross_pm",
             "net_operating_income", "net_income", "roe", "roic", "roa", "ebit", "ebitda", "dpps",
             "current_ratio", "quick_ratio", "inventory_turnover", "asset_turnover",
             "tdce", "ltdce", "interst_cover1", "interst_cover2", "cash_dividend", "shares_outstanding"]

# Iterate over every 35 rows to assign variable names
for start in range(0, len(df), 35):
    # Assign variable names to the 'var_name' column for the current block
    var_names_to_assign = variables[:len(df.loc[start:start+34])]
    df.loc[start:start+34, 'var_name'] = var_names_to_assign

# Move 'company_name' and 'var_name' to the correct positions
df = df[['company_name', 'var_name'] + [col for col in df.columns if col not in ['company_name', 'var_name']]]

# View the resulting DataFrame
df

Unnamed: 0,company_name,var_name,Name,Code,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,close_price,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,,...,36.84,40.01,43.51,42.97,44.860,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A,ask_price,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,,...,36.86,40.01,43.50,42.96,44.860,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A,bid_price,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,,...,36.85,40.00,43.49,42.95,44.850,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A,price_high,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,,...,37.47,40.29,44.38,42.99,45.070,42.57,58.48,66.00,64.44,67.16
4,PALANTIR TECHNOLOGIES A,price_low,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,,...,36.59,39.40,42.62,41.65,43.645,41.59,55.30,60.91,61.37,65.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,MASTERCARD,ltdce,"MASTERCARD, INC. - LONG TERM DEBT % COMMON EQUITY",US57636Q1040(WC08226),22.090,22.090,22.090,22.090,54.610,54.610,...,,,,,,,,,,
6996,MASTERCARD,interst_cover1,"MASTERCARD, INC. - EBIT/TOT INT EXPENSE RATIO",US57636Q1040(WC08291),106.813,106.813,106.813,106.813,82.279,82.279,...,,,,,,,,,,
6997,MASTERCARD,interst_cover2,"MASTERCARD, INC. - INTEREST COVER",US57636Q1040(ICBT),106.810,106.810,106.810,106.810,82.280,82.280,...,24.72,24.72,24.72,24.72,24.720,24.72,24.72,24.72,24.72,24.72
6998,MASTERCARD,cash_dividend,"MASTERCARD, INC. - CASH DIVIDENDS PAID - TOTAL",US57636Q1040(WC04551),515000.000,515000.000,515000.000,515000.000,727000.000,727000.000,...,,,,,,,,,,


In [6]:
# Step 3: identify and count #ERROR occurrences:

# Filter rows where the 'Name' column contains '#ERROR'
error_rows = df[df['Name'] == '#ERROR']

# Group by 'var_name' and count occurrences of '#ERROR'
error_counts = error_rows.groupby('var_name').size().reset_index(name='error_count')

# View the result
error_counts

Unnamed: 0,var_name,error_count
0,current_ratio,43
1,dpps,2
2,gross_pm,30
3,interst_cover1,4
4,inventory_turnover,76
5,ltdce,3
6,net_operating_income,200
7,ptbv,2
8,quick_ratio,43
9,roe,3


In [7]:
# Step 4: drop variables with error_count > 60 (30%) for all companies

# Identify variables with error_count > 60
variables_to_drop = error_counts[error_counts['error_count'] > 60]['var_name']

# Drop these variables from the dataset
df = df[~df['var_name'].isin(variables_to_drop)]

# View the updated DataFrame
df                         # Two variables(inventory_turnover and net_operating_income) are dropped at this stage

Unnamed: 0,company_name,var_name,Name,Code,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,close_price,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,,...,36.84,40.01,43.51,42.97,44.860,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A,ask_price,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,,...,36.86,40.01,43.50,42.96,44.860,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A,bid_price,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,,...,36.85,40.00,43.49,42.95,44.850,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A,price_high,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,,...,37.47,40.29,44.38,42.99,45.070,42.57,58.48,66.00,64.44,67.16
4,PALANTIR TECHNOLOGIES A,price_low,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,,...,36.59,39.40,42.62,41.65,43.645,41.59,55.30,60.91,61.37,65.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,MASTERCARD,ltdce,"MASTERCARD, INC. - LONG TERM DEBT % COMMON EQUITY",US57636Q1040(WC08226),22.090,22.090,22.090,22.090,54.610,54.610,...,,,,,,,,,,
6996,MASTERCARD,interst_cover1,"MASTERCARD, INC. - EBIT/TOT INT EXPENSE RATIO",US57636Q1040(WC08291),106.813,106.813,106.813,106.813,82.279,82.279,...,,,,,,,,,,
6997,MASTERCARD,interst_cover2,"MASTERCARD, INC. - INTEREST COVER",US57636Q1040(ICBT),106.810,106.810,106.810,106.810,82.280,82.280,...,24.72,24.72,24.72,24.72,24.720,24.72,24.72,24.72,24.72,24.72
6998,MASTERCARD,cash_dividend,"MASTERCARD, INC. - CASH DIVIDENDS PAID - TOTAL",US57636Q1040(WC04551),515000.000,515000.000,515000.000,515000.000,727000.000,727000.000,...,,,,,,,,,,


In [8]:
# Step 5: clean the company code column and drop the "Name" and "Code" column

# Reset index to ensure integer-based indexing
df = df.reset_index(drop=True)

# Initialize the 'company_code' column
df['company_code'] = None  # Avoid SettingWithCopyWarning as this directly modifies the DataFrame

# Iterate through the DataFrame in blocks of 33 rows
for start in range(0, len(df), 33):
    # Extract the first value in the current block
    first_value = df.iloc[start, df.columns.get_loc('Code')]  # Use .iloc for integer-based indexing

    # Initialize company_code
    company_code = None

    # Check if first_value is a string and matches the desired pattern
    if isinstance(first_value, str):
        match = re.match(r'([^\(]+)', first_value)  # Match company codes starting with "US"
        if match:
            company_code = match.group(1)

    # Fill the current block (33 rows) with the extracted company code
    df.loc[start:start+32, 'company_code'] = company_code

# Move the 'company_code' column to the second position
columns = list(df.columns)  # Get the list of columns
columns.remove('company_code')  # Remove 'company_code_filled' from the list
columns.insert(1, 'company_code')  # Insert it at the second position
df = df[columns]  # Reorder the DataFrame

# View the result
df

Unnamed: 0,company_name,company_code,var_name,Name,Code,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,US69608A1088,close_price,PALANTIR TECHNOLOGIES A,US69608A1088(P),,,,,,...,36.84,40.01,43.51,42.97,44.860,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A,US69608A1088,ask_price,PALANTIR TECHNOLOGIES A - ASK PRICE,US69608A1088(PA),,,,,,...,36.86,40.01,43.50,42.96,44.860,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A,US69608A1088,bid_price,PALANTIR TECHNOLOGIES A - BID PRICE,US69608A1088(PB),,,,,,...,36.85,40.00,43.49,42.95,44.850,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A,US69608A1088,price_high,PALANTIR TECHNOLOGIES A - PRICE HIGH,US69608A1088(PH),,,,,,...,37.47,40.29,44.38,42.99,45.070,42.57,58.48,66.00,64.44,67.16
4,PALANTIR TECHNOLOGIES A,US69608A1088,price_low,PALANTIR TECHNOLOGIES A - PRICE LOW,US69608A1088(PL),,,,,,...,36.59,39.40,42.62,41.65,43.645,41.59,55.30,60.91,61.37,65.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,MASTERCARD,US57636Q1040,ltdce,"MASTERCARD, INC. - LONG TERM DEBT % COMMON EQUITY",US57636Q1040(WC08226),22.090,22.090,22.090,22.090,54.610,...,,,,,,,,,,
6596,MASTERCARD,US57636Q1040,interst_cover1,"MASTERCARD, INC. - EBIT/TOT INT EXPENSE RATIO",US57636Q1040(WC08291),106.813,106.813,106.813,106.813,82.279,...,,,,,,,,,,
6597,MASTERCARD,US57636Q1040,interst_cover2,"MASTERCARD, INC. - INTEREST COVER",US57636Q1040(ICBT),106.810,106.810,106.810,106.810,82.280,...,24.72,24.72,24.72,24.72,24.720,24.72,24.72,24.72,24.72,24.72
6598,MASTERCARD,US57636Q1040,cash_dividend,"MASTERCARD, INC. - CASH DIVIDENDS PAID - TOTAL",US57636Q1040(WC04551),515000.000,515000.000,515000.000,515000.000,727000.000,...,,,,,,,,,,


In [9]:
# Drop the "Name" and "Code" columns
df = df.drop(columns=['Name', 'Code'])

# View the result
df

Unnamed: 0,company_name,company_code,var_name,2014-12-05 00:00:00,2014-12-12 00:00:00,2014-12-19 00:00:00,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,2015-01-16 00:00:00,...,2024-09-27 00:00:00,2024-10-04 00:00:00,2024-10-11 00:00:00,2024-10-18 00:00:00,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00
0,PALANTIR TECHNOLOGIES A,US69608A1088,close_price,,,,,,,,...,36.84,40.01,43.51,42.97,44.860,41.92,58.39,65.77,64.35,67.08
1,PALANTIR TECHNOLOGIES A,US69608A1088,ask_price,,,,,,,,...,36.86,40.01,43.50,42.96,44.860,41.94,58.44,65.79,64.33,67.09
2,PALANTIR TECHNOLOGIES A,US69608A1088,bid_price,,,,,,,,...,36.85,40.00,43.49,42.95,44.850,41.93,58.43,65.78,64.32,67.06
3,PALANTIR TECHNOLOGIES A,US69608A1088,price_high,,,,,,,,...,37.47,40.29,44.38,42.99,45.070,42.57,58.48,66.00,64.44,67.16
4,PALANTIR TECHNOLOGIES A,US69608A1088,price_low,,,,,,,,...,36.59,39.40,42.62,41.65,43.645,41.59,55.30,60.91,61.37,65.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,MASTERCARD,US57636Q1040,ltdce,22.090,22.090,22.090,22.090,54.610,54.610,54.610,...,,,,,,,,,,
6596,MASTERCARD,US57636Q1040,interst_cover1,106.813,106.813,106.813,106.813,82.279,82.279,82.279,...,,,,,,,,,,
6597,MASTERCARD,US57636Q1040,interst_cover2,106.810,106.810,106.810,106.810,82.280,82.280,82.280,...,24.72,24.72,24.72,24.72,24.720,24.72,24.72,24.72,24.72,24.72
6598,MASTERCARD,US57636Q1040,cash_dividend,515000.000,515000.000,515000.000,515000.000,727000.000,727000.000,727000.000,...,,,,,,,,,,


In [10]:
# Make a copy of df
df_short = df.copy()

df_short['company_name'].nunique()   # Check the number of unique company names

200

In [11]:
df_short['company_code'].nunique()   # Check the number of unique company codes

200

In [12]:
# Step 6: transpose the dataset from short form to long form

# Identify date columns
date_columns = df_short.columns[3:]

# Transform from Short Form to Long Form using melt()
df_tran = df_short.melt(
    id_vars=['company_name', 'company_code', 'var_name'],  # Columns to keep
    value_vars=date_columns,  # Date columns to "unpivot"
    var_name='date',  # New column for date names
    value_name='value'  # Column for values
)

df_tran

Unnamed: 0,company_name,company_code,var_name,date,value
0,PALANTIR TECHNOLOGIES A,US69608A1088,close_price,2014-12-05,
1,PALANTIR TECHNOLOGIES A,US69608A1088,ask_price,2014-12-05,
2,PALANTIR TECHNOLOGIES A,US69608A1088,bid_price,2014-12-05,
3,PALANTIR TECHNOLOGIES A,US69608A1088,price_high,2014-12-05,
4,PALANTIR TECHNOLOGIES A,US69608A1088,price_low,2014-12-05,
...,...,...,...,...,...
3445195,MASTERCARD,US57636Q1040,ltdce,2024-11-29,
3445196,MASTERCARD,US57636Q1040,interst_cover1,2024-11-29,
3445197,MASTERCARD,US57636Q1040,interst_cover2,2024-11-29,24.72
3445198,MASTERCARD,US57636Q1040,cash_dividend,2024-11-29,


In [13]:
# Ensure 'date' column is in datetime format for consistency
df_tran['date'] = pd.to_datetime(df_tran['date'])

# Reshape: Pivot to separate variables (var1, var2, ..., var33) into columns
df_long = df_tran.pivot_table(
    index=['company_name', 'company_code', 'date'],  # Group by company and date
    columns='var_name',  # Pivot on variable names
    values='value'  # Values column
).reset_index()

# Clean column names
df_long.columns.name = None  # Remove column grouping name

# The resulting `df_long` is in long form with desired structure.
df_long

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roa,roe,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap
0,3M,US88579Y1010,2014-12-05,135.6365,1.05,135.6114,2216000.0,135.6699,1.96,2.11,...,15.80,32.38,22.79,635135.0,52.61,5622.58,1782344.0,,14274.8,135.7130
1,3M,US88579Y1010,2014-12-12,131.4896,1.05,131.4812,2216000.0,131.3642,1.96,2.18,...,15.80,32.38,22.79,635135.0,52.61,5444.14,1782344.0,,14913.8,132.0684
2,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,15.80,32.38,22.79,635135.0,52.61,5733.81,1782344.0,1006228.0,21761.2,138.1173
3,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,15.80,32.38,22.79,635135.0,52.61,5760.84,1782344.0,394893.9,8217.5,139.2474
4,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.50,...,15.77,38.95,23.05,609330.0,92.61,5684.61,1821209.0,349564.0,7354.4,136.9276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103205,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,,384.4900,,384.6399,,0.00,...,,,,,,9484.27,,1593302.0,4194.4,384.1338
103206,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3000,,399.0500,,399.3101,,0.00,...,,,,,,9846.00,,798172.1,2052.8,399.8765
103207,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,,385.7100,,385.9099,,0.00,...,,,,,,9515.59,,688635.1,1732.4,386.1726
103208,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,,396.9900,,397.1899,,0.00,...,,,,,,9793.72,,710837.3,1836.0,396.1980


In [14]:
# Check the count of observations for a company (e.g., "PALANTIR TECHNOLOGIES A")

company_name = 'PALANTIR TECHNOLOGIES A'  # Replace with the company name you're interested in

# Filter the dataset to select only rows for the given company
company_data = df_long[df_long['company_name'] == company_name]

# Get the count of observations (rows) for that company
company_observation_count = company_data.shape[0]

print(f"Number of observations for {company_name}: {company_observation_count}")

Number of observations for PALANTIR TECHNOLOGIES A: 361


In [15]:
df_long.groupby('company_name').size().sort_values(ascending=True)  # Observation number ranked in ascending order

company_name
GE VERNOVA                 205
VERALTO                    257
PALANTIR TECHNOLOGIES A    361
CONSTELLATION ENERGY       361
CARRIER GLOBAL             413
                          ... 
F5                         522
FAIR ISAAC                 522
FASTENAL                   522
JOHNSON CONTROLS INTL.     522
ZEBRA TECHNOLOGIES 'A'     522
Length: 200, dtype: int64

In [16]:
df_long.groupby('company_name').size().sum() # Check the total observation number for all companies

103210

# Handling Missing values #

In [17]:
# Make a copy of df_long and check for missing value
df_mis = df_long.copy()

print(df_mis.isna().sum())

company_name              0
company_code              0
date                      0
ask_price              5421
asset_turnover         9052
bid_price              5421
cash_dividend          8747
close_price            2029
current_ratio         29477
dividend_yield         2029
dpps                  22326
ebit                   9370
ebitda                10210
gross_pm              23219
interst_cover1        12714
interst_cover2          210
ltdce                 14061
net_income             8634
number_trades         88264
operating_pm           8634
pe                     9897
price_high             5316
price_low              5316
price_open             5316
ptbv                   2587
quick_ratio           29582
roa                    9848
roe                   14007
roic                   9739
shares_outstanding     9056
tdce                   9056
total_return           2029
trading_volume_wa     11765
turnover_value         2431
turnover_volume        2029
vwap                

In [18]:
# Calculate the missing value percentage for each variable
missing_percentage = df_mis.isna().mean()
# Sort the percentages in descending order and round to 2 decimal places
sorted_missing_percentage = missing_percentage.sort_values(ascending=False).round(2)

# Print the sorted percentages
print(sorted_missing_percentage)

number_trades         0.86
quick_ratio           0.29
current_ratio         0.29
gross_pm              0.22
dpps                  0.22
ltdce                 0.14
roe                   0.14
interst_cover1        0.12
trading_volume_wa     0.11
ebitda                0.10
pe                    0.10
roa                   0.10
roic                  0.09
ebit                  0.09
shares_outstanding    0.09
tdce                  0.09
asset_turnover        0.09
cash_dividend         0.08
net_income            0.08
operating_pm          0.08
ask_price             0.05
bid_price             0.05
vwap                  0.05
price_high            0.05
price_low             0.05
price_open            0.05
ptbv                  0.03
turnover_value        0.02
turnover_volume       0.02
dividend_yield        0.02
close_price           0.02
total_return          0.02
interst_cover2        0.00
date                  0.00
company_code          0.00
company_name          0.00
dtype: float64


In [19]:
# Set a threshold for dropping variables (e.g., 30% missing)
threshold = 0.3

# Drop variables with missing percentage above the threshold
columns_to_drop = missing_percentage[missing_percentage > threshold].index
df_mis = df_mis.drop(columns=columns_to_drop)

print(f"Dropped columns: {columns_to_drop.tolist()}")
print("Remaining columns after dropping:")
print(df_mis.columns)

Dropped columns: ['number_trades']
Remaining columns after dropping:
Index(['company_name', 'company_code', 'date', 'ask_price', 'asset_turnover',
       'bid_price', 'cash_dividend', 'close_price', 'current_ratio',
       'dividend_yield', 'dpps', 'ebit', 'ebitda', 'gross_pm',
       'interst_cover1', 'interst_cover2', 'ltdce', 'net_income',
       'operating_pm', 'pe', 'price_high', 'price_low', 'price_open', 'ptbv',
       'quick_ratio', 'roa', 'roe', 'roic', 'shares_outstanding', 'tdce',
       'total_return', 'trading_volume_wa', 'turnover_value',
       'turnover_volume', 'vwap'],
      dtype='object')


In [20]:
# Function to handle missing data
def handle_missing_data(df_mis):
    # List of variables (columns starting from the 4th column)
    variable_columns = df_mis.columns[3:]

    for var in variable_columns:
        for company in df_mis['company_name'].unique():
            # Filter data for the specific company
            company_mask = df_mis['company_name'] == company
            var_data = df_mis.loc[company_mask, var]

            # Case 1: Entire column for the variable is missing for this company
            if var_data.isna().all():
                # Calculate the mean of the variable across other companies
                mean_value = df_mis.loc[df_mis['company_name'] != company, var].mean()
                df_mis.loc[company_mask, var] = mean_value

            else:
                # Case 2: Missing data at the beginning
                if var_data.isna().iloc[0]:
                    df_mis.loc[company_mask, var] = var_data.bfill()

                # Case 3: Missing data at the end
                if var_data.isna().iloc[-1]:
                    df_mis.loc[company_mask, var] = var_data.ffill()

                # Case 4: Missing data in between (including jumps)
                if var_data.isna().sum() > 0:
                    # Interpolate and assign to the correct company and variable
                    interpolated_values = var_data.interpolate()
                    df_mis.loc[company_mask, var] = interpolated_values

    return df_mis

# Apply the function to your dataset
df_filled = handle_missing_data(df_mis)

# View the result
df_filled

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roa,roe,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap
0,3M,US88579Y1010,2014-12-05,135.6365,1.05,135.6114,2216000.0,135.6699,1.96,2.11,...,15.80,32.38,22.79,635135.0,52.61,5622.58,1782344.0,,14274.8,135.7130
1,3M,US88579Y1010,2014-12-12,131.4896,1.05,131.4812,2216000.0,131.3642,1.96,2.18,...,15.80,32.38,22.79,635135.0,52.61,5444.14,1782344.0,,14913.8,132.0684
2,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,15.80,32.38,22.79,635135.0,52.61,5733.81,1782344.0,1006228.0,21761.2,138.1173
3,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,15.80,32.38,22.79,635135.0,52.61,5760.84,1782344.0,394893.9,8217.5,139.2474
4,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.50,...,15.77,38.95,23.05,609330.0,92.61,5684.61,1821209.0,349564.0,7354.4,136.9276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103205,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,0.67,384.4900,0.0,384.6399,1.05,0.00,...,5.73,10.26,8.01,51379.0,73.12,9484.27,535725.0,1593302.0,4194.4,384.1338
103206,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3000,0.67,399.0500,0.0,399.3101,1.05,0.00,...,5.73,10.26,8.01,51379.0,73.12,9846.00,535725.0,798172.1,2052.8,399.8765
103207,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,0.67,385.7100,0.0,385.9099,1.05,0.00,...,5.73,10.26,8.01,51379.0,73.12,9515.59,535725.0,688635.1,1732.4,386.1726
103208,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,0.67,396.9900,0.0,397.1899,1.05,0.00,...,5.73,10.26,8.01,51379.0,73.12,9793.72,535725.0,710837.3,1836.0,396.1980


In [21]:
# Check for missing values
print(df_filled.isna().sum())

company_name             0
company_code             0
date                     0
ask_price             2065
asset_turnover         418
bid_price             2065
cash_dividend          113
close_price           2029
current_ratio          422
dividend_yield        2029
dpps                  3838
ebit                   160
ebitda                 478
gross_pm                 0
interst_cover1        1296
interst_cover2         210
ltdce                  955
net_income               0
operating_pm             0
pe                    4259
price_high            2031
price_low             2031
price_open            2031
ptbv                  1860
quick_ratio            527
roa                    901
roe                   1266
roic                   949
shares_outstanding     422
tdce                   422
total_return          2029
trading_volume_wa     1534
turnover_value        2431
turnover_volume       2029
vwap                  2031
dtype: int64


In [22]:
# Function to handle remaining missing data
def handle_remaining_missing_data(df_mis):
    '''
    # Step 1: Handle entirely missing variables
    for var in df_mis.columns[3:]:
        if df_mis[var].isna().all():
            # Replace with a default value, e.g., 0
            df_mis[var] = 0  # You can also use df_mis[var].mean() or another value
            '''

    # Step 2: Handle any remaining missing values after specific cases
    for var in df_mis.columns[3:]:
        df_mis[var].fillna(df_mis[var].median(), inplace=True)  # Replace remaining NaNs with median of the column

    return df_mis

# Apply the function to handle remaining missing values
df_filled_final = handle_remaining_missing_data(df_filled)

# Check if any missing values remain
print(df_filled_final.isna().sum())

company_name          0
company_code          0
date                  0
ask_price             0
asset_turnover        0
bid_price             0
cash_dividend         0
close_price           0
current_ratio         0
dividend_yield        0
dpps                  0
ebit                  0
ebitda                0
gross_pm              0
interst_cover1        0
interst_cover2        0
ltdce                 0
net_income            0
operating_pm          0
pe                    0
price_high            0
price_low             0
price_open            0
ptbv                  0
quick_ratio           0
roa                   0
roe                   0
roic                  0
shares_outstanding    0
tdce                  0
total_return          0
trading_volume_wa     0
turnover_value        0
turnover_volume       0
vwap                  0
dtype: int64


# Merge The Two Datasets #

In [24]:
# Importing S&P 500 Return Index and Volatility Index (VIX) from  2014-12-08 until 2024-12-02
sp500_index = pd.read_csv("weekly_sp500_vix.csv")
sp500_index.head()

Unnamed: 0,Date,SP500_Return (%),VIX_Close
0,2014-12-08,-3.51938,21.08
1,2014-12-15,3.412022,16.49
2,2014-12-22,0.875093,14.5
3,2014-12-29,-1.463544,17.790001
4,2015-01-05,-0.650563,17.549999


In [25]:
sp500_index.tail()

Unnamed: 0,Date,SP500_Return (%),VIX_Close
517,2024-11-04,4.656128,14.94
518,2024-11-11,-2.083547,16.139999
519,2024-11-18,1.681589,15.24
520,2024-11-25,1.056064,13.51
521,2024-12-02,0.290101,13.3


In [26]:
# Move all dates in the "sp500_index" dataset 3 days backward to align with our main dataset
# Ensure the 'Date' column is in datetime format
sp500_index['Date'] = pd.to_datetime(sp500_index['Date'])

# Subtract 3 days from the 'Date' column
sp500_index['Date'] = sp500_index['Date'] - pd.Timedelta(days=3)

# View the modified DataFrame
sp500_index.head()

Unnamed: 0,Date,SP500_Return (%),VIX_Close
0,2014-12-05,-3.51938,21.08
1,2014-12-12,3.412022,16.49
2,2014-12-19,0.875093,14.5
3,2014-12-26,-1.463544,17.790001
4,2015-01-02,-0.650563,17.549999


In [27]:
sp500_index.tail()

Unnamed: 0,Date,SP500_Return (%),VIX_Close
517,2024-11-01,4.656128,14.94
518,2024-11-08,-2.083547,16.139999
519,2024-11-15,1.681589,15.24
520,2024-11-22,1.056064,13.51
521,2024-11-29,0.290101,13.3


In [28]:
# Merge the two datasets

# Ensure the 'Date' columns in both datasets are datetime types
df_filled_final['date'] = pd.to_datetime(df_filled_final['date'])
sp500_index['Date'] = pd.to_datetime(sp500_index['Date'])

# Merge the datasets on the date column
merged_df = df_filled_final.merge(
    sp500_index.rename(columns={'Date': 'date'}),  # Rename for alignment
    on='date',
    how='left'  # Keep all rows in df_filled_final
)

# Display the merged dataset
merged_df.head()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap,SP500_Return (%),VIX_Close
0,3M,US88579Y1010,2014-12-05,135.6365,1.05,135.6114,2216000.0,135.6699,1.96,2.11,...,22.79,635135.0,52.61,5622.58,1782344.0,849750.1,14274.8,135.713,-3.51938,21.08
1,3M,US88579Y1010,2014-12-12,131.4896,1.05,131.4812,2216000.0,131.3642,1.96,2.18,...,22.79,635135.0,52.61,5444.14,1782344.0,849750.1,14913.8,132.0684,3.412022,16.49
2,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,22.79,635135.0,52.61,5733.81,1782344.0,1006228.0,21761.2,138.1173,0.875093,14.5
3,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,22.79,635135.0,52.61,5760.84,1782344.0,394893.9,8217.5,139.2474,-1.463544,17.790001
4,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.5,...,23.05,609330.0,92.61,5684.61,1821209.0,349564.0,7354.4,136.9276,-0.650563,17.549999


In [29]:
merged_df.tail()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap,SP500_Return (%),VIX_Close
103205,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,0.67,384.49,0.0,384.6399,1.05,0.0,...,8.01,51379.0,73.12,9484.27,535725.0,1593302.0,4194.4,384.1338,4.656128,14.94
103206,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3,0.67,399.05,0.0,399.3101,1.05,0.0,...,8.01,51379.0,73.12,9846.0,535725.0,798172.1,2052.8,399.8765,-2.083547,16.139999
103207,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,0.67,385.71,0.0,385.9099,1.05,0.0,...,8.01,51379.0,73.12,9515.59,535725.0,688635.1,1732.4,386.1726,1.681589,15.24
103208,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,0.67,396.99,0.0,397.1899,1.05,0.0,...,8.01,51379.0,73.12,9793.72,535725.0,710837.3,1836.0,396.198,1.056064,13.51
103209,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-29,406.99,0.67,406.6201,0.0,407.0,1.05,0.0,...,8.01,51379.0,73.12,10035.61,535725.0,492827.2,1223.5,406.876,0.290101,13.3


In [30]:
# Check again for missing values after the merge
print(merged_df.isna().sum())

company_name          0
company_code          0
date                  0
ask_price             0
asset_turnover        0
bid_price             0
cash_dividend         0
close_price           0
current_ratio         0
dividend_yield        0
dpps                  0
ebit                  0
ebitda                0
gross_pm              0
interst_cover1        0
interst_cover2        0
ltdce                 0
net_income            0
operating_pm          0
pe                    0
price_high            0
price_low             0
price_open            0
ptbv                  0
quick_ratio           0
roa                   0
roe                   0
roic                  0
shares_outstanding    0
tdce                  0
total_return          0
trading_volume_wa     0
turnover_value        0
turnover_volume       0
vwap                  0
SP500_Return (%)      0
VIX_Close             0
dtype: int64


In [31]:
# Descriptive Analysis
# (to be done)

# Data Transformation, Standardization and Split #

In [32]:
# Check data types of all columns
print(merged_df.dtypes)

company_name                  object
company_code                  object
date                  datetime64[ns]
ask_price                    float64
asset_turnover               float64
bid_price                    float64
cash_dividend                float64
close_price                  float64
current_ratio                float64
dividend_yield               float64
dpps                         float64
ebit                         float64
ebitda                       float64
gross_pm                     float64
interst_cover1               float64
interst_cover2               float64
ltdce                        float64
net_income                   float64
operating_pm                 float64
pe                           float64
price_high                   float64
price_low                    float64
price_open                   float64
ptbv                         float64
quick_ratio                  float64
roa                          float64
roe                          float64
r

In [33]:
# Create a new column "weekly_return" based on close price, "return_close" will be our target feature

# Ensure the dataset is sorted by company and date
df_sorted = merged_df.sort_values(by=['company_name', 'date']).copy()

# Calculate weekly stock returns for each company
df_sorted['weekly_return'] = df_sorted.groupby('company_name')['close_price'].pct_change()

# Drop rows with NaN in 'return_close'
df_return = df_sorted.dropna(subset=['weekly_return'])

# Display the first few rows to verify
df_return.head()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap,SP500_Return (%),VIX_Close,weekly_return
1,3M,US88579Y1010,2014-12-12,131.4896,1.05,131.4812,2216000.0,131.3642,1.96,2.18,...,635135.0,52.61,5444.14,1782344.0,849750.1,14913.8,132.0684,3.412022,16.49,-0.031737
2,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,635135.0,52.61,5733.81,1782344.0,1006228.0,21761.2,138.1173,0.875093,14.5,0.053207
3,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,635135.0,52.61,5760.84,1782344.0,394893.9,8217.5,139.2474,-1.463544,17.790001,0.004714
4,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.5,...,609330.0,92.61,5684.61,1821209.0,349564.0,7354.4,136.9276,-0.650563,17.549999,-0.013233
5,3M,US88579Y1010,2015-01-09,135.2352,0.94,135.2018,2561000.0,135.1265,1.54,2.54,...,609330.0,92.61,5600.06,1821209.0,703335.1,18945.1,135.4806,-1.241681,20.950001,-0.014872


In [34]:
df_return.tail()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap,SP500_Return (%),VIX_Close,weekly_return
103205,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,0.67,384.49,0.0,384.6399,1.05,0.0,...,51379.0,73.12,9484.27,535725.0,1593302.0,4194.4,384.1338,4.656128,14.94,0.068177
103206,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3,0.67,399.05,0.0,399.3101,1.05,0.0,...,51379.0,73.12,9846.0,535725.0,798172.1,2052.8,399.8765,-2.083547,16.139999,0.03814
103207,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,0.67,385.71,0.0,385.9099,1.05,0.0,...,51379.0,73.12,9515.59,535725.0,688635.1,1732.4,386.1726,1.681589,15.24,-0.033558
103208,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,0.67,396.99,0.0,397.1899,1.05,0.0,...,51379.0,73.12,9793.72,535725.0,710837.3,1836.0,396.198,1.056064,13.51,0.02923
103209,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-29,406.99,0.67,406.6201,0.0,407.0,1.05,0.0,...,51379.0,73.12,10035.61,535725.0,492827.2,1223.5,406.876,0.290101,13.3,0.024699


In [35]:
# Make a copy of df_return
stock = df_return.copy()

In [36]:
# Transform categorical feature (company_code)

# Ensure 'date' is datetime
stock['date'] = pd.to_datetime(stock['date'])

# Store original index order
original_index = stock.index

# Transform company_code
(company_codes, idx) = np.unique(stock['company_code'], return_index=True)
company_codes = [po for (i,po) in sorted(list(zip(idx,company_codes)))]

encode = OneHotEncoder(categories=[company_codes], drop = 'first')
comcode_encoded = encode.fit_transform(np.array(stock['company_code']).reshape(-1,1)).toarray()

for i, co in enumerate(company_codes[1:], start=1):  # Skipping the first company_code due to 'drop=first'
    stock[co] = comcode_encoded[:, i-1]

# Sort the data by original index to restore original order
stock = stock.loc[original_index]

stock

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,US9311421039,US2546871060,US94106L1098,US9497461015,US95040Q1040,US9581021055,US9694571004,IE00BDB6Q211,US3848021040,US9892071054
1,3M,US88579Y1010,2014-12-12,131.4896,1.05,131.4812,2216000.0,131.3642,1.96,2.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3M,US88579Y1010,2015-01-09,135.2352,0.94,135.2018,2561000.0,135.1265,1.54,2.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103205,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,0.67,384.4900,0.0,384.6399,1.05,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
103206,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3000,0.67,399.0500,0.0,399.3101,1.05,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
103207,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,0.67,385.7100,0.0,385.9099,1.05,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
103208,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,0.67,396.9900,0.0,397.1899,1.05,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [37]:
# Create lagged features (lag 1 period/1 week)

# Make a copy of the encoded data
stock_lag1 = stock.copy()

In [38]:
# List variables that will have lagged features
vars = [ "close_price", "ask_price", "bid_price", "price_high", "price_low", "price_open",
         "turnover_value", "turnover_volume", "vwap", "trading_volume_wa",
         "total_return", "pe", "ptbv", "dividend_yield", "operating_pm", "gross_pm",
         "net_income", "roe", "roic", "roa", "ebit", "ebitda", "dpps",
         "current_ratio", "quick_ratio", "asset_turnover",
         "tdce", "ltdce", "interst_cover1", "interst_cover2", "cash_dividend", "shares_outstanding",
         "SP500_Return (%)", "VIX_Close", "weekly_return"]

# Define lag period
lag_period = 1

# Function to create lagged features for a single stock's data
def create_lagged_features(group, lag_period, vars):
    for var in vars:
        group[f'{var}_lag{lag_period}'] = group[var].shift(lag_period)
    return group

# Apply the lagged feature creation function to each stock's data
stock_lag1 = stock_lag1.groupby('company_name').apply(create_lagged_features, lag_period=lag_period, vars=vars)
stock_lag1 = stock_lag1.reset_index(drop=True)

stock_lag1

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,asset_turnover_lag1,tdce_lag1,ltdce_lag1,interst_cover1_lag1,interst_cover2_lag1,cash_dividend_lag1,shares_outstanding_lag1,SP500_Return (%)_lag1,VIX_Close_lag1,weekly_return_lag1
0,3M,US88579Y1010,2014-12-12,131.4896,1.05,131.4812,2216000.0,131.3642,1.96,2.18,...,,,,,,,,,,
1,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,1.05,52.61,51.80,60.923,69.88,2216000.0,635135.0,3.412022,16.490000,-0.031737
2,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,1.05,52.61,51.80,60.923,69.88,2216000.0,635135.0,0.875093,14.500000,0.053207
3,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.50,...,1.05,52.61,51.80,60.923,69.88,2216000.0,635135.0,-1.463544,17.790001,0.004714
4,3M,US88579Y1010,2015-01-09,135.2352,0.94,135.2018,2561000.0,135.1265,1.54,2.54,...,0.94,92.61,75.15,52.591,58.34,2561000.0,609330.0,-0.650563,17.549999,-0.013233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103005,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,0.67,384.4900,0.0,384.6399,1.05,0.00,...,0.67,73.12,67.42,3.511,3.51,0.0,51379.0,-1.365680,21.879999,-0.040911
103006,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3000,0.67,399.0500,0.0,399.3101,1.05,0.00,...,0.67,73.12,67.42,3.511,3.51,0.0,51379.0,4.656128,14.940000,0.068177
103007,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,0.67,385.7100,0.0,385.9099,1.05,0.00,...,0.67,73.12,67.42,3.511,3.51,0.0,51379.0,-2.083547,16.139999,0.038140
103008,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,0.67,396.9900,0.0,397.1899,1.05,0.00,...,0.67,73.12,67.42,3.511,3.51,0.0,51379.0,1.681589,15.240000,-0.033558


In [39]:
# Drop rows with NaN values due to lagging
stock_lag1.dropna(inplace=True)
stock_lag1

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,asset_turnover_lag1,tdce_lag1,ltdce_lag1,interst_cover1_lag1,interst_cover2_lag1,cash_dividend_lag1,shares_outstanding_lag1,SP500_Return (%)_lag1,VIX_Close_lag1,weekly_return_lag1
1,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,1.05,52.61,51.80,60.923,69.88,2216000.0,635135.0,3.412022,16.490000,-0.031737
2,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,1.05,52.61,51.80,60.923,69.88,2216000.0,635135.0,0.875093,14.500000,0.053207
3,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.50,...,1.05,52.61,51.80,60.923,69.88,2216000.0,635135.0,-1.463544,17.790001,0.004714
4,3M,US88579Y1010,2015-01-09,135.2352,0.94,135.2018,2561000.0,135.1265,1.54,2.54,...,0.94,92.61,75.15,52.591,58.34,2561000.0,609330.0,-0.650563,17.549999,-0.013233
5,3M,US88579Y1010,2015-01-16,135.4275,0.94,135.4108,2561000.0,135.4442,1.54,2.53,...,0.94,92.61,75.15,52.591,58.34,2561000.0,609330.0,-1.241681,20.950001,-0.014872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103005,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,0.67,384.4900,0.0,384.6399,1.05,0.00,...,0.67,73.12,67.42,3.511,3.51,0.0,51379.0,-1.365680,21.879999,-0.040911
103006,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3000,0.67,399.0500,0.0,399.3101,1.05,0.00,...,0.67,73.12,67.42,3.511,3.51,0.0,51379.0,4.656128,14.940000,0.068177
103007,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,0.67,385.7100,0.0,385.9099,1.05,0.00,...,0.67,73.12,67.42,3.511,3.51,0.0,51379.0,-2.083547,16.139999,0.038140
103008,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,0.67,396.9900,0.0,397.1899,1.05,0.00,...,0.67,73.12,67.42,3.511,3.51,0.0,51379.0,1.681589,15.240000,-0.033558


In [40]:
# standardize data (target column 'weekly_return' is standardized as well)
X_cols = ["close_price_lag1", "ask_price_lag1", "bid_price_lag1", "price_high_lag1", "price_low_lag1", "price_open_lag1",
         "turnover_value_lag1", "turnover_volume_lag1", "vwap_lag1", "trading_volume_wa_lag1",
         "total_return_lag1", "pe_lag1", "ptbv_lag1", "dividend_yield_lag1", "operating_pm_lag1", "gross_pm_lag1",
         "net_income_lag1", "roe_lag1", "roic_lag1", "roa_lag1", "ebit_lag1", "ebitda_lag1", "dpps_lag1",
         "current_ratio_lag1", "quick_ratio_lag1", "asset_turnover_lag1",
          "tdce_lag1", "ltdce_lag1", "interst_cover1_lag1", "interst_cover2_lag1", "cash_dividend_lag1", "shares_outstanding_lag1",
         "SP500_Return (%)_lag1", "VIX_Close_lag1", "weekly_return_lag1"]

y_col = ['weekly_return']

X = stock_lag1[X_cols]
y = stock_lag1[y_col]

scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = StandardScaler()
y_scaled= scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

stock_lag1_scaled = stock_lag1.copy()

for i in range(len(X_cols)):
    stock_lag1_scaled[X_cols[i]] = X_scaled[:,i]

stock_lag1_scaled[y_col[0]] = y_scaled

stock_lag1_scaled

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,asset_turnover_lag1,tdce_lag1,ltdce_lag1,interst_cover1_lag1,interst_cover2_lag1,cash_dividend_lag1,shares_outstanding_lag1,SP500_Return (%)_lag1,VIX_Close_lag1,weekly_return_lag1
1,3M,US88579Y1010,2014-12-19,138.3705,1.05,138.3621,2216000.0,138.3537,1.96,2.48,...,0.701259,-0.013393,-0.151646,0.211969,0.260927,0.552046,-0.164760,1.356102,-0.220277,-0.772645
2,3M,US88579Y1010,2014-12-26,139.0477,1.05,139.0226,2216000.0,139.0059,1.96,2.47,...,0.701259,-0.013393,-0.151646,0.211969,0.260927,0.552046,-0.164760,0.271328,-0.501640,1.101534
3,3M,US88579Y1010,2015-01-02,137.1414,0.94,137.1163,2561000.0,137.1665,1.54,2.50,...,0.701259,-0.013393,-0.151646,0.211969,0.260927,0.552046,-0.164760,-0.728657,-0.036471,0.031594
4,3M,US88579Y1010,2015-01-09,135.2352,0.94,135.2018,2561000.0,135.1265,1.54,2.54,...,0.523145,0.006993,-0.132386,0.167499,0.199469,0.718940,-0.174142,-0.381032,-0.070405,-0.364375
5,3M,US88579Y1010,2015-01-16,135.4275,0.94,135.4108,2561000.0,135.4442,1.54,2.53,...,0.523145,0.006993,-0.132386,0.167499,0.199469,0.718940,-0.174142,-0.633790,0.410316,-0.400558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103005,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-01,384.6499,0.67,384.4900,0.0,384.6399,1.05,0.00,...,0.085954,-0.002940,-0.138762,-0.094452,-0.092538,-0.519941,-0.376989,-0.686811,0.541807,-0.975059
103006,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-08,399.3000,0.67,399.0500,0.0,399.3101,1.05,0.00,...,0.085954,-0.002940,-0.138762,-0.094452,-0.092538,-0.519941,-0.376989,1.888073,-0.439429,1.431824
103007,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-15,385.8899,0.67,385.7100,0.0,385.9099,1.05,0.00,...,0.085954,-0.002940,-0.138762,-0.094452,-0.092538,-0.519941,-0.376989,-0.993766,-0.269763,0.769100
103008,ZEBRA TECHNOLOGIES 'A',US9892071054,2024-11-22,397.1899,0.67,396.9900,0.0,397.1899,1.05,0.00,...,0.085954,-0.002940,-0.138762,-0.094452,-0.092538,-0.519941,-0.376989,0.616181,-0.397012,-0.812840
