# Package Installation Section #

In [1]:
# import yfinance as yf
import pandas as pd
from openpyxl import load_workbook
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Download S&P500 Return Index and Volatility Index (VIX) from Yahoo Finance (Updated) #

In [None]:
# Define the tickers for S&P 500 (^GSPC) and VIX (^VIX)
tickers = ["^GSPC", "^VIX"]

# Download weekly data from Yahoo Finance
data = yf.download(tickers, start="2014-12-26", end="2024-12-31", interval="1wk", group_by="ticker")

# Extract closing prices
sp500_close = data["^GSPC"]["Close"]
vix_close = data["^VIX"]["Close"]

# Calculate weekly returns for S&P 500 (percentage change)
sp500_return = sp500_close.pct_change() * 100

# Combine data into a single DataFrame
sp500_vix = pd.DataFrame({
    "Date": sp500_close.index,
    "SP500_Return (%)": sp500_return,
    "VIX_Close": vix_close
}).set_index("Date")

# Drop rows with NaN values (e.g., first row for returns calculation)
sp500_vix.dropna(inplace=True)

# Save to a CSV file
# sp500_vix.to_csv("weekly_sp500_vix_wk1.csv")

sp500_vix

# Import Updated Stock Data for 30 Selected Stocks #

In [5]:
# Load the workbook
file_path = "Request table_2024.12.30.xlsm"
load_file = load_workbook(file_path, keep_vba=True)

# Access the worksheet
stock_data = load_file['Week_1']

# Extract data from the worksheet into a pandas DataFrame
data = stock_data.values  # Extract the data as a generator of rows
columns = next(data)  # Get the first row as column names
df_30 = pd.DataFrame(data, columns=columns)  # Create DataFrame

# Display the DataFrame
df_30

Unnamed: 0,Name,Code,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,2015-01-16 00:00:00,2015-01-23 00:00:00,2015-01-30 00:00:00,2015-02-06 00:00:00,2015-02-13 00:00:00,...,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00,2024-12-06 00:00:00,2024-12-13 00:00:00,2024-12-20 00:00:00,2024-12-27 00:00:00
0,COPART,US2172041061(P),4.5875,4.5725,4.5150,4.3762,4.6125,4.5750,4.7087,4.7712,...,51.71,51.4900,56.12,56.670,62.7000,63.39,62.080,61.23,58.22,58.46
1,COPART - ASK PRICE,US2172041061(PA),4.5875,4.5725,4.5150,4.3762,4.6150,4.5750,4.7087,4.7712,...,51.72,51.4900,56.13,56.680,62.6800,63.39,62.070,61.20,58.23,58.48
2,COPART - BID PRICE,US2172041061(PB),4.5862,4.5700,4.5137,4.3737,4.6125,4.5737,4.7075,4.7700,...,51.71,51.4800,56.11,56.670,62.6600,63.38,62.050,61.19,58.22,58.46
3,COPART - PRICE HIGH,US2172041061(PH),4.6225,4.6025,4.5525,4.3950,4.6287,4.6250,4.7275,4.7712,...,52.05,51.8800,56.57,57.360,62.9000,64.06,62.430,62.08,58.98,58.85
4,COPART - PRICE LOW,US2172041061(PL),4.5781,4.5125,4.5075,4.3175,4.5619,4.5537,4.6687,4.7262,...,51.52,51.0028,55.80,56.455,59.0501,63.36,61.708,61.09,58.09,58.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,CINTAS - LONG TERM DEBT % COMMON EQUITY,US1729081059(WC08226),67.27,67.2700,67.2700,67.2700,67.2700,67.2700,67.2700,67.2700,...,,,,,,,,,,
956,CINTAS - EBIT/TOT INT EXPENSE RATIO,US1729081059(WC08291),11.017,11.0170,11.0170,11.0170,11.0170,11.0170,11.0170,11.0170,...,,,,,,,,,,
957,CINTAS - INTEREST COVER,US1729081059(ICBT),11.02,11.0200,11.0200,11.0200,11.0200,11.0200,11.0200,11.0200,...,20.59,20.5900,20.59,20.590,20.5900,20.59,20.590,20.59,20.59,20.59
958,CINTAS - CASH DIVIDENDS PAID - TOTAL,US1729081059(WC04551),201891,201891.0000,201891.0000,201891.0000,201891.0000,201891.0000,201891.0000,201891.0000,...,,,,,,,,,,


# Transpose the Dataset to Long Format #

In [6]:
# Step 1: create a new column with only company names and a new column name with only variable names

df = df_30.copy()  #Make a copy of df_30

# Create a new column 'company_name' and set it to None
df['company_name'] = None

# Iterate every 32 rows to extract the company name and forward-fill
for start in range(0, len(df), 32):
    # Extract the company name at the start of each 32-row block
    company_name = df.loc[start, 'Name']

    # Assign this company name to all rows in the current block
    df.loc[start:start+31, 'company_name'] = company_name

# Forward-fill the remaining rows
df['company_name'] = df['company_name'].ffill()

# Create a new column 'var_name' and set it to None
df['var_name'] = None

# Create a list of 32 different variables
variables = ["close_price", "ask_price", "bid_price", "price_high", "price_low", "price_open",
             "turnover_value", "turnover_volume", "vwap", "trading_volume_wa",
             "total_return", "pe", "ptbv", "dividend_yield", "operating_pm", "gross_pm",
             "net_income", "roe", "roic", "roa", "ebit", "ebitda", "dpps",
             "current_ratio", "quick_ratio", "asset_turnover",
             "tdce", "ltdce", "interest_cover1", "interest_cover2", "cash_dividend", "shares_outstanding"]

# Iterate over every 32 rows to assign variable names
for start in range(0, len(df), 32):
    # Assign variable names to the 'var_name' column for the current block
    var_names_to_assign = variables[:len(df.loc[start:start+31])]
    df.loc[start:start+31, 'var_name'] = var_names_to_assign

# Move 'company_name' and 'var_name' to the correct positions
df = df[['company_name', 'var_name'] + [col for col in df.columns if col not in ['company_name', 'var_name']]]

# View the resulting DataFrame
df

Unnamed: 0,company_name,var_name,Name,Code,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,2015-01-16 00:00:00,2015-01-23 00:00:00,2015-01-30 00:00:00,...,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00,2024-12-06 00:00:00,2024-12-13 00:00:00,2024-12-20 00:00:00,2024-12-27 00:00:00
0,COPART,close_price,COPART,US2172041061(P),4.5875,4.5725,4.5150,4.3762,4.6125,4.5750,...,51.71,51.4900,56.12,56.670,62.7000,63.39,62.080,61.23,58.22,58.46
1,COPART,ask_price,COPART - ASK PRICE,US2172041061(PA),4.5875,4.5725,4.5150,4.3762,4.6150,4.5750,...,51.72,51.4900,56.13,56.680,62.6800,63.39,62.070,61.20,58.23,58.48
2,COPART,bid_price,COPART - BID PRICE,US2172041061(PB),4.5862,4.5700,4.5137,4.3737,4.6125,4.5737,...,51.71,51.4800,56.11,56.670,62.6600,63.38,62.050,61.19,58.22,58.46
3,COPART,price_high,COPART - PRICE HIGH,US2172041061(PH),4.6225,4.6025,4.5525,4.3950,4.6287,4.6250,...,52.05,51.8800,56.57,57.360,62.9000,64.06,62.430,62.08,58.98,58.85
4,COPART,price_low,COPART - PRICE LOW,US2172041061(PL),4.5781,4.5125,4.5075,4.3175,4.5619,4.5537,...,51.52,51.0028,55.80,56.455,59.0501,63.36,61.708,61.09,58.09,58.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,CINTAS,ltdce,CINTAS - LONG TERM DEBT % COMMON EQUITY,US1729081059(WC08226),67.27,67.2700,67.2700,67.2700,67.2700,67.2700,...,,,,,,,,,,
956,CINTAS,interest_cover1,CINTAS - EBIT/TOT INT EXPENSE RATIO,US1729081059(WC08291),11.017,11.0170,11.0170,11.0170,11.0170,11.0170,...,,,,,,,,,,
957,CINTAS,interest_cover2,CINTAS - INTEREST COVER,US1729081059(ICBT),11.02,11.0200,11.0200,11.0200,11.0200,11.0200,...,20.59,20.5900,20.59,20.590,20.5900,20.59,20.590,20.59,20.59,20.59
958,CINTAS,cash_dividend,CINTAS - CASH DIVIDENDS PAID - TOTAL,US1729081059(WC04551),201891,201891.0000,201891.0000,201891.0000,201891.0000,201891.0000,...,,,,,,,,,,


In [7]:
# Step 2: identify and count #ERROR occurrences:

# Filter rows where the 'Name' column contains '#ERROR'
error_rows = df[df['Name'] == '#ERROR']

# Group by 'var_name' and count occurrences of '#ERROR'
error_counts = error_rows.groupby('var_name').size().reset_index(name='error_count')

# View the result
error_counts

Unnamed: 0,var_name,error_count
0,current_ratio,1
1,gross_pm,1
2,interest_cover1,2
3,ptbv,2
4,quick_ratio,1
5,trading_volume_wa,1


In [8]:
# Step 3: clean the company code column and drop the "Name" and "Code" column

# Reset index to ensure integer-based indexing
df = df.reset_index(drop=True)

# Initialize the 'company_code' column
df['company_code'] = None  # Avoid SettingWithCopyWarning as this directly modifies the DataFrame

# Iterate through the DataFrame in blocks of 32 rows
for start in range(0, len(df), 32):
    # Extract the first value in the current block
    first_value = df.iloc[start, df.columns.get_loc('Code')]  # Use .iloc for integer-based indexing

    # Initialize company_code
    company_code = None

    # Check if first_value is a string and matches the desired pattern
    if isinstance(first_value, str):
        match = re.match(r'([^\(]+)', first_value)  # Match company codes
        if match:
            company_code = match.group(1)

    # Fill the current block (32 rows) with the extracted company code
    df.loc[start:start + 31, 'company_code'] = company_code

# Move the 'company_code' column to the second position
columns = list(df.columns)  # Get the list of columns
columns.remove('company_code')  # Remove 'company_code_filled' from the list
columns.insert(1, 'company_code')  # Insert it at the second position
df = df[columns]  # Reorder the DataFrame

# View the result
df

Unnamed: 0,company_name,company_code,var_name,Name,Code,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,2015-01-16 00:00:00,2015-01-23 00:00:00,...,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00,2024-12-06 00:00:00,2024-12-13 00:00:00,2024-12-20 00:00:00,2024-12-27 00:00:00
0,COPART,US2172041061,close_price,COPART,US2172041061(P),4.5875,4.5725,4.5150,4.3762,4.6125,...,51.71,51.4900,56.12,56.670,62.7000,63.39,62.080,61.23,58.22,58.46
1,COPART,US2172041061,ask_price,COPART - ASK PRICE,US2172041061(PA),4.5875,4.5725,4.5150,4.3762,4.6150,...,51.72,51.4900,56.13,56.680,62.6800,63.39,62.070,61.20,58.23,58.48
2,COPART,US2172041061,bid_price,COPART - BID PRICE,US2172041061(PB),4.5862,4.5700,4.5137,4.3737,4.6125,...,51.71,51.4800,56.11,56.670,62.6600,63.38,62.050,61.19,58.22,58.46
3,COPART,US2172041061,price_high,COPART - PRICE HIGH,US2172041061(PH),4.6225,4.6025,4.5525,4.3950,4.6287,...,52.05,51.8800,56.57,57.360,62.9000,64.06,62.430,62.08,58.98,58.85
4,COPART,US2172041061,price_low,COPART - PRICE LOW,US2172041061(PL),4.5781,4.5125,4.5075,4.3175,4.5619,...,51.52,51.0028,55.80,56.455,59.0501,63.36,61.708,61.09,58.09,58.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,CINTAS,US1729081059,ltdce,CINTAS - LONG TERM DEBT % COMMON EQUITY,US1729081059(WC08226),67.27,67.2700,67.2700,67.2700,67.2700,...,,,,,,,,,,
956,CINTAS,US1729081059,interest_cover1,CINTAS - EBIT/TOT INT EXPENSE RATIO,US1729081059(WC08291),11.017,11.0170,11.0170,11.0170,11.0170,...,,,,,,,,,,
957,CINTAS,US1729081059,interest_cover2,CINTAS - INTEREST COVER,US1729081059(ICBT),11.02,11.0200,11.0200,11.0200,11.0200,...,20.59,20.5900,20.59,20.590,20.5900,20.59,20.590,20.59,20.59,20.59
958,CINTAS,US1729081059,cash_dividend,CINTAS - CASH DIVIDENDS PAID - TOTAL,US1729081059(WC04551),201891,201891.0000,201891.0000,201891.0000,201891.0000,...,,,,,,,,,,


In [9]:
# Drop the "Name" and "Code" columns
df = df.drop(columns=['Name', 'Code'])

# View the result
df

Unnamed: 0,company_name,company_code,var_name,2014-12-26 00:00:00,2015-01-02 00:00:00,2015-01-09 00:00:00,2015-01-16 00:00:00,2015-01-23 00:00:00,2015-01-30 00:00:00,2015-02-06 00:00:00,...,2024-10-25 00:00:00,2024-11-01 00:00:00,2024-11-08 00:00:00,2024-11-15 00:00:00,2024-11-22 00:00:00,2024-11-29 00:00:00,2024-12-06 00:00:00,2024-12-13 00:00:00,2024-12-20 00:00:00,2024-12-27 00:00:00
0,COPART,US2172041061,close_price,4.5875,4.5725,4.5150,4.3762,4.6125,4.5750,4.7087,...,51.71,51.4900,56.12,56.670,62.7000,63.39,62.080,61.23,58.22,58.46
1,COPART,US2172041061,ask_price,4.5875,4.5725,4.5150,4.3762,4.6150,4.5750,4.7087,...,51.72,51.4900,56.13,56.680,62.6800,63.39,62.070,61.20,58.23,58.48
2,COPART,US2172041061,bid_price,4.5862,4.5700,4.5137,4.3737,4.6125,4.5737,4.7075,...,51.71,51.4800,56.11,56.670,62.6600,63.38,62.050,61.19,58.22,58.46
3,COPART,US2172041061,price_high,4.6225,4.6025,4.5525,4.3950,4.6287,4.6250,4.7275,...,52.05,51.8800,56.57,57.360,62.9000,64.06,62.430,62.08,58.98,58.85
4,COPART,US2172041061,price_low,4.5781,4.5125,4.5075,4.3175,4.5619,4.5537,4.6687,...,51.52,51.0028,55.80,56.455,59.0501,63.36,61.708,61.09,58.09,58.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,CINTAS,US1729081059,ltdce,67.27,67.2700,67.2700,67.2700,67.2700,67.2700,67.2700,...,,,,,,,,,,
956,CINTAS,US1729081059,interest_cover1,11.017,11.0170,11.0170,11.0170,11.0170,11.0170,11.0170,...,,,,,,,,,,
957,CINTAS,US1729081059,interest_cover2,11.02,11.0200,11.0200,11.0200,11.0200,11.0200,11.0200,...,20.59,20.5900,20.59,20.590,20.5900,20.59,20.590,20.59,20.59,20.59
958,CINTAS,US1729081059,cash_dividend,201891,201891.0000,201891.0000,201891.0000,201891.0000,201891.0000,201891.0000,...,,,,,,,,,,


In [10]:
# Make a copy of df
df_short = df.copy()

df_short['company_name'].nunique()

30

In [11]:
df_short['company_code'].nunique()

30

In [12]:
# Step 4: transpose the dataset from short form to long form

# Identify date columns
date_columns = df_short.columns[3:]

# Transform from Short Form to Long Form using melt()
df_tran = df_short.melt(
    id_vars=['company_name', 'company_code', 'var_name'],  # Columns to keep
    value_vars=date_columns,  # Date columns to "unpivot"
    var_name='date',  # New column for date names
    value_name='value'  # Column for values
)

df_tran

Index([2014-12-26 00:00:00, 2015-01-02 00:00:00, 2015-01-09 00:00:00,
       2015-01-16 00:00:00, 2015-01-23 00:00:00, 2015-01-30 00:00:00,
       2015-02-06 00:00:00, 2015-02-13 00:00:00, 2015-02-20 00:00:00,
       2015-02-27 00:00:00,
       ...
       2024-10-25 00:00:00, 2024-11-01 00:00:00, 2024-11-08 00:00:00,
       2024-11-15 00:00:00, 2024-11-22 00:00:00, 2024-11-29 00:00:00,
       2024-12-06 00:00:00, 2024-12-13 00:00:00, 2024-12-20 00:00:00,
       2024-12-27 00:00:00],
      dtype='object', length=523)

In [17]:
# Convert non-numeric data
df_tran['value'] = pd.to_numeric(df_tran['value'], errors='coerce')

In [18]:
# Ensure 'date' column is in datetime format for consistency
df_tran['date'] = pd.to_datetime(df_tran['date'])

# Reshape: Pivot to separate variables (var1, var2, ..., var32) into columns
df_long = df_tran.pivot_table(
    index=['company_name', 'company_code', 'date'],  # Group by company and date
    columns='var_name',  # Pivot on variable names
    values='value',  # Values column
).reset_index()

# Clean column names
df_long.columns.name = None  # Remove column grouping name

# The resulting `df_long` is in long form with desired structure.
df_long

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roa,roe,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap
0,ALTRIA GROUP,US02209S1033,2014-12-26,50.60,0.52,50.59,3892000.0,50.60,0.90,4.11,...,16.20,141.82,30.85,1971475.0,487.49,106763.80,1513931.0,1039317.0,20650.8,50.6200
1,ALTRIA GROUP,US02209S1033,2015-01-02,48.97,0.58,48.95,4179000.0,48.97,0.86,4.25,...,17.22,177.50,34.37,1960059.0,448.58,103324.60,1773556.0,941558.1,18881.5,48.9720
2,ALTRIA GROUP,US02209S1033,2015-01-09,50.61,0.58,50.60,4179000.0,50.60,0.86,4.11,...,17.22,177.50,34.37,1960059.0,448.58,106763.80,1773556.0,1532743.0,30814.5,50.7540
3,ALTRIA GROUP,US02209S1033,2015-01-16,53.05,0.58,53.03,4179000.0,53.05,0.86,3.92,...,17.22,177.50,34.37,1960059.0,448.58,111933.20,1773556.0,1985785.0,38240.8,52.7794
4,ALTRIA GROUP,US02209S1033,2015-01-23,54.20,0.58,54.19,4179000.0,54.19,0.86,3.84,...,17.22,177.50,34.37,1960059.0,448.58,114338.50,1773556.0,1407894.0,25988.2,54.3862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15051,VERALTO,US92338C1036,2024-11-29,108.23,,108.20,,108.19,,0.33,...,,,,,,134.75,,719080.7,6645.2,108.1607
15052,VERALTO,US92338C1036,2024-12-06,106.53,,106.52,,106.53,,0.34,...,,,,,,132.68,,562336.4,5278.2,106.2803
15053,VERALTO,US92338C1036,2024-12-13,104.23,,104.22,,104.22,,0.35,...,,,,,,129.80,,483038.6,4566.4,104.4287
15054,VERALTO,US92338C1036,2024-12-20,104.23,,104.22,,104.25,,0.42,...,,,,,,129.84,,1312855.0,12711.4,103.9957


In [19]:
df_long.groupby('company_name').size().sort_values(ascending=True)

company_name
GE VERNOVA                209
VERALTO                   261
CROWDSTRIKE HOLDINGS A    465
RESMED                    523
QUANTA SERVICES           523
O REILLY AUTOMOTIVE       523
NVR                       523
MOTOROLA SOLUTIONS        523
MOODY'S                   523
META PLATFORMS A          523
MCKESSON                  523
MASTERCARD                523
IRON MOUNTAIN             523
INTUITIVE SURGICAL        523
HOME DEPOT                523
ALTRIA GROUP              523
TESLA                     523
FORTINET                  523
FAIR ISAAC                523
ENTERGY                   523
DECKERS OUTDOOR           523
COPART                    523
CINTAS                    523
CHIPOTLE MEXN.GRILL       523
CBRE GROUP CLASS A        523
ARISTA NETWORKS           523
ARCH CAP.GP.              523
AMPHENOL 'A'              523
HILTON WORLDWIDE HDG.     523
GODADDY CL.A              523
dtype: int64

In [20]:
df_long.groupby('company_name').size().sum()

15056

# Handling Missing values #

In [21]:
# Make a copy of df_long and check for missing value
df_mis = df_long.copy()

print(df_mis.isna().sum())

company_name             0
company_code             0
date                     0
ask_price             1023
asset_turnover        1514
bid_price             1023
cash_dividend         1411
close_price            553
current_ratio         1985
dividend_yield         553
dpps                  3019
ebit                  1461
ebitda                1776
gross_pm              1880
interest_cover1       3135
interest_cover2        101
ltdce                 3552
net_income            1409
operating_pm          1409
pe                    1628
price_high            1023
price_low             1023
price_open            1023
ptbv                  1116
quick_ratio           2090
roa                   1672
roe                   3291
roic                  1724
shares_outstanding    1514
tdce                  1514
total_return           553
trading_volume_wa     2027
turnover_value         553
turnover_volume        553
vwap                  1023
dtype: int64


In [22]:
# Calculate the missing value percentage for each variable
missing_percentage = df_mis.isna().mean()
# Sort the percentages in descending order and round to 2 decimal places
sorted_missing_percentage = missing_percentage.sort_values(ascending=False).round(2)

# Print the sorted percentages
print(sorted_missing_percentage)

ltdce                 0.24
roe                   0.22
interest_cover1       0.21
dpps                  0.20
quick_ratio           0.14
trading_volume_wa     0.13
current_ratio         0.13
gross_pm              0.12
ebitda                0.12
roic                  0.11
roa                   0.11
pe                    0.11
asset_turnover        0.10
tdce                  0.10
shares_outstanding    0.10
ebit                  0.10
cash_dividend         0.09
net_income            0.09
operating_pm          0.09
ptbv                  0.07
bid_price             0.07
ask_price             0.07
vwap                  0.07
price_high            0.07
price_low             0.07
price_open            0.07
turnover_volume       0.04
dividend_yield        0.04
close_price           0.04
total_return          0.04
turnover_value        0.04
interest_cover2       0.01
date                  0.00
company_code          0.00
company_name          0.00
dtype: float64


In [23]:
# Function to handle missing data
def handle_missing_data(df_mis):
    # List of variables (columns starting from the 4th column)
    variable_columns = df_mis.columns[3:]

    for var in variable_columns:
        for company in df_mis['company_name'].unique():
            # Filter data for the specific company
            company_mask = df_mis['company_name'] == company
            var_data = df_mis.loc[company_mask, var]

            # Case 1: Entire column for the variable is missing for this company
            if var_data.isna().all():
                # Calculate the mean of the variable across other companies
                mean_value = df_mis.loc[df_mis['company_name'] != company, var].mean()
                df_mis.loc[company_mask, var] = mean_value

            else:
                # Case 2: Missing data at the beginning
                if var_data.isna().iloc[0]:
                    df_mis.loc[company_mask, var] = var_data.bfill()

                # Case 3: Missing data at the end
                if var_data.isna().iloc[-1]:
                    df_mis.loc[company_mask, var] = var_data.ffill()

                # Case 4: Missing data in between (including jumps)
                if var_data.isna().sum() > 0:
                    # Interpolate and assign to the correct company and variable
                    interpolated_values = var_data.interpolate()
                    df_mis.loc[company_mask, var] = interpolated_values

    return df_mis

# Apply the function to your dataset
df_filled = handle_missing_data(df_mis)

# View the result
df_filled

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roa,roe,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap
0,ALTRIA GROUP,US02209S1033,2014-12-26,50.60,0.52,50.59,3892000.0,50.60,0.90,4.11,...,16.20,141.82,30.85,1971475.0,487.49,106763.80,1513931.0,1039317.0,20650.8,50.6200
1,ALTRIA GROUP,US02209S1033,2015-01-02,48.97,0.58,48.95,4179000.0,48.97,0.86,4.25,...,17.22,177.50,34.37,1960059.0,448.58,103324.60,1773556.0,941558.1,18881.5,48.9720
2,ALTRIA GROUP,US02209S1033,2015-01-09,50.61,0.58,50.60,4179000.0,50.60,0.86,4.11,...,17.22,177.50,34.37,1960059.0,448.58,106763.80,1773556.0,1532743.0,30814.5,50.7540
3,ALTRIA GROUP,US02209S1033,2015-01-16,53.05,0.58,53.03,4179000.0,53.05,0.86,3.92,...,17.22,177.50,34.37,1960059.0,448.58,111933.20,1773556.0,1985785.0,38240.8,52.7794
4,ALTRIA GROUP,US02209S1033,2015-01-23,54.20,0.58,54.19,4179000.0,54.19,0.86,3.84,...,17.22,177.50,34.37,1960059.0,448.58,114338.50,1773556.0,1407894.0,25988.2,54.3862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15051,VERALTO,US92338C1036,2024-11-29,108.23,0.88,108.20,0.0,108.19,1.64,0.33,...,16.40,36.34,23.77,246400.0,190.09,134.75,1297328.0,719080.7,6645.2,108.1607
15052,VERALTO,US92338C1036,2024-12-06,106.53,0.88,106.52,0.0,106.53,1.64,0.34,...,16.40,36.34,23.77,246400.0,190.09,132.68,1297328.0,562336.4,5278.2,106.2803
15053,VERALTO,US92338C1036,2024-12-13,104.23,0.88,104.22,0.0,104.22,1.64,0.35,...,16.40,36.34,23.77,246400.0,190.09,129.80,1297328.0,483038.6,4566.4,104.4287
15054,VERALTO,US92338C1036,2024-12-20,104.23,0.88,104.22,0.0,104.25,1.64,0.42,...,16.40,36.34,23.77,246400.0,190.09,129.84,1297328.0,1312855.0,12711.4,103.9957


In [24]:
# Check for missing values
print(df_filled.isna().sum())

company_name             0
company_code             0
date                     0
ask_price              555
asset_turnover         105
bid_price              555
cash_dividend            2
close_price            553
current_ratio          105
dividend_yield         553
dpps                   414
ebit                    52
ebitda                 262
gross_pm                 0
interest_cover1        471
interest_cover2        101
ltdce                  419
net_income               0
operating_pm             0
pe                    1237
price_high             555
price_low              555
price_open             555
ptbv                   384
quick_ratio            210
roa                    263
roe                    367
roic                   315
shares_outstanding     105
tdce                   105
total_return           553
trading_volume_wa      310
turnover_value         553
turnover_volume        553
vwap                   555
dtype: int64


In [25]:
# Function to handle remaining missing data
def handle_remaining_missing_data(df_mis):
    '''
    # Step 1: Handle entirely missing variables
    for var in df_mis.columns[3:]:
        if df_mis[var].isna().all():
            # Replace with a default value, e.g., 0
            df_mis[var] = 0  # You can also use df_mis[var].mean() or another value
            '''

    # Step 2: Handle any remaining missing values after specific cases
    for var in df_mis.columns[3:]:
        df_mis[var].fillna(df_mis[var].median(), inplace=True)  # Replace remaining NaNs with median of the column

    return df_mis

# Apply the function to handle remaining missing values
df_filled_final = handle_remaining_missing_data(df_filled)

# Check if any missing values remain
print(df_filled_final.isna().sum())

company_name          0
company_code          0
date                  0
ask_price             0
asset_turnover        0
bid_price             0
cash_dividend         0
close_price           0
current_ratio         0
dividend_yield        0
dpps                  0
ebit                  0
ebitda                0
gross_pm              0
interest_cover1       0
interest_cover2       0
ltdce                 0
net_income            0
operating_pm          0
pe                    0
price_high            0
price_low             0
price_open            0
ptbv                  0
quick_ratio           0
roa                   0
roe                   0
roic                  0
shares_outstanding    0
tdce                  0
total_return          0
trading_volume_wa     0
turnover_value        0
turnover_volume       0
vwap                  0
dtype: int64


# Merge The Two Datasets #

In [39]:
# Importing S&P 500 Return Index and Volatility Index (VIX) from  2014-12-29 until 2024-12-23
sp500_index = pd.read_csv("weekly_sp500_vix_wk1.csv")
sp500_index.head()

Unnamed: 0,Date,SP500_Return (%),VIX_Close
0,2014-12-29,-1.463544,17.790001
1,2015-01-05,-0.650563,17.549999
2,2015-01-12,-1.241681,20.950001
3,2015-01-19,1.604422,16.66
4,2015-01-26,-2.76974,20.969999


In [40]:
sp500_index.tail()

Unnamed: 0,Date,SP500_Return (%),VIX_Close
518,2024-12-02,0.959657,12.77
519,2024-12-09,-0.643324,13.81
520,2024-12-16,-1.987076,18.360001
521,2024-12-23,0.674267,15.95
522,2024-12-30,-0.677119,16.709999


In [41]:
# Move all dates in the "sp500_index" dataset 3 days backward to align with our main dataset
# Ensure the 'Date' column is in datetime format
sp500_index['Date'] = pd.to_datetime(sp500_index['Date'])

# Subtract 3 days from the 'Date' column
sp500_index['Date'] = sp500_index['Date'] - pd.Timedelta(days=3)

# View the modified DataFrame
sp500_index.head()

Unnamed: 0,Date,SP500_Return (%),VIX_Close
0,2014-12-26,-1.463544,17.790001
1,2015-01-02,-0.650563,17.549999
2,2015-01-09,-1.241681,20.950001
3,2015-01-16,1.604422,16.66
4,2015-01-23,-2.76974,20.969999


In [42]:
sp500_index.tail()

Unnamed: 0,Date,SP500_Return (%),VIX_Close
518,2024-11-29,0.959657,12.77
519,2024-12-06,-0.643324,13.81
520,2024-12-13,-1.987076,18.360001
521,2024-12-20,0.674267,15.95
522,2024-12-27,-0.677119,16.709999


In [43]:
# Merge the two datasets

# Ensure the 'Date' columns in both datasets are datetime types
df_filled_final['date'] = pd.to_datetime(df_filled_final['date'])
sp500_index['Date'] = pd.to_datetime(sp500_index['Date'])

# Merge the datasets on the date column
merged_df = df_filled_final.merge(
    sp500_index.rename(columns={'Date': 'date'}),  # Rename for alignment
    on='date',
    how='left'  # Keep all rows in df_filled_final
)

# Display the merged dataset
merged_df.head()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap,SP500_Return (%),VIX_Close
0,ALTRIA GROUP,US02209S1033,2014-12-26,50.6,0.52,50.59,3892000.0,50.6,0.9,4.11,...,30.85,1971475.0,487.49,106763.8,1513931.0,1039317.0,20650.8,50.62,-1.463544,17.790001
1,ALTRIA GROUP,US02209S1033,2015-01-02,48.97,0.58,48.95,4179000.0,48.97,0.86,4.25,...,34.37,1960059.0,448.58,103324.6,1773556.0,941558.1,18881.5,48.972,-0.650563,17.549999
2,ALTRIA GROUP,US02209S1033,2015-01-09,50.61,0.58,50.6,4179000.0,50.6,0.86,4.11,...,34.37,1960059.0,448.58,106763.8,1773556.0,1532743.0,30814.5,50.754,-1.241681,20.950001
3,ALTRIA GROUP,US02209S1033,2015-01-16,53.05,0.58,53.03,4179000.0,53.05,0.86,3.92,...,34.37,1960059.0,448.58,111933.2,1773556.0,1985785.0,38240.8,52.7794,1.604422,16.66
4,ALTRIA GROUP,US02209S1033,2015-01-23,54.2,0.58,54.19,4179000.0,54.19,0.86,3.84,...,34.37,1960059.0,448.58,114338.5,1773556.0,1407894.0,25988.2,54.3862,-2.76974,20.969999


In [44]:
merged_df.tail()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,roic,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap,SP500_Return (%),VIX_Close
15051,VERALTO,US92338C1036,2024-11-29,108.23,0.88,108.2,0.0,108.19,1.64,0.33,...,23.77,246400.0,190.09,134.75,1297328.0,719080.7,6645.2,108.1607,0.959657,12.77
15052,VERALTO,US92338C1036,2024-12-06,106.53,0.88,106.52,0.0,106.53,1.64,0.34,...,23.77,246400.0,190.09,132.68,1297328.0,562336.4,5278.2,106.2803,-0.643324,13.81
15053,VERALTO,US92338C1036,2024-12-13,104.23,0.88,104.22,0.0,104.22,1.64,0.35,...,23.77,246400.0,190.09,129.8,1297328.0,483038.6,4566.4,104.4287,-1.987076,18.360001
15054,VERALTO,US92338C1036,2024-12-20,104.23,0.88,104.22,0.0,104.25,1.64,0.42,...,23.77,246400.0,190.09,129.84,1297328.0,1312855.0,12711.4,103.9957,0.674267,15.95
15055,VERALTO,US92338C1036,2024-12-27,103.12,0.88,103.11,0.0,103.12,1.64,0.43,...,23.77,246400.0,190.09,128.43,1297328.0,249154.3,2415.8,102.9929,-0.677119,16.709999


In [45]:
# Check again for missing values after the merge
print(merged_df.isna().sum())

company_name          0
company_code          0
date                  0
ask_price             0
asset_turnover        0
bid_price             0
cash_dividend         0
close_price           0
current_ratio         0
dividend_yield        0
dpps                  0
ebit                  0
ebitda                0
gross_pm              0
interest_cover1       0
interest_cover2       0
ltdce                 0
net_income            0
operating_pm          0
pe                    0
price_high            0
price_low             0
price_open            0
ptbv                  0
quick_ratio           0
roa                   0
roe                   0
roic                  0
shares_outstanding    0
tdce                  0
total_return          0
trading_volume_wa     0
turnover_value        0
turnover_volume       0
vwap                  0
SP500_Return (%)      0
VIX_Close             0
dtype: int64


# Data Transformation and Standardization #

In [46]:
# Check data types of all columns
print(merged_df.dtypes)

company_name                  object
company_code                  object
date                  datetime64[ns]
ask_price                    float64
asset_turnover               float64
bid_price                    float64
cash_dividend                float64
close_price                  float64
current_ratio                float64
dividend_yield               float64
dpps                         float64
ebit                         float64
ebitda                       float64
gross_pm                     float64
interest_cover1              float64
interest_cover2              float64
ltdce                        float64
net_income                   float64
operating_pm                 float64
pe                           float64
price_high                   float64
price_low                    float64
price_open                   float64
ptbv                         float64
quick_ratio                  float64
roa                          float64
roe                          float64
r

In [47]:
# Create a new column "weekly_return" based on close price

# Ensure the dataset is sorted by company and date
df_sorted = merged_df.sort_values(by=['company_name', 'date']).copy()

# Calculate weekly stock returns for each company
df_sorted['weekly_return'] = df_sorted.groupby('company_name')['close_price'].pct_change()

# Drop rows with NaN in 'return_close'
df_return = df_sorted.dropna(subset=['weekly_return'])

# Display the first few rows to verify
df_return.head()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap,SP500_Return (%),VIX_Close,weekly_return
1,ALTRIA GROUP,US02209S1033,2015-01-02,48.97,0.58,48.95,4179000.0,48.97,0.86,4.25,...,1960059.0,448.58,103324.6,1773556.0,941558.1,18881.5,48.972,-0.650563,17.549999,-0.032213
2,ALTRIA GROUP,US02209S1033,2015-01-09,50.61,0.58,50.6,4179000.0,50.6,0.86,4.11,...,1960059.0,448.58,106763.8,1773556.0,1532743.0,30814.5,50.754,-1.241681,20.950001,0.033286
3,ALTRIA GROUP,US02209S1033,2015-01-16,53.05,0.58,53.03,4179000.0,53.05,0.86,3.92,...,1960059.0,448.58,111933.2,1773556.0,1985785.0,38240.8,52.7794,1.604422,16.66,0.048419
4,ALTRIA GROUP,US02209S1033,2015-01-23,54.2,0.58,54.19,4179000.0,54.19,0.86,3.84,...,1960059.0,448.58,114338.5,1773556.0,1407894.0,25988.2,54.3862,-2.76974,20.969999,0.021489
5,ALTRIA GROUP,US02209S1033,2015-01-30,53.12,0.58,53.11,4179000.0,53.1,0.86,3.92,...,1960059.0,448.58,112038.7,1773556.0,1873899.0,34509.5,53.6496,3.031593,17.290001,-0.020114


In [48]:
df_return.tail()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,shares_outstanding,tdce,total_return,trading_volume_wa,turnover_value,turnover_volume,vwap,SP500_Return (%),VIX_Close,weekly_return
15051,VERALTO,US92338C1036,2024-11-29,108.23,0.88,108.2,0.0,108.19,1.64,0.33,...,246400.0,190.09,134.75,1297328.0,719080.7,6645.2,108.1607,0.959657,12.77,0.016728
15052,VERALTO,US92338C1036,2024-12-06,106.53,0.88,106.52,0.0,106.53,1.64,0.34,...,246400.0,190.09,132.68,1297328.0,562336.4,5278.2,106.2803,-0.643324,13.81,-0.015343
15053,VERALTO,US92338C1036,2024-12-13,104.23,0.88,104.22,0.0,104.22,1.64,0.35,...,246400.0,190.09,129.8,1297328.0,483038.6,4566.4,104.4287,-1.987076,18.360001,-0.021684
15054,VERALTO,US92338C1036,2024-12-20,104.23,0.88,104.22,0.0,104.25,1.64,0.42,...,246400.0,190.09,129.84,1297328.0,1312855.0,12711.4,103.9957,0.674267,15.95,0.000288
15055,VERALTO,US92338C1036,2024-12-27,103.12,0.88,103.11,0.0,103.12,1.64,0.43,...,246400.0,190.09,128.43,1297328.0,249154.3,2415.8,102.9929,-0.677119,16.709999,-0.010839


In [49]:
# Ensure the dataset is sorted by company and date
df_add_features = df_return.sort_values(by=['company_name', 'date']).copy()

# Define a function to calculate Sharpe ratio
def sharpe_ratio(series, risk_free_rate=0.0):
    excess_return = series - risk_free_rate
    return excess_return.mean() / excess_return.std() if excess_return.std() != 0 else 0

# Group by company to calculate moving statistics
def calculate_features(group):
    group['avg_4w'] = group['weekly_return'].rolling(window=4, min_periods=1).mean()
    group['avg_8w'] = group['weekly_return'].rolling(window=8, min_periods=1).mean()
    group['avg_12w'] = group['weekly_return'].rolling(window=12, min_periods=1).mean()
    group['avg_24w'] = group['weekly_return'].rolling(window=24, min_periods=1).mean()
    group['sharpe_ratio_24w'] = group['weekly_return'].rolling(window=24, min_periods=1).apply(sharpe_ratio, raw=True)
    group['skewness_24w'] = group['weekly_return'].rolling(window=24, min_periods=1).skew()
    group['kurtosis_24w'] = group['weekly_return'].rolling(window=24, min_periods=1).kurt()
    return group

# Apply the function to each company
df_add_features = df_add_features.groupby('company_name', group_keys=False).apply(calculate_features)

# Display the result
df_add_features

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,SP500_Return (%),VIX_Close,weekly_return,avg_4w,avg_8w,avg_12w,avg_24w,sharpe_ratio_24w,skewness_24w,kurtosis_24w
1,ALTRIA GROUP,US02209S1033,2015-01-02,48.97,0.58,48.95,4179000.0,48.97,0.86,4.25,...,-0.650563,17.549999,-0.032213,-0.032213,-0.032213,-0.032213,-0.032213,0.000000,,
2,ALTRIA GROUP,US02209S1033,2015-01-09,50.61,0.58,50.60,4179000.0,50.60,0.86,4.11,...,-1.241681,20.950001,0.033286,0.000536,0.000536,0.000536,0.000536,0.016370,,
3,ALTRIA GROUP,US02209S1033,2015-01-16,53.05,0.58,53.03,4179000.0,53.05,0.86,3.92,...,1.604422,16.660000,0.048419,0.016497,0.016497,0.016497,0.016497,0.471436,-1.492278,
4,ALTRIA GROUP,US02209S1033,2015-01-23,54.20,0.58,54.19,4179000.0,54.19,0.86,3.84,...,-2.769740,20.969999,0.021489,0.017745,0.017745,0.017745,0.017745,0.584066,-1.420844,2.284845
5,ALTRIA GROUP,US02209S1033,2015-01-30,53.12,0.58,53.11,4179000.0,53.10,0.86,3.92,...,3.031593,17.290001,-0.020114,0.020770,0.010173,0.010173,0.010173,0.327015,-0.338651,-2.438920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15051,VERALTO,US92338C1036,2024-11-29,108.23,0.88,108.20,0.0,108.19,1.64,0.33,...,0.959657,12.770000,0.016728,0.016211,-0.003600,0.000111,0.003423,0.114822,-0.773463,1.995722
15052,VERALTO,US92338C1036,2024-12-06,106.53,0.88,106.52,0.0,106.53,1.64,0.34,...,-0.643324,13.810000,-0.015343,0.002630,-0.006757,-0.002164,0.003561,0.119840,-0.788762,2.074987
15053,VERALTO,US92338C1036,2024-12-13,104.23,0.88,104.22,0.0,104.22,1.64,0.35,...,-1.987076,18.360001,-0.021684,0.004007,-0.009987,-0.003555,0.004091,0.140361,-0.825087,2.468859
15054,VERALTO,US92338C1036,2024-12-20,104.23,0.88,104.22,0.0,104.25,1.64,0.42,...,0.674267,15.950000,0.000288,-0.005003,0.000558,-0.004162,0.004046,0.138802,-0.819809,2.458953


In [37]:
# Drop rows with NaN values
df_add_features = df_add_features.dropna()

# Reset the index if needed (optional)
df_add_features = df_add_features.reset_index(drop=True)

# Verify the updated DataFrame
df_add_features

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,SP500_Return (%),VIX_Close,weekly_return,avg_4w,avg_8w,avg_12w,avg_24w,sharpe_ratio_24w,skewness_24w,kurtosis_24w
0,ALTRIA GROUP,US02209S1033,2015-01-23,54.20,0.58,54.19,4179000.0,54.19,0.86,3.84,...,-2.769740,20.969999,0.021489,0.017745,0.017745,0.017745,0.017745,0.584066,-1.420844,2.284845
1,ALTRIA GROUP,US02209S1033,2015-01-30,53.12,0.58,53.11,4179000.0,53.10,0.86,3.92,...,3.031593,17.290001,-0.020114,0.020770,0.010173,0.010173,0.010173,0.327015,-0.338651,-2.438920
2,ALTRIA GROUP,US02209S1033,2015-02-06,53.33,0.58,53.32,4179000.0,53.33,0.86,3.90,...,2.019977,14.690000,0.004331,0.013531,0.009200,0.009200,0.009200,0.322995,-0.199270,-1.518379
3,ALTRIA GROUP,US02209S1033,2015-02-13,54.76,0.58,54.75,4179000.0,54.76,0.86,3.80,...,0.634722,14.300000,0.026814,0.008130,0.011716,0.011716,0.011716,0.432640,-0.499518,-1.035469
4,ALTRIA GROUP,US02209S1033,2015-02-20,55.61,0.58,55.60,4179000.0,55.61,0.86,3.74,...,-0.274845,13.340000,0.015522,0.006638,0.012192,0.012192,0.012192,0.480701,-0.581625,-0.487762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14364,VERALTO,US92338C1036,2024-11-22,106.41,0.88,106.40,0.0,106.41,1.64,0.34,...,1.056064,13.510000,0.036326,0.006118,-0.003742,-0.004033,0.003369,0.113083,-0.770468,2.005351
14365,VERALTO,US92338C1036,2024-11-29,108.23,0.88,108.20,0.0,108.19,1.64,0.33,...,0.959657,12.770000,0.016728,0.016211,-0.003600,0.000111,0.003423,0.114822,-0.773463,1.995722
14366,VERALTO,US92338C1036,2024-12-06,106.53,0.88,106.52,0.0,106.53,1.64,0.34,...,-0.643324,13.810000,-0.015343,0.002630,-0.006757,-0.002164,0.003561,0.119840,-0.788762,2.074987
14367,VERALTO,US92338C1036,2024-12-13,104.23,0.88,104.22,0.0,104.22,1.64,0.35,...,-1.987076,18.360001,-0.021684,0.004007,-0.009987,-0.003555,0.004091,0.140361,-0.825087,2.468859


In [53]:
# Create a new column "next_week_return", which will be our target feature

# Shift weekly returns to get next week's performance
df_target = df_add_features.copy()  # Ensure df_return is a copy to avoid warnings
df_target['next_week_return'] = df_target.groupby('company_name')['weekly_return'].shift(-1)

# View the result
df_target

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,VIX_Close,weekly_return,avg_4w,avg_8w,avg_12w,avg_24w,sharpe_ratio_24w,skewness_24w,kurtosis_24w,next_week_return
0,ALTRIA GROUP,US02209S1033,2015-01-23,54.20,0.58,54.19,4179000.0,54.19,0.86,3.84,...,20.969999,0.021489,0.017745,0.017745,0.017745,0.017745,0.584066,-1.420844,2.284845,-0.020114
1,ALTRIA GROUP,US02209S1033,2015-01-30,53.12,0.58,53.11,4179000.0,53.10,0.86,3.92,...,17.290001,-0.020114,0.020770,0.010173,0.010173,0.010173,0.327015,-0.338651,-2.438920,0.004331
2,ALTRIA GROUP,US02209S1033,2015-02-06,53.33,0.58,53.32,4179000.0,53.33,0.86,3.90,...,14.690000,0.004331,0.013531,0.009200,0.009200,0.009200,0.322995,-0.199270,-1.518379,0.026814
3,ALTRIA GROUP,US02209S1033,2015-02-13,54.76,0.58,54.75,4179000.0,54.76,0.86,3.80,...,14.300000,0.026814,0.008130,0.011716,0.011716,0.011716,0.432640,-0.499518,-1.035469,0.015522
4,ALTRIA GROUP,US02209S1033,2015-02-20,55.61,0.58,55.60,4179000.0,55.61,0.86,3.74,...,13.340000,0.015522,0.006638,0.012192,0.012192,0.012192,0.480701,-0.581625,-0.487762,0.012228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14394,VERALTO,US92338C1036,2024-11-29,108.23,0.88,108.20,0.0,108.19,1.64,0.33,...,12.770000,0.016728,0.016211,-0.003600,0.000111,0.003423,0.114822,-0.773463,1.995722,-0.015343
14395,VERALTO,US92338C1036,2024-12-06,106.53,0.88,106.52,0.0,106.53,1.64,0.34,...,13.810000,-0.015343,0.002630,-0.006757,-0.002164,0.003561,0.119840,-0.788762,2.074987,-0.021684
14396,VERALTO,US92338C1036,2024-12-13,104.23,0.88,104.22,0.0,104.22,1.64,0.35,...,18.360001,-0.021684,0.004007,-0.009987,-0.003555,0.004091,0.140361,-0.825087,2.468859,0.000288
14397,VERALTO,US92338C1036,2024-12-20,104.23,0.88,104.22,0.0,104.25,1.64,0.42,...,15.950000,0.000288,-0.005003,0.000558,-0.004162,0.004046,0.138802,-0.819809,2.458953,-0.010839


In [54]:
df_target.tail()

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,VIX_Close,weekly_return,avg_4w,avg_8w,avg_12w,avg_24w,sharpe_ratio_24w,skewness_24w,kurtosis_24w,next_week_return
14394,VERALTO,US92338C1036,2024-11-29,108.23,0.88,108.2,0.0,108.19,1.64,0.33,...,12.77,0.016728,0.016211,-0.0036,0.000111,0.003423,0.114822,-0.773463,1.995722,-0.015343
14395,VERALTO,US92338C1036,2024-12-06,106.53,0.88,106.52,0.0,106.53,1.64,0.34,...,13.81,-0.015343,0.00263,-0.006757,-0.002164,0.003561,0.11984,-0.788762,2.074987,-0.021684
14396,VERALTO,US92338C1036,2024-12-13,104.23,0.88,104.22,0.0,104.22,1.64,0.35,...,18.360001,-0.021684,0.004007,-0.009987,-0.003555,0.004091,0.140361,-0.825087,2.468859,0.000288
14397,VERALTO,US92338C1036,2024-12-20,104.23,0.88,104.22,0.0,104.25,1.64,0.42,...,15.95,0.000288,-0.005003,0.000558,-0.004162,0.004046,0.138802,-0.819809,2.458953,-0.010839
14398,VERALTO,US92338C1036,2024-12-27,103.12,0.88,103.11,0.0,103.12,1.64,0.43,...,16.709999,-0.010839,-0.011895,0.002158,-0.006365,0.002217,0.077397,-0.711546,2.592566,


In [55]:
# standardize data

df_stand = df_target.copy() # Make a copy of df_34

X_cols = ["close_price", "ask_price", "bid_price", "price_high", "price_low", "price_open",
         "turnover_value", "turnover_volume", "vwap", "trading_volume_wa",
         "total_return", "pe", "ptbv", "dividend_yield", "operating_pm", "gross_pm",
         "net_income", "roe", "roic", "roa", "ebit", "ebitda", "dpps",
         "current_ratio", "quick_ratio", "asset_turnover",
         "tdce", "ltdce", "interest_cover1", "interest_cover2", "cash_dividend", "shares_outstanding",
         "SP500_Return (%)", "VIX_Close", "weekly_return",
          "avg_4w", "avg_8w", "avg_12w", "avg_24w", "sharpe_ratio_24w", "skewness_24w","kurtosis_24w"]

y_col = ['next_week_return']

X = df_stand[X_cols]
y = df_stand[y_col]

scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = StandardScaler()
y_scaled= scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

df_scaled = df_stand.copy()

for i in range(len(X_cols)):
    df_scaled[X_cols[i]] = X_scaled[:,i]

df_scaled[y_col[0]] = y_scaled

df_scaled

Unnamed: 0,company_name,company_code,date,ask_price,asset_turnover,bid_price,cash_dividend,close_price,current_ratio,dividend_yield,...,VIX_Close,weekly_return,avg_4w,avg_8w,avg_12w,avg_24w,sharpe_ratio_24w,skewness_24w,kurtosis_24w,next_week_return
0,ALTRIA GROUP,US02209S1033,2015-01-23,-0.269603,-0.463570,-0.269673,2.303231,-0.269655,-0.71306,1.605569,...,0.414175,0.363874,0.585498,0.853302,1.076538,1.539674,2.146880,-1.772019,0.489452,-0.550288
1,ALTRIA GROUP,US02209S1033,2015-01-30,-0.270921,-0.463570,-0.270993,2.303231,-0.270986,-0.71306,1.650267,...,-0.105637,-0.541470,0.722221,0.355401,0.451213,0.654782,0.914374,-0.467964,-1.510805,-0.009240
2,ALTRIA GROUP,US02209S1033,2015-02-06,-0.270665,-0.463570,-0.270736,2.303231,-0.270705,-0.71306,1.639093,...,-0.472896,-0.009498,0.395029,0.291379,0.370807,0.541000,0.895100,-0.300008,-1.121006,0.488359
3,ALTRIA GROUP,US02209S1033,2015-02-13,-0.268920,-0.463570,-0.268989,2.303231,-0.268959,-0.71306,1.583219,...,-0.527985,0.479753,0.150887,0.456847,0.578621,0.835076,1.420826,-0.661811,-0.916520,0.238441
4,ALTRIA GROUP,US02209S1033,2015-02-20,-0.267882,-0.463570,-0.267950,2.303231,-0.267921,-0.71306,1.549695,...,-0.663588,0.234028,0.083460,0.488133,0.617914,0.890680,1.651270,-0.760750,-0.684596,0.165531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14394,VERALTO,US92338C1036,2024-11-29,-0.203661,-0.069867,-0.203677,-0.387846,-0.203727,-0.25599,-0.355593,...,-0.744102,0.260260,0.516150,-0.550294,-0.379775,-0.134079,-0.103044,-0.991918,0.367025,-0.444693
14395,VERALTO,US92338C1036,2024-12-06,-0.205736,-0.069867,-0.205730,-0.387846,-0.205753,-0.25599,-0.350006,...,-0.597199,-0.437646,-0.097725,-0.757857,-0.567660,-0.117929,-0.078983,-1.010353,0.400589,-0.585027
14396,VERALTO,US92338C1036,2024-12-13,-0.208543,-0.069867,-0.208541,-0.387846,-0.208574,-0.25599,-0.344418,...,0.045504,-0.575627,-0.035496,-0.970238,-0.682487,-0.056066,0.019410,-1.054125,0.567372,-0.098735
14397,VERALTO,US92338C1036,2024-12-20,-0.208543,-0.069867,-0.208541,-0.387846,-0.208537,-0.25599,-0.305307,...,-0.294917,-0.097492,-0.442744,-0.276880,-0.732686,-0.061295,0.011935,-1.047765,0.563178,-0.345007


#