In [2]:
import numpy as np
import statsmodels.api as sm
import pandas as pd
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('final_merged_data.csv')
print(data.columns)

Index(['Company Name_x', 'TICKER', 'Quarter', 'Price', 'Total Ret',
       'Price Ret Ex-Dividend', 'Weighted Mkt Return', 'Volume', 'Dividend',
       'Price Low', 'Price High', 'Shares Outstanding', 'Excess Return',
       'Mkt Cap', 'Price Growth Rate', 'Quart Rev', 'Quart NI', 'Equity',
       'Liabilities', 'Revenue', 'Industry Code', 'Asset', 'Current Asset',
       'EPS', 'Rating', 'Revenue Growth', 'Earnings Growth Rate',
       'P/E Ratio Annual', 'P/E Ratio Quarter', 'Mean Recommendation',
       'Median Recommendation', 'Recommendation StdDev',
       'Number of Recommendations', 'Number of Upgrades',
       'Number of Downgrades', 'Buy Percentage', 'Sell Percentage',
       'Hold Percentage', 'Total Ret Avg +1D', 'Total Ret Avg +5D',
       'Total Ret Avg +10D', 'Total Ret Avg +20D', 'Excess Return Avg +1D',
       'Excess Return Avg +5D', 'Excess Return Avg +10D',
       'Excess Return Avg +20D'],
      dtype='object')


In [3]:
# SICCD to Industry Mapping (Short Names)
siccd_mapping = {
    range(100, 1000): "Agriculture",
    range(1000, 1500): "Mining",
    range(1500, 1800): "Construction",
    range(2000, 4000): "Manufacturing",
    range(4000, 5000): "Transport",
    range(5000, 6000): "Retail",
    range(6000, 6800): "Finance",
    range(7000, 9000): "Services",
    range(9100, 10000): "Public",
}

detailed_siccd_mapping = {
    range(100, 1000): "Agriculture, Forestry, and Fishing",
    range(1000, 1500): "Mining and Quarrying",
    range(1500, 1800): "Construction",
    range(2000, 2400): "Food and Tobacco Manufacturing",
    range(2400, 2700): "Paper and Printing Manufacturing",
    range(2700, 2800): "Chemical and Pharmaceutical Manufacturing",
    range(2800, 2900): "Plastics and Rubber Products Manufacturing",
    range(3000, 3400): "Metals and Machinery Manufacturing",
    range(3400, 3600): "Electronics Manufacturing",
    range(3600, 3800): "Technology and Semiconductor Manufacturing",
    range(3800, 4000): "Miscellaneous Manufacturing",
    range(4000, 4500): "Transportation and Logistics",
    range(4500, 5000): "Communication Services and Utilities",
    range(5000, 5200): "Wholesale Trade",
    range(5200, 5600): "Retail Trade",
    range(5600, 6000): "Consumer Services",
    range(6000, 6200): "Banks and Credit Institutions",
    range(6200, 6400): "Insurance Companies",
    range(6400, 6800): "Real Estate and Investment Services",
    range(7000, 7300): "Hotels and Entertainment Services",
    range(7300, 7900): "Professional and Business Services",
    range(7900, 8000): "Healthcare and Social Services",
    range(8000, 8900): "Educational and Research Services",
    range(8900, 9000): "Other Services",
    range(9100, 9700): "Public Administration and Government",
    range(9700, 10000): "International Organizations"
}

def map_sic_to_industry(sic_code):
    for key, value in detailed_siccd_mapping.items():
        if sic_code in key:
            return value
    return "Others"
data['Real Industry'] = data['Industry Code'].apply(map_sic_to_industry)

In [6]:
selected_features = [
    'Weighted Mkt Return',
    'Price Growth Rate',
    'Revenue Growth',
    'Earnings Growth Rate',
    'Mkt Cap',
    'Volume'
]

results_table = []

# Iterate through industries
for industry, group in data.groupby('Real Industry'):
    # Select features and target variable
    X = group[selected_features]
    y = group['Total Ret Avg +5D']

    # Add a constant term
    X = sm.add_constant(X)

    # Clean the data by removing inf, -inf, and NaN values
    X = X.replace([np.inf, -np.inf], np.nan).dropna()
    y = y[X.index]

    # Check for remaining missing values in the target variable
    if y.isna().sum() > 0:
        valid_indices = y.dropna().index
        X = X.loc[valid_indices]
        y = y.loc[valid_indices]

    # Standardize features for numerical stability
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    # Fit the regression model
    model = sm.OLS(y, X_scaled).fit()

    # Extract beta coefficients and R^2
    betas = model.params.to_dict()  # Get coefficients as a dictionary
    r_squared = model.rsquared  # R-squared value

    # Append results for the industry
    results_table.append({
        'Industry': industry,
        'R^2': r_squared,
        **{f"Beta_{key}": value for key, value in betas.items()}
    })

# Create a DataFrame for the results
results_df = pd.DataFrame(results_table)


# Show the results table
results_df

Unnamed: 0,Industry,R^2,Beta_const,Beta_Weighted Mkt Return,Beta_Price Growth Rate,Beta_Revenue Growth,Beta_Earnings Growth Rate,Beta_Mkt Cap,Beta_Volume
0,Banks and Credit Institutions,0.627031,0.0,0.006196,0.027862,0.002585,0.001324,-0.001587,0.006473
1,Communication Services and Utilities,0.410365,0.0,-6.8e-05,0.033518,-0.002206,0.001259,-0.003614,-0.000264
2,Construction,0.465495,0.0,0.000892,0.030045,0.002865,0.00014,0.000601,-0.002282
3,Consumer Services,0.383279,0.0,-0.000621,0.023891,0.000637,-0.000153,0.001893,0.004566
4,Educational and Research Services,0.589659,0.0,0.000774,0.035571,0.000662,-0.00125,-0.000873,0.003433
5,Electronics Manufacturing,0.521806,0.0,0.001304,0.031553,0.002974,0.00193,-0.001386,0.002931
6,Food and Tobacco Manufacturing,0.627094,0.0,0.001284,0.039643,0.002771,0.000204,0.000607,0.000694
7,Healthcare and Social Services,0.671279,0.0,-0.001529,0.032534,-0.000637,0.000988,0.000799,-0.0026
8,Hotels and Entertainment Services,0.339107,0.0,0.003859,0.016392,-0.000561,0.001501,0.004559,-0.004335
9,Insurance Companies,0.460596,0.0,0.003647,0.024429,-4e-05,-0.000295,-0.000947,4.6e-05


In [59]:
#data['Next Quarter Excess Ret'] = data.groupby('TICKER')['Excess Return'].shift(-1)

X = data[selected_features]
#y = data['Next Quarter Excess Ret']
y = data['Total Ret Avg +5D']
X = sm.add_constant(X)

X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y[X.index]

# Check for remaining missing values in the target variable and clean if necessary
if y.isna().sum() > 0:
    valid_indices = y.dropna().index
    X = X.loc[valid_indices]
    y = y.loc[valid_indices]

# Standardize features for numerical stability
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

model = sm.OLS(y, X_scaled).fit()

print(model.summary())


                                 OLS Regression Results                                
Dep. Variable:      Total Ret Avg +5D   R-squared (uncentered):                   0.414
Model:                            OLS   Adj. R-squared (uncentered):              0.414
Method:                 Least Squares   F-statistic:                              1104.
Date:                Wed, 04 Dec 2024   Prob (F-statistic):                        0.00
Time:                        20:31:25   Log-Likelihood:                          17995.
No. Observations:                9379   AIC:                                 -3.598e+04
Df Residuals:                    9373   BIC:                                 -3.594e+04
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

  return np.sqrt(eigvals[0]/eigvals[-1])
