In [None]:
from pathlib import Path

import pandas as pd
import statsmodels.api as sm

# Read the merged daily dataset: the first CSV column is used as the index
# and pandas is asked to parse it as dates.
data_folder = Path("data")
df_merge_daily = pd.read_csv(
    data_folder / 'df_merge_daily.csv',
    index_col=0,
    parse_dates=True,
)

# Quick sanity report: shape, leading rows, column dtypes and summary statistics.
print("Data loaded successfully!")
print(f"Shape: {df_merge_daily.shape}")
print("\nFirst few rows:", df_merge_daily.head(), sep="\n")
print("\nData types:", df_merge_daily.dtypes, sep="\n")
print("\nBasic statistics:", df_merge_daily.describe(), sep="\n")

# OLS

In [None]:
def linear_regression_statsmodels(fdf, product='A', other_products_flag=False, temporal_columns_flag=False):
    """
    Fit an OLS regression of ``log_sales_{product}`` on log sell prices using statsmodels.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Input frame. Must contain ``log_sales_{product}`` and every regressor
        column selected by the two flags below.
    product : str, default 'A'
        Product suffix used to build the target (``log_sales_{product}``) and
        own-price (``log_sell_price_{product}``) column names.
    other_products_flag : bool, default False
        If falsy, only ``log_sell_price_{product}`` is used as a price regressor.
        If truthy, the log sell prices of products A-E are used instead.
    temporal_columns_flag : bool, default False
        If truthy, 'dayofmonth', 'month', 'year' and 'is_weekend' are added
        as regressors.

    Returns
    -------
    tuple
        ``(results, beta0, beta1)``: the fitted statsmodels results object and
        the first two fitted parameters taken by position. The constant is
        prepended to the design matrix, so ``beta0`` is the intercept and
        ``beta1`` is the coefficient of the first retained feature.

    Raises
    ------
    KeyError
        If the target or any selected regressor column is missing from ``fdf``.
    IndexError
        If no feature survives the numeric / non-constant filtering, so that
        there is no second parameter to return.
    """
    target = f'log_sales_{product}'
    own_price_col = f'log_sell_price_{product}'

    if other_products_flag:
        price_cols = ['log_sell_price_A', 'log_sell_price_B', 'log_sell_price_C',
                      'log_sell_price_D', 'log_sell_price_E']
    else:
        price_cols = [own_price_col]

    # 'dayofweek' is intentionally left out; 'dayofyear' is excluded because it
    # is correlated with 'month'.
    temporal_cols = ['dayofmonth', 'month', 'year', 'is_weekend'] if temporal_columns_flag else []

    selected_cols = [target] + price_cols + temporal_cols

    # Validate up front so a missing column produces a clear message instead of
    # the generic pandas "not in index" error raised by the selection below.
    missing_cols = [col for col in selected_cols if col not in fdf.columns]
    if missing_cols:
        raise KeyError(f"{missing_cols} not found in input columns: {fdf.columns.tolist()}")

    # Column selection already returns a new frame, and nothing below mutates
    # it, so no defensive copy is needed.
    df_model = fdf[selected_cols]
    print(f"Running OLS regression for product: {product}")
    print("-"*70)
    print(df_model.columns.tolist())

    # Select numeric features (exclude target).
    # NOTE(review): boolean columns are not matched by "number", so a True/False
    # 'is_weekend' column would be dropped silently here - confirm its dtype.
    X = df_model.select_dtypes(include=["number"]).drop(columns=[target], errors='ignore')
    y = df_model[target]

    # Keep only rows where the target and every feature are present.
    complete_rows = X.notna().all(axis=1) & y.notna()
    X = X.loc[complete_rows]
    y = y.loc[complete_rows]

    # Remove constant columns (they would be collinear with the intercept).
    X = X.loc[:, X.std() > 0]

    print(f"Target: {target}")
    print(f"Number of observations: {len(y)}")
    print(f"Features ({len(X.columns)}): {X.columns.tolist()}")

    # Fit OLS with an explicit intercept column.
    X_sm = sm.add_constant(X)
    results = sm.OLS(y, X_sm).fit()

    # results.params is a label-indexed Series ('const', feature names...), so
    # positional access must go through .iloc; plain [0] / [1] relies on a
    # deprecated positional fallback.
    # NOTE(review): with other_products_flag=True the first feature is always
    # 'log_sell_price_A', so beta1 is not the own-price coefficient for other
    # products - confirm this is what callers expect.
    beta0, beta1 = results.params.iloc[0], results.params.iloc[1]

    print("\n" + "="*70)
    print("STATSMODELS OLS REGRESSION SUMMARY")
    print("="*70)
    print(results.summary())

    return results, beta0, beta1


In [None]:
# Fit the model for product A with the default flags (own price only, no
# calendar regressors); the result is a (results, beta0, beta1) tuple.
lr_results = linear_regression_statsmodels(
    df_merge_daily,
    product='A',
)
