In [None]:
import pandas as pd
import numpy as np

In [None]:
# Path of the input CSV, named so it is easy to find and change.
LIQUOR_CSV_PATH = "./AggregatedLiquor2.csv"

# Read the raw table from disk.
Base_DF = pd.read_csv(LIQUOR_CSV_PATH)

In [None]:
# Build the cleaned working frame as a NEW object.
# A plain `Edited_DF = Base_DF` only aliases the same DataFrame, so an in-place
# drop/dropna would also rewrite Base_DF and make this cell raise a KeyError
# when re-run (the columns would already be gone). The chained, non-mutating
# form below leaves Base_DF untouched and is safe to execute repeatedly.
Edited_DF = (
    Base_DF
    .drop(columns=['Unnamed: 0', 'Change Rate'])  # columns excluded from the working frame
    .dropna()  # discard every row that has at least one missing value
)

In [None]:
# Display the column labels of Edited_DF.
Edited_DF.columns

Index(['AlcFamily', 'Month', 'Year', 'County', 'Bottle Volume (ml)',
       'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold',
       'Sale (Dollars)', 'Volume Sold (Liters)', 'Volume Sold (Gallons)',
       'Date', 'is_holiday_season', 'days_in_month', 'Season',
       'prev_year_bottle_volume_ml', 'prev_year_state_bottle_cost',
       'prev_year_volume_sold_liters', 'prev_year_volume_sold_gallons',
       'prev_year_sale_bottles', '3m_rolling_bottle_volume_ml',
       '3m_rolling_state_bottle_cost', '3m_rolling_volume_sold_liters',
       '3m_rolling_volume_sold_gallons', '3m_rolling_average',
       'Per Capita Income', 'Personal Income', 'LABORFORCE', 'EMPLOYMENT',
       'UNEMPLOYMENT', 'UNEMPLOYMENT RATE', 'Value', 'Population'],
      dtype='object')

In [None]:
def ridge_regression(X, y, lambda_):
    """Solve the ridge-penalised normal equations and return the coefficients.

    A column of ones is prepended to ``X`` so the first returned coefficient is
    the intercept. The penalty matrix is the identity with its top-left entry
    zeroed, which means ``lambda_`` shrinks every coefficient except the
    intercept.

    Parameters
    ----------
    X : array-like with a ``.shape`` attribute
        Feature matrix, one row per observation.
    y : array-like
        Target values, one per row of ``X``.
    lambda_ : float
        Penalty strength multiplying the penalty matrix.

    Returns
    -------
    numpy.ndarray
        Coefficients ordered as intercept first, then one per column of ``X``.
    """
    n_rows = X.shape[0]
    design = np.column_stack((np.ones(n_rows), X))
    n_coefs = design.shape[1]

    # Diagonal penalty: 0 for the intercept slot, 1 for every feature slot.
    penalty = np.diag(np.r_[0.0, np.ones(n_coefs - 1)])

    gram = design.T @ design
    moment = design.T @ y
    return np.linalg.solve(gram + lambda_ * penalty, moment)

In [None]:
from sklearn.model_selection import KFold
import numpy as np

def manual_mse(y_true, y_pred):
    """Return the mean squared error between observed and predicted values.

    Both arguments are converted to NumPy arrays, subtracted element-wise,
    squared, and averaged over all elements.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(residuals ** 2)

def ols(X, y):
    """Fit ordinary least squares with an intercept and return the coefficients.

    A column of ones is prepended to ``X`` so the first returned coefficient is
    the intercept. The fit uses ``np.linalg.lstsq`` on the design matrix rather
    than ``np.linalg.solve`` on the normal equations: forming ``X.T @ X``
    squares the condition number, and ``solve`` raises ``LinAlgError`` when
    features are exactly collinear. ``lstsq`` returns the minimum-norm
    least-squares solution in that case and matches the normal-equation
    solution whenever the design has full column rank.

    Parameters
    ----------
    X : array-like with a ``.shape`` attribute
        Feature matrix, one row per observation; values must be numeric.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    numpy.ndarray
        Coefficients ordered as intercept first, then one per column of ``X``.
    """
    # Cast to float so integer/bool-coded columns cannot yield an object array.
    features = np.asarray(X, dtype=float)
    X_with_intercept = np.column_stack((np.ones(X.shape[0]), features))
    targets = np.asarray(y, dtype=float)

    # rcond=None selects NumPy's machine-precision-based rank cutoff.
    beta_hat, *_ = np.linalg.lstsq(X_with_intercept, targets, rcond=None)
    return beta_hat

def ols_cross_validation(X, y, k=5):
    """Compare OLS against a predict-the-mean baseline with k-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values aligned row-for-row with ``X``.
    k : int, default 5
        Number of folds. Folds are shuffled with a fixed ``random_state=42``,
        so repeated calls on the same data use the same splits.

    Returns
    -------
    tuple
        ``(average_mse, fold_errors, average_baseline_mse, baseline_errors)``
        where the two lists hold one validation MSE per fold and the two
        averages are the means of those lists.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_errors = []
    baseline_errors = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # OLS model: fit on the training rows, predict the held-out rows.
        beta_hat = ols(X_train, y_train)
        X_val_with_intercept = np.column_stack((np.ones(X_val.shape[0]), X_val))
        y_pred = X_val_with_intercept @ beta_hat
        fold_errors.append(manual_mse(y_val, y_pred))

        # Baseline: predict the training-fold mean for every held-out row.
        # np.full with an explicit float dtype is used instead of np.full_like,
        # which would inherit y_val's dtype and truncate the mean to an integer
        # whenever the target column is integer-typed.
        y_baseline_pred = np.full(y_val.shape, y_train.mean(), dtype=float)
        baseline_errors.append(manual_mse(y_val, y_baseline_pred))

    average_mse = np.mean(fold_errors)
    average_baseline_mse = np.mean(baseline_errors)

    return average_mse, fold_errors, average_baseline_mse, baseline_errors

In [None]:
import pandas as pd

# Model 1

# One-hot encode the categorical columns; drop_first=True omits the first level
# of each encoded variable.
Liquor_X = pd.get_dummies(Edited_DF, columns=['AlcFamily', 'Season', 'is_holiday_season'], drop_first=True)


print(Liquor_X.columns)

# Predictors used by this model.
# NOTE(review): 'is_holiday_season' is encoded above, but none of its dummy
# columns is selected here - confirm that leaving it out is intended.
selected_columns = [
    'AlcFamily_Gins',
    'AlcFamily_Liqueurs',
    'AlcFamily_Rums',
    'AlcFamily_Schnapps',
    'AlcFamily_Specialty/Other',
    'AlcFamily_Vodkas',
    'AlcFamily_Whiskies',
    'Season_Spring',
    'Season_Summer',
    'Season_Winter',
    'days_in_month',
    'prev_year_sale_bottles',
    '3m_rolling_average',
    'Per Capita Income',
    'Personal Income',
    'LABORFORCE',
    'EMPLOYMENT',
    'UNEMPLOYMENT',
    'UNEMPLOYMENT RATE',
    'Value',
    'Population'
]

# Dummy columns are derived from the selection instead of being listed a second
# time by hand, so the two lists cannot drift apart.
columns_to_convert = [
    name for name in selected_columns
    if name.startswith(('AlcFamily_', 'Season_'))
]

# Select the predictors and cast the dummy columns to int in one step.
# astype with a column->dtype mapping returns a new frame, so nothing is
# assigned back into a column subset (the pattern that triggers pandas'
# SettingWithCopyWarning).
Liquor_X = Liquor_X[selected_columns].astype({name: int for name in columns_to_convert})

# Target variable.
Liquor_Y = Edited_DF['Bottles Sold']


Index(['Month', 'Year', 'County', 'Bottle Volume (ml)', 'State Bottle Cost',
       'State Bottle Retail', 'Bottles Sold', 'Sale (Dollars)',
       'Volume Sold (Liters)', 'Volume Sold (Gallons)', 'Date',
       'days_in_month', 'prev_year_sale_bottles', '3m_rolling_average',
       'Per Capita Income', 'Personal Income', 'LABORFORCE', 'EMPLOYMENT',
       'UNEMPLOYMENT', 'UNEMPLOYMENT RATE', 'Value', 'Population',
       'AlcFamily_Gins', 'AlcFamily_Liqueurs', 'AlcFamily_Rums',
       'AlcFamily_Schnapps', 'AlcFamily_Specialty/Other', 'AlcFamily_Vodkas',
       'AlcFamily_Whiskies', 'Season_Spring', 'Season_Summer',
       'Season_Winter'],
      dtype='object')


In [None]:
# Run 5-fold cross-validation on the current Liquor_X / Liquor_Y.
cv_outcome = ols_cross_validation(Liquor_X, Liquor_Y, k=5)
average_mse, fold_errors, average_baseline_mse, baseline_errors = cv_outcome

# Summary: mean validation MSE for the OLS model and for the baseline.
summary_lines = (
    "Cross-Validation Results: OLS Model vs Baseline Model",
    "------------------------------------------------------",
    f"Average OLS MSE: {average_mse:.4f}",
    f"Average Baseline MSE: {average_baseline_mse:.4f}\n",
)
for line in summary_lines:
    print(line)

# Per-fold table, folds numbered from 1.
print(f"{'Fold':<6}{'OLS MSE':<15}{'Baseline MSE':<15}")
print("-" * 36)
for i, fold_pair in enumerate(zip(fold_errors, baseline_errors), 1):
    ols_mse, baseline_mse = fold_pair
    print(f"{i:<6}{ols_mse:<15.4f}{baseline_mse:<15.4f}")

Cross-Validation Results: OLS Model vs Baseline Model
------------------------------------------------------
Average OLS MSE: 2290394.4601
Average Baseline MSE: 75378940.5856

Fold  OLS MSE        Baseline MSE   
------------------------------------
1     1841231.2144   69826901.0872  
2     2411962.6539   74912450.4438  
3     1834517.8163   79051183.8596  
4     2323070.3678   84557380.7616  
5     3041190.2483   68546786.7756  


In [None]:
# Model 2 (More Features)
import pandas as pd

# One-hot encode the categorical columns; drop_first=True omits the first level
# of each encoded variable.
Liquor_X = pd.get_dummies(Edited_DF, columns=['AlcFamily', 'Season'], drop_first=True)

# Predictors used by this model.
selected_columns = [
    'AlcFamily_Gins',
    'AlcFamily_Liqueurs',
    'AlcFamily_Rums',
    'AlcFamily_Schnapps',
    'AlcFamily_Specialty/Other',
    'AlcFamily_Vodkas',
    'AlcFamily_Whiskies',
    'Season_Spring',
    'Season_Summer',
    'Season_Winter',
    'days_in_month',
    'prev_year_sale_bottles',
    '3m_rolling_average',
    'prev_year_bottle_volume_ml',
    'prev_year_state_bottle_cost',
    'prev_year_volume_sold_liters',
    'prev_year_volume_sold_gallons',
    '3m_rolling_bottle_volume_ml',
    '3m_rolling_state_bottle_cost',
    '3m_rolling_volume_sold_liters',
    '3m_rolling_volume_sold_gallons',
    'Per Capita Income',
    'Personal Income',
    'LABORFORCE',
    'EMPLOYMENT',
    'UNEMPLOYMENT',
    'UNEMPLOYMENT RATE',
    'Value',
    'Population'
]

# Dummy columns are derived from the selection instead of being listed a second
# time by hand, so the two lists cannot drift apart.
columns_to_convert = [
    name for name in selected_columns
    if name.startswith(('AlcFamily_', 'Season_'))
]

# Select the predictors once (the selection used to be repeated on two
# consecutive lines) and cast the dummy columns to int in the same step.
# astype with a column->dtype mapping returns a new frame, so nothing is
# assigned back into a column subset (the pattern that triggers pandas'
# SettingWithCopyWarning).
Liquor_X = Liquor_X[selected_columns].astype({name: int for name in columns_to_convert})

# Target variable.
Liquor_Y = Edited_DF['Bottles Sold']

print(Liquor_X.columns)


Index(['AlcFamily_Gins', 'AlcFamily_Liqueurs', 'AlcFamily_Rums',
       'AlcFamily_Schnapps', 'AlcFamily_Specialty/Other', 'AlcFamily_Vodkas',
       'AlcFamily_Whiskies', 'Season_Spring', 'Season_Summer', 'Season_Winter',
       'days_in_month', 'prev_year_sale_bottles', '3m_rolling_average',
       'prev_year_bottle_volume_ml', 'prev_year_state_bottle_cost',
       'prev_year_volume_sold_liters', 'prev_year_volume_sold_gallons',
       '3m_rolling_bottle_volume_ml', '3m_rolling_state_bottle_cost',
       '3m_rolling_volume_sold_liters', '3m_rolling_volume_sold_gallons',
       'Per Capita Income', 'Personal Income', 'LABORFORCE', 'EMPLOYMENT',
       'UNEMPLOYMENT', 'UNEMPLOYMENT RATE', 'Value', 'Population'],
      dtype='object')


In [None]:
# Run 5-fold cross-validation on the current Liquor_X / Liquor_Y.
cv_outcome = ols_cross_validation(Liquor_X, Liquor_Y, k=5)
average_mse, fold_errors, average_baseline_mse, baseline_errors = cv_outcome

# Summary: mean validation MSE for the OLS model and for the baseline.
summary_lines = (
    "Cross-Validation Results: OLS Model vs Baseline Model",
    "------------------------------------------------------",
    f"Average OLS MSE: {average_mse:.4f}",
    f"Average Baseline MSE: {average_baseline_mse:.4f}\n",
)
for line in summary_lines:
    print(line)

# Per-fold table, folds numbered from 1.
print(f"{'Fold':<6}{'OLS MSE':<15}{'Baseline MSE':<15}")
print("-" * 36)
for i, fold_pair in enumerate(zip(fold_errors, baseline_errors), 1):
    ols_mse, baseline_mse = fold_pair
    print(f"{i:<6}{ols_mse:<15.4f}{baseline_mse:<15.4f}")

Cross-Validation Results: OLS Model vs Baseline Model
------------------------------------------------------
Average OLS MSE: 1818789.5335
Average Baseline MSE: 75378940.5856

Fold  OLS MSE        Baseline MSE   
------------------------------------
1     1187472.3973   69826901.0872  
2     2030461.4707   74912450.4438  
3     1747049.4415   79051183.8596  
4     1750400.7150   84557380.7616  
5     2378563.6429   68546786.7756  
