In [238]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from pathlib import Path

comp_dir = Path('/Users/rishav/Documents/Sem_4/Machine_learning and Pattern_recognition/store-sales-time-series-forecasting')
data_dir = Path("../input/ts-course-data")

store_sales = pd.read_csv(
    comp_dir / 'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()

family_sales = (
    store_sales
    .groupby(['family', 'date'])
    .mean()
    .unstack('family')
    .loc['2017']
)


  store_sales = pd.read_csv(
  store_sales


In [239]:

class BoostedHybrid:
    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_columns = None  


In [240]:
def fit(self, X_1, X_2, y):
    self.model_1.fit(X_1, y)
    y_fit = pd.DataFrame(
        self.model_1.predict(X_1), 
        index=X_1.index, columns=y.columns,
    )

    y_resid = y - y_fit
    y_resid = y_resid.stack().squeeze() 

    self.model_2.fit(X_2, y_resid)

    self.y_columns = y.columns
    self.y_fit = y_fit
    self.y_resid = y_resid


BoostedHybrid.fit = fit

In [241]:

def predict(self, X_1, X_2):
    y_pred = pd.DataFrame(
        self.model_1.predict(X_1), 
        index=X_1.index, columns=self.y_columns,
    )
    y_pred = y_pred.stack().squeeze() 

    y_pred += self.model_2.predict(X_2)
    return y_pred.unstack()


BoostedHybrid.predict = predict

In [242]:
y = store_sales.unstack(['store_nbr', 'family']).loc["2017"]


fourier = CalendarFourier(freq='M', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X_1= dp.in_sample()
X_1['NewYear'] = (X_1.index.dayofyear == 1)
X_2 = family_sales.drop('sales', axis=1).stack()  

le = LabelEncoder()  
X_2 = X_2.reset_index('family')
X_2['family'] = le.fit_transform(X_2['family'])

X_2["day"] = X_2.index.day 

  index = pd.date_range("2020-01-01", freq=freq, periods=1)
  X_2 = family_sales.drop('sales', axis=1).stack()  # onpromotion feature


In [243]:
model = BoostedHybrid(
    model_1=XGBRegressor(),
    model_2=LinearRegression(),
)
model.fit(X_1, X_2, y)

y_pred = model.predict(X_1, X_2)
y_pred = y_pred.clip(0.0)
actual_sales = y.values.flatten()
predicted_sales = y_pred.values.flatten()

squared_log_diff = np.square(np.log1p(actual_sales) - np.log1p(predicted_sales))

mean_squared_log_diff = np.mean(squared_log_diff)

rmsle = np.sqrt(mean_squared_log_diff)

print(f"Root Mean Squared Logarithmic Error (RMSLE): {rmsle}")


  y_resid = y_resid.stack().squeeze() # wide to long


Root Mean Squared Logarithmic Error (RMSLE): 0.015034277278826332


  y_pred = y_pred.stack().squeeze()  # wide to long


In [244]:

test_data = pd.read_csv(
    comp_dir / 'test.csv',
    usecols=['store_nbr', 'family', 'date', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)

X_1_test = dp.out_of_sample(steps=16)
X_1_test['NewYear'] = (X_1_test.index.dayofyear == 1)

X_2_test = test_data.copy()
X_2_test['date'] = X_2_test.date.dt.to_period('D')
X_2_test = X_2_test.set_index(['store_nbr', 'family', 'date']).sort_index()
family_sales = (
    X_2_test
    .groupby(['family', 'date'])
    .mean()
    .unstack('family')
    .loc['2017']
)
X_2 = family_sales.stack()  
le = LabelEncoder()  
X_2 = X_2.reset_index('family')
X_2['family'] = le.fit_transform(X_2['family'])

X_2["day"] = X_2.index.day 

y_pred_test = model.predict(X_1_test, X_2)
y_pred_test = y_pred_test.clip(0.0)  




  test_data = pd.read_csv(
  X_2_test
  X_2 = family_sales.stack()  # onpromotion feature
  y_pred = y_pred.stack().squeeze()  # wide to long


In [245]:
print(y_pred_test.columns.tolist()[333])
df_test = pd.read_csv(
    comp_dir / 'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
y_submit = y_pred_test.stack(['store_nbr', 'family']).reset_index()
y_submit = y_submit.rename(columns={0: 'sales'})
y_submit = y_submit.join(df_test[['id']])  
y_submit = y_submit.reindex(columns=['id','sales'])
y_submit.to_csv('/Users/rishav/Downloads/submission.csv', index=False)



  df_test = pd.read_csv(


('sales', '19', 'BEVERAGES')


  y_submit = y_pred_test.stack(['store_nbr', 'family']).reset_index()
