In [83]:
from  datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np, pandas as pd
import matplotlib.pyplot as plot 
import random
import lightgbm as lgb
import joblib

In [84]:
def load_df(data_dir='/Users/hshan/Downloads/M5', last_train_day=1913, horizon=56):
    """Load the calendar, sales and price CSVs and build one long-format frame.

    Steps:
      1. Read ``calendar.csv``, ``sales_train_validation.csv`` and
         ``sell_prices.csv`` from ``data_dir``.
      2. Append ``horizon`` all-NaN day columns (``d_<last_train_day + 1>`` ...
         ``d_<last_train_day + horizon>``) to the wide sales table so the days
         to forecast get rows after melting.
      3. Melt the ``d_*`` columns into rows (``d`` / ``sales``), then merge the
         calendar on ``d`` and the prices on ``store_id``/``item_id``/``wm_yr_wk``
         (pandas' default merge type is used for both).
      4. Drop ``date``, ``wm_yr_wk`` and ``weekday`` and integer-encode the
         remaining categorical columns with ``LabelEncoder`` (missing values are
         first replaced by the empty string).

    Parameters
    ----------
    data_dir : str or path-like
        Directory holding the three CSV files. Defaults to the original
        hard-coded location.
    last_train_day : int
        Number of the last ``d_*`` column present in the sales file.
    horizon : int
        How many empty future day columns to append.

    Returns
    -------
    pandas.DataFrame
        The merged long-format frame.
    """
    from pathlib import Path  # local import keeps the function self-contained

    cal_cat_cols = ['weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    sales_cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    unused_cols = ['date', 'wm_yr_wk', 'weekday']

    data_dir = Path(data_dir)
    calendars = pd.read_csv(data_dir / 'calendar.csv')
    sales = pd.read_csv(data_dir / 'sales_train_validation.csv')
    prices = pd.read_csv(data_dir / 'sell_prices.csv')

    # Placeholder columns for the days to forecast. A float NaN scalar is used
    # instead of a dtype-less ``pd.Series()``, whose default dtype is deprecated
    # and would make the melted ``sales`` column object-typed on newer pandas.
    future_cols = [f'd_{day}' for day in range(last_train_day + 1, last_train_day + horizon + 1)]
    sales = sales.assign(**{col: np.nan for col in future_cols})

    id_vars = ['id'] + sales_cat_cols
    value_vars = [col for col in sales.columns if col.startswith('d_')]
    df = pd.melt(sales, id_vars=id_vars, value_vars=value_vars, var_name='d', value_name='sales')
    df = df.merge(calendars, on='d', copy=False)
    df = df.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], copy=False)

    # Drop the helper columns before encoding so no work is spent encoding a
    # column ('weekday') that is discarded anyway.
    df = df.drop(columns=unused_cols)

    label_encoder = LabelEncoder()
    for col in cal_cat_cols + sales_cat_cols:
        if col in unused_cols:
            continue
        df[col] = df[col].fillna('').astype('category')
        df[col] = label_encoder.fit_transform(df[col])

    return df

In [85]:
def lag_features(df, lags=(1, 7, 28), windows=(1, 7, 28)):
    '''Add per-``id`` lagged sales and rolling means of those lags to ``df``.

    For every ``lag`` a column ``lag_<lag>`` is added holding ``sales`` shifted
    by ``lag`` rows within each ``id`` group. Then, for every ``window`` and
    every ``lag``, a column ``mean_<lag>_<window>`` is added holding the rolling
    mean (over ``window`` rows, within each ``id`` group) of ``lag_<lag>``.
    Columns are appended in that order: all lag columns first, then the mean
    columns with ``window`` as the outer loop.

    Rows are shifted positionally, so ``df`` is expected to already be ordered
    by day within each ``id``.

    Note carried over from the original author: the maximum lag should not
    exceed 57 for this dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``id`` and ``sales``. It is modified
        in place.
    lags : iterable of int
        Shift sizes. Defaults to ``(1, 7, 28)``.
    windows : iterable of int
        Rolling window sizes. Defaults to ``(1, 7, 28)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    '''
    lags = list(lags)
    windows = list(windows)
    lag_cols = [f'lag_{lag}' for lag in lags]

    # Group the value Series by the id Series directly; this avoids copying a
    # two-column sub-frame for every feature.
    for lag, lag_col in zip(lags, lag_cols):
        df[lag_col] = df['sales'].groupby(df['id']).shift(lag)

    for window in windows:
        for lag, lag_col in zip(lags, lag_cols):
            mean_col = f'mean_{lag}_{window}'
            df[mean_col] = df[lag_col].groupby(df['id']).transform(
                lambda x: x.rolling(window).mean()
            )

    return df

# Columns treated as categorical: the calendar-derived ones first, then the
# sales-hierarchy identifiers.
categorical_cols = [
    'weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
]
# Columns excluded when the model feature list is built.
removed_cols = [
    'id',
    'date',
    'sales',
    'd',
    'wm_yr_wk',
    'weekday',
]

def pca(x_features_df, n_components):
    """Project ``x_features_df`` onto ``n_components`` principal components.

    A new ``PCA`` is fitted on ``x_features_df`` on every call and the
    transformed data is returned as a DataFrame whose columns are named
    ``pc_1`` ... ``pc_<n_components>``. The returned frame gets a fresh default
    index; the index of ``x_features_df`` is not carried over.

    NOTE(review): because the projection is re-fitted per call, components
    computed here are only comparable to those a model was trained on if the
    fits agree - confirm against the training notebook.
    """
    component_names = [f'pc_{position}' for position in range(1, n_components + 1)]
    reducer = PCA(n_components)
    projected = reducer.fit_transform(x_features_df)
    return pd.DataFrame(data=projected, columns=component_names)



In [None]:
# Build the merged long-format frame and attach the lag / rolling-mean features.
df = lag_features(load_df())

In [None]:
# Keep only the rows for days d_1942 .. d_1969 and replace every missing value
# (in any column) with 0.
cols = [f'd_{day}' for day in range(1942, 1970)]
val_df = df.loc[df['d'].isin(cols)].fillna(0)

In [None]:
# Model inputs are all columns of val_df except those listed in removed_cols.
df_cols = list(val_df.columns)
x_features = [feature for feature in df_cols if feature not in removed_cols]
# Component count is 65% of (total column count - 2), truncated to an int.
# NOTE(review): the "- 2" is presumably tied to how the saved model was
# trained - confirm before changing it.
n_components = int(0.65 * (len(df_cols) - 2))
pca_features_df = pca(val_df[x_features], n_components)

In [None]:
# Load the previously saved model and predict from the PCA features.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
model_path = '/Users/hshan/model.sav'
model = joblib.load(model_path)
result = model.predict(pca_features_df)

In [None]:
# Attach the predictions to their (id, d) keys, then reshape to one row per id
# with one column per day.
result_df = val_df[['id', 'd']].assign(sales=result)
df_unmelted = result_df.pivot(index='id', columns='d', values='sales').reset_index()
df_unmelted.columns.name = None

In [None]:
# Rename the columns to 'id' followed by F1 .. F28.
f_cols = [f'F{step}' for step in range(1, 29)]
df_cols = ['id'] + f_cols
df_unmelted.columns = df_cols

In [None]:
# Rewrite every id from '...validation' to '...evaluation' in one vectorised,
# literal (non-regex) replacement. The previous row-by-row
# `df_unmelted['id'][i] = ...` was a chained assignment: it triggers
# SettingWithCopyWarning and does not modify the frame under pandas
# Copy-on-Write.
df_unmelted['id'] = df_unmelted['id'].str.replace('validation', 'evaluation', regex=False)

In [77]:
# Write the submission file. index=False keeps the meaningless row index from
# being written as an extra unnamed first column, so the file contains exactly
# the columns id, F1 .. F28.
df_unmelted.to_csv('/Users/hshan/Downloads/M5/submission1.csv', index=False)

NameError: name 'df_unmelted' is not defined