<a href="https://colab.research.google.com/github/ikyath/M5-Forecasting-Accuracy-Kaggle/blob/master/M5_Forecast_Encoder_Decoder_BetterFeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
pwd


In [0]:
cd /content/drive/My\ Drive/Data\ Science


In [0]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from tqdm.notebook import tqdm as tqdm
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
from keras.layers import RepeatVector,TimeDistributed
from numpy import array
from keras.models import Sequential, load_model

In [0]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    For every column whose dtype is one of int16/32/64 or float16/32/64 the
    column min and max are compared against the representable range of the
    candidate dtypes (smallest first) and the column is cast to the first
    dtype whose range contains both. The range check is inclusive, so a
    column spanning exactly -128..127 is stored as int8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns may be cast to float16, which keeps only a few significant
    digits; values are rounded accordingly.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (e.g. empty column) no candidate matches and
            # the column is left untouched.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN): use float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [0]:
def read_data(PATH):
    """Read the four competition CSV files found under ``PATH``.

    ``calendar.csv`` and ``sell_prices.csv`` are passed through
    ``reduce_mem_usage`` right after loading; ``sales_train_validation.csv``
    and ``sample_submission.csv`` are returned as read. The shape of the
    first three frames is printed as they are loaded.

    Returns
    -------
    tuple
        ``(calendar, sell_prices, sales_train_validation, submission)``.
    """
    print('Reading files...')

    calendar = reduce_mem_usage(pd.read_csv(f'{PATH}/calendar.csv'))
    print('Calendar has {} rows and {} columns'.format(*calendar.shape))

    sell_prices = reduce_mem_usage(pd.read_csv(f'{PATH}/sell_prices.csv'))
    print('Sell prices has {} rows and {} columns'.format(*sell_prices.shape))

    sales_train_validation = pd.read_csv(f'{PATH}/sales_train_validation.csv')
    print('Sales train validation has {} rows and {} columns'.format(*sales_train_validation.shape))

    submission = pd.read_csv(f'{PATH}/sample_submission.csv')

    return calendar, sell_prices, sales_train_validation, submission

In [0]:
# Load the four CSV files from the mounted Drive folder; calendar and
# selling_prices come back already downcast by reduce_mem_usage.
# NOTE(review): the following cell re-reads the same file names from the
# working directory and rebinds calendar, selling_prices and sales.
calendar, selling_prices, sales, submission = read_data("/content/drive/My Drive/Data Science")

In [0]:
# Read the raw CSVs again using paths relative to the current working
# directory (set by the earlier `cd` cell). This rebinds sales, calendar and
# selling_prices, so the downcast frames returned by read_data are discarded.
# NOTE(review): the sample submission is stored under a new name
# (submission_file) while `submission` from read_data stays bound as well.
sales = pd.read_csv('sales_train_validation.csv')
calendar = pd.read_csv('calendar.csv')
selling_prices = pd.read_csv('sell_prices.csv')
submission_file = pd.read_csv('sample_submission.csv')

In [0]:
# Preview the first rows of the sales frame.
sales.head()

In [0]:
# Preview the first rows of the calendar frame as loaded.
calendar.head()

In [0]:
# Lay the calendar's event / SNAP columns out as rows, one column per day
# label d_1 .. d_1969 (the DataFrame constructor requires calendar to have
# exactly that many rows).
days = range(1, 1970)
time_series_columns = [f'd_{i}' for i in days]
_event_snap_rows = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
                    'snap_CA', 'snap_TX', 'snap_WI']
transfer_cal = pd.DataFrame(calendar[_event_snap_rows].values.T,
                            index=_event_snap_rows,
                            columns=time_series_columns)
transfer_cal = transfer_cal.fillna(0)


def _digits_or_ten(row):
    """Keep entries whose string form is all digits; replace the rest with 10.

    After the fillna(0) above, the digit-only entries are the 0 placeholders
    (plus any value that was already an integer), so every other entry ends
    up as 10.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return row.apply(lambda x: x if re.search(r"^\d+$", str(x)) else np.nan).fillna(10)


# NOTE(review): these two series are not referenced again in the visible
# notebook, and transfer_cal is rebuilt from the transformed calendar later.
event_name_1_se = _digits_or_ten(transfer_cal.loc['event_name_1'])
event_name_2_se = _digits_or_ten(transfer_cal.loc['event_name_2'])

In [0]:
# Preview the transposed event / SNAP table (rows = features, columns = days).
transfer_cal.head()

In [0]:
def transform(data):
    """Encode the calendar's categorical columns and add cyclical date features.

    Steps, all applied to ``data`` itself:

    1. Missing values in the four event columns are replaced by ``'unknown'``.
    2. The event, SNAP, ``month`` and ``wday`` columns are replaced by integer
       codes from a fresh ``LabelEncoder`` per column.
    3. Cosine / sine features are added for ``wday`` (period 7), ``month``
       (period 12) and the day of month taken from ``date`` (period 31).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns listed above plus a datetime-like ``date``
        column (``.dt.day`` is used). The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the encoded and added columns.
    """

    def _cyclical(values, period):
        """Return (cos, sin) of ``values`` mapped onto a circle of ``period``."""
        angle = values / (period / (2 * np.pi))
        return np.cos(angle), np.sin(angle)

    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back: calling fillna(..., inplace=True) on the
        # column selection is chained assignment and is not guaranteed to
        # write through to `data` (it does not under pandas Copy-on-Write).
        data[feature] = data[feature].fillna('unknown')

    cat = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
           'snap_CA', 'snap_TX', 'snap_WI', 'month', 'wday']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    # NOTE: wday and month hold the label codes produced just above (not the
    # original calendar values) when the cyclical features are computed.
    data['wday_cos'], data['wday_sin'] = _cyclical(data['wday'], 7)
    data['month_cos'], data['month_sin'] = _cyclical(data['month'], 12)
    data['day_cos'], data['day_sin'] = _cyclical(data['date'].dt.day, 31)

    return data

In [0]:
# Preview calendar before the date filter and feature encoding below.
calendar.head()

In [0]:
calendar['date'] = pd.to_datetime(calendar['date'])
# Keep only the rows dated 2016-01-27 or later (reduces memory). The explicit
# .copy() gives transform() an independent frame to write into; without it the
# column assignments inside transform() target a boolean-mask slice of the
# full calendar, which triggers SettingWithCopyWarning.
calendar = calendar[calendar['date'] >= '2016-1-27'].copy()
# Encode categorical columns and add the cyclical day / weekday / month features.
calendar = transform(calendar)


In [0]:
# Preview calendar after filtering and transform().
calendar.head()

In [0]:
# Transpose the encoded calendar features so that each feature is a row and
# each remaining calendar day is a column (default integer column labels).
_calendar_feature_rows = [
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    'day_cos', 'day_sin', 'wday_cos', 'wday_sin', 'month_cos', 'month_sin',
]
_calendar_matrix = calendar[_calendar_feature_rows].values
transfer_cal = pd.DataFrame(_calendar_matrix.T, index=_calendar_feature_rows)
transfer_cal

In [0]:
# Preview the last rows of the transformed calendar.
calendar.tail()

In [0]:
# (rows, columns) of the filtered calendar; the row count is the number of
# day columns in transfer_cal.
calendar.shape

In [0]:
# Attach every sell-price record to each calendar day of its week
# (left join on wm_yr_wk), then build the series id from item and store
# with the '_validation' suffix.
price_fea = (
    calendar[['wm_yr_wk', 'date']]
    .merge(selling_prices, how='left', on=['wm_yr_wk'])
    .assign(id=lambda frame: frame['item_id'] + '_' + frame['store_id'] + '_validation')
)


In [0]:
# Preview the long-format price table (one row per day and price record).
price_fea.head()

In [0]:
# Reshape prices to wide format: one row per series id, one column per date.
# Keyword arguments are used because recent pandas versions no longer accept
# index / columns / values positionally.
df = price_fea.pivot(index='id', columns='date', values='sell_price')

In [0]:
# Preview the wide price table (index = id, columns = date).
df.head()

In [0]:
# Preview price_fea again (pivot() returned a new frame; price_fea is unchanged).
price_fea.head()

In [0]:
# Re-order the wide price table to follow the row order of `sales`.
# Only the id column of `sales` takes part in the merge, so the result holds
# exactly the date columns of `df` (no hard-coded column count) and the
# day-by-day sales columns are never copied.
price_df = (
    sales[['id']]
    .merge(df.reset_index(), on='id', how='left')
    .set_index('id')
)
price_df.head()

In [0]:
# Display the transposed calendar feature table.
transfer_cal

In [0]:
# Length check for the calendar window used when building X: the slice drops
# the last 28 columns and keeps the 100 columns before them.
transfer_cal.loc['event_name_1'][-(100+28):-(28)].shape

In [0]:
# Column labels d_1 .. d_1913 (this rebinds `days` and `time_series_columns`
# from the earlier calendar cell).
days = range(1, 1913 + 1)
time_series_columns = [f'd_{i}' for i in days]
time_series_data = sales[time_series_columns]  # keep only the d_1 .. d_1913 columns

In [0]:
# (number of series, number of day columns)
time_series_data.shape

In [0]:
# Build the model tensor X with shape (series, 100 steps, 15 features).
# Feature order along the last axis:
#   0-12  calendar features (identical for every series)
#   13    sell price of the series
#   14    unit sales of the series
# The calendar block does not depend on the series, so it is sliced once and
# broadcast instead of being rebuilt inside a per-series Python loop.
# NOTE(review): calendar and price use columns [-(100+28):-28] of their own
# tables while sales uses its last 100 columns; the windows are positional and
# independent of each other -- confirm that this offset is intended.
_window = slice(-(100 + 28), -28)
_calendar_rows = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
                  'snap_CA', 'snap_TX', 'snap_WI',
                  'day_sin', 'day_cos', 'wday_sin', 'wday_cos', 'month_sin', 'month_cos']

_cal_block = transfer_cal.loc[_calendar_rows].iloc[:, _window].to_numpy(dtype=np.float32).T  # (steps, 13)
_price_block = price_df.iloc[:, _window].to_numpy(dtype=np.float32)                          # (series, steps)
_sales_block = time_series_data.iloc[:, -100:].to_numpy(dtype=np.float32)                    # (series, steps)

_n_series, _n_window = _sales_block.shape
# The previous zip()-based construction silently truncated to the shortest
# input; fail loudly instead if the three windows disagree.
assert _cal_block.shape[0] == _n_window, 'calendar window length differs from sales window'
assert _price_block.shape == _sales_block.shape, 'price window shape differs from sales window'

X = np.concatenate(
    [
        np.broadcast_to(_cal_block, (_n_series, _n_window, _cal_block.shape[1])),
        _price_block[:, :, np.newaxis],
        _sales_block[:, :, np.newaxis],
    ],
    axis=2,
)

In [0]:
# (series, time steps, features)
X.shape

In [0]:
def Normalize(list):
    """Min-max scale ``list`` to [0, 1] using its global minimum and maximum.

    Parameters
    ----------
    list : array-like
        Values to scale. The input is copied; the caller's object is not
        modified. (The parameter name shadows the builtin but is kept for
        backward compatibility.)

    Returns
    -------
    tuple
        ``(scaled, low, high)`` where ``scaled`` is a NumPy array and
        ``low`` / ``high`` are the minimum / maximum over all elements. When
        ``high == low`` the values are returned unscaled.
    """
    values = np.array(list)
    # Scaled values are fractional; writing them back into an integer array
    # would truncate them, so promote non-float input to float first.
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)
    low, high = np.percentile(values, [0, 100])
    delta = high - low
    if delta != 0:
        # Write in place so the array keeps its (float) dtype.
        values[...] = (values - low) / delta
    return values, low, high

def FNoramlize(list, low, high):
    """Undo min-max scaling: map each entry ``v`` to ``v * (high - low) + low``.

    The container is updated in place, one first-axis entry at a time, and the
    same object is returned. When ``high == low`` nothing is changed.
    """
    span = high - low
    if span == 0:
        return list
    for position, entry in enumerate(list):
        list[position] = entry * span + low
    return list

def Normalize2(list, low, high):
    """Min-max scale ``list`` with externally supplied bounds.

    Computes ``(value - low) / (high - low)`` for every element of a copy of
    the input and returns it as a NumPy array. When ``high == low`` the copy
    is returned unscaled.
    """
    values = np.array(list)
    # Scaled values are fractional; writing them back into an integer array
    # would truncate them, so promote non-float input to float first.
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)
    delta = high - low
    if delta != 0:
        # Write in place so the array keeps its (float) dtype.
        values[...] = (values - low) / delta
    return values

In [0]:
# NOTE(review): this seeds NumPy's global RNG only; whether it also fixes the
# Keras weight initialisation depends on the backend -- confirm.
np.random.seed(7)

# Training window. The slices below ask for the last 4 * n_steps = 112 steps,
# but NumPy clamps a slice to what the array holds; X is built with 100 steps
# per series, so the model sees the first 72 steps as input and the sales of
# the last 28 steps as target.
# (The guard is always true when the cell runs in a notebook kernel.)
if __name__ == '__main__':
    n_steps = 28
    n_features = 15          # size of the last axis of X
    n_out_seq_length = 28    # number of steps to predict
    num_y = 1                # values predicted per output step

    train_n, train_low, train_high = Normalize(X[:, -(n_steps*4):, :])
    X_train = train_n[:, -n_steps*4:-n_out_seq_length, :]
    print(X_train.shape)
    y = train_n[:, -n_out_seq_length:, 14]  # feature 14 is the sales column
    # reshape into [samples, timesteps, features]
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_features))
    y = y.reshape((y.shape[0], y.shape[1], 1))
    print(X_train.shape)

    # Encoder-decoder: two LSTM layers compress the input sequence into one
    # vector, RepeatVector copies it once per output step, and two further
    # LSTM layers plus a per-step Dense layer emit the output sequence.
    model = Sequential()
    # The input length is taken from X_train rather than hard-coded, so it
    # always matches the window produced by the slicing above.
    model.add(LSTM(128, activation='tanh', input_shape=(X_train.shape[1], n_features), return_sequences=True))
    model.add(LSTM(64, activation='tanh', return_sequences=False))
    model.add(RepeatVector(n_out_seq_length))
    model.add(LSTM(32, activation='tanh', return_sequences=True))
    model.add(LSTM(16, activation='tanh', return_sequences=True))
    model.add(Dropout(0.1))
    # num_y is the width of y per step; for multi-class targets (e.g.
    # translation) it can be larger and the layer would need activation='softmax'.
    model.add(TimeDistributed(Dense(num_y)))
    model.compile(optimizer='adam', loss='mse')

    model.fit(X_train, y, epochs=20, batch_size=1000)

In [0]:
# Forecast the 28 steps that follow the data in train_n.
# The encoder input is the most recent window of train_n (same length as the
# training input), so it includes the last 28 observed steps that served as
# the training target. Feeding X_train itself -- as this cell did before --
# only re-predicts that already observed target window.
_n_in_steps = X_train.shape[1]
x_input = train_n[:, -_n_in_steps:, :]
print(x_input.shape)
yhat = model.predict(x_input, verbose=0)

# Append the predictions to the normalised sales history (feature 14), giving
# an array of shape (series, input steps + predicted steps, 1). Shapes are
# taken from the arrays instead of the hard-coded 30490 / 72 / 44.
_sales_history = x_input[:, :, 14]
_yhat_2d = yhat.astype(np.float32).reshape(yhat.shape[0], -1)
x_input = np.concatenate((_sales_history, _yhat_2d), axis=1)[:, :, np.newaxis]
print(x_input.shape)

In [0]:
# Map the normalised values back to the original scale using the bounds
# returned by Normalize. FNoramlize updates x_input in place, so running this
# cell a second time would rescale the already rescaled values.
x_input = FNoramlize(x_input,train_low,train_high)
# Optional rounding to whole units (disabled):
# x_input = np.rint(x_input)

In [0]:
# Keep the last 28 steps of every series, label them F1..F28 and replace
# negative values with zero.
_all_steps = pd.DataFrame(x_input.reshape(x_input.shape[0], x_input.shape[1]))
_horizon = _all_steps.iloc[:, -28:]
_horizon.columns = [f'F{i}' for i in range(1, _horizon.shape[1] + 1)]
forecast = _horizon.mask(_horizon < 0, 0)
forecast.head()

In [0]:
# Ids as they appear in `sales`, plus a second set with 'validation' replaced
# by 'evaluation' in each id.
validation_ids = sales['id'].values
evaluation_ids = [i.replace('validation', 'evaluation') for i in validation_ids]

In [0]:
# All submission ids: the validation ids first, then the evaluation ids.
ids = np.concatenate([validation_ids, evaluation_ids])

In [0]:
# Assemble the submission table: one id column followed by the forecast columns.
predictions = pd.DataFrame(ids, columns=['id'])
# The same forecast rows are used for the validation ids and for the
# evaluation ids. The stacked copy gets its own name so that re-running this
# cell does not keep doubling `forecast`.
_forecast_both = pd.concat([forecast] * 2).reset_index(drop=True)
predictions = pd.concat([predictions, _forecast_both], axis=1)

In [0]:
# Write the submission table to submission.csv in the working directory,
# without the DataFrame index.
predictions.to_csv('submission.csv', index=False)  # generate the csv file