### This notebook is my final submission for the hackathon

https://datahack.analyticsvidhya.com/contest/women-in-the-loop-a-data-science-hackathon-by-bain/

My public and private leaderboard scores are 119.0311429264 (43rd place) and 126.2250155814 (30th place), respectively.

In [None]:
import pandas as pd
import numpy as np
import warnings

import tensorflow as tf
import keras
from keras.layers import LeakyReLU
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, Flatten
from keras.layers import CuDNNLSTM

from sklearn.model_selection import train_test_split


In [None]:
# Load the training and test sets from the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test_QkPvNLx.csv'))

In [None]:
# Display (rows, columns) of the raw training data
train_df.shape

In [None]:
# Peek at the first three training rows
train_df.head(3)

In [None]:
# Display (rows, columns) of the raw test data
test_df.shape

In [None]:
# Replace missing Competition_Metric values with 0
train_df['Competition_Metric'] = train_df['Competition_Metric'].fillna(0)

In [None]:
# df_course_1 = train_df.loc[train_df.Course_ID ==1]

In [None]:
def create_lag_features(df, sales_cols, columns_list, lag_days):
    """Return the base columns of ``df`` plus lagged copies of ``sales_cols``.

    For every lag ``i`` from ``lag_days`` down to 1, each column in
    ``sales_cols`` is shifted down by ``i`` rows and added as
    ``<column>_t_<i>``. Rows are shifted in their existing order, so the
    caller is responsible for passing a frame that is already in time order
    (and that covers a single series).

    Args:
        df: Source frame containing ``columns_list`` and ``sales_cols``.
        sales_cols: Names of the columns to lag.
        columns_list: Names of the columns to carry over unchanged. The list
            is not modified.
        lag_days: Number of lags to generate.

    Returns:
        A new frame with the same index as ``df`` whose columns are
        ``columns_list`` followed by the lag columns, oldest lag first. The
        first ``i`` rows of each ``_t_<i>`` column are NaN. When ``lag_days``
        is not positive, a plain copy of ``df`` is returned.
    """
    if lag_days <= 0:
        # No lag is generated in this case; keep returning a full copy of the
        # input, exactly as the loop-based version did.
        return df.copy()

    # Collect every piece first and concatenate once: concatenating inside the
    # loop re-copies the ever-growing frame on each iteration.
    pieces = [df[columns_list]]
    for lag in range(lag_days, 0, -1):
        shifted = df[sales_cols].shift(lag)
        shifted.columns = [sales_col + '_t_' + str(lag) for sales_col in sales_cols]
        pieces.append(shifted)
    return pd.concat(pieces, axis=1)

In [None]:
original_column_list = ['ID', 'Day_No', 'Course_ID', 'Course_Domain', 'Course_Type', 'Public_Holiday', 'Competition_Metric']
sales_cols = ['Sales', 'User_Traffic', 'Long_Promotion','Short_Promotion']
lag_days = 60
# Create lag features for each unique course. Lags are computed per course so
# that one course's history never leaks into another's. The per-course frames
# are gathered in a list and concatenated once: DataFrame.append was removed
# in pandas 2.0 and copied the accumulated frame on every call.
course_lag_frames = [
    create_lag_features(train_df.loc[train_df.Course_ID == course_id], sales_cols,
                        original_column_list.copy(), lag_days)
    for course_id in train_df.Course_ID.unique()
]
train_lag_df = pd.concat(course_lag_frames) if course_lag_frames else pd.DataFrame()
print("Created lag features for " + str(lag_days) + " days")

In [None]:
# Attach the same-day user traffic and sales to the lag-feature frame.
# Values are matched on the row index shared with train_df.
train_lag_df = train_lag_df.assign(
    User_Traffic=train_df['User_Traffic'],
    Sales_Today=train_df['Sales'],
)

In [None]:
# Keep only the rows that have no missing value in any column
train_lag_df = train_lag_df[train_lag_df.notna().all(axis=1)]

In [None]:
# Show two random rows of the lag-feature frame as a sanity check
train_lag_df.sample(2)

In [None]:
# Display (rows, columns) of the lag-feature frame after dropping nulls
train_lag_df.shape

In [None]:
# Pair every lag-feature row with the values observed 60 rows later in the same
# course: the future promotion/holiday/competition covariates become extra
# features and the future Sales becomes the target. Rows whose target is
# missing (the last 60 rows of each course) cannot be trained on; they are kept
# as the "derived test" rows instead.
# NOTE(review): presumably the test period directly follows the training
# period, so these rows are the ones the real test set asks about - confirm.
train_target_columns = ['Short_Promotion', 'Public_Holiday', 'Long_Promotion', 'Competition_Metric', 'Sales']
train_target_append_columns = [col+'_t_+60' for col in train_target_columns if 'Sales' not in col]
derived_test_parts = []
actual_training_parts = []
for course_id in train_df.Course_ID.unique():
    train_lag_course_df = train_lag_df.loc[train_lag_df.Course_ID == course_id]
    train_course_df = train_df[train_df.Course_ID == course_id]
    # shift(-60) moves each value up 60 rows, i.e. row t receives row t+60
    train_target_df = train_course_df[train_target_columns].shift(-60)
    # 'Sales' is the last entry of train_target_columns, so it stays last here
    train_target_df.columns = train_target_append_columns + ['Sales']
    course_combined_df = pd.concat([train_lag_course_df, train_target_df], axis=1)
    derived_test_parts.append(course_combined_df[course_combined_df['Sales'].isna()])
    actual_training_parts.append(course_combined_df.dropna())
# One concat per output instead of DataFrame.append in the loop (removed in
# pandas 2.0); verify_integrity still rejects duplicated index labels.
derived_test_df = (pd.concat(derived_test_parts, verify_integrity=True)
                   if derived_test_parts else pd.DataFrame())
actual_training_df = (pd.concat(actual_training_parts, verify_integrity=True)
                      if actual_training_parts else pd.DataFrame())
print("Created target for train data and derived test data")
print("Actual test data shape ", test_df.shape)
print("Derived test data from train shape ", derived_test_df.shape)

In [None]:
print("Checking whether course ID in derived test is matching actual test")
# Order both frames by course and day, compare the course IDs position by
# position, and count how many positions agree or disagree
derived_course_ids = derived_test_df.sort_values(by=['Course_ID','Day_No'])['Course_ID'].reset_index(drop=True)
actual_course_ids = test_df.sort_values(by=['Course_ID','Day_No'])['Course_ID'].reset_index(drop=True)
(derived_course_ids == actual_course_ids).value_counts()

In [None]:
# derived_test_df_copy = derived_test_df.copy()
# derived_test_df = derived_test_df_copy.copy()

In [None]:

derived_test_columns = ['Short_Promotion', 'Public_Holiday', 'Long_Promotion', 'Competition_Metric']
# Put both frames in the same (Course_ID, Day_No) order and give both a fresh
# 0..n-1 index. Column assignment aligns on index labels, so without resetting
# the index of the sorted test frame the values would be written back in the
# test file's original row order rather than in sorted order.
derived_test_df = derived_test_df.sort_values(by=['Course_ID','Day_No']).reset_index(drop=True)
sorted_test_df = test_df.sort_values(by=['Course_ID','Day_No']).reset_index(drop=True)
for col in derived_test_columns:
    derived_test_df[col+'_t_+60'] = sorted_test_df[col]
print("Derived test data is prepared")

In [None]:
# Peek at the derived test rows after the test covariates were attached
derived_test_df.head()

In [None]:
# actual_training_df.reset_index(drop = True).to_csv('actual_train_new.csv',index=False)
# derived_test_df.reset_index(drop= True).to_csv('derived_test_new.csv',index=False)

In [None]:
# Work on copies of the training and derived-test frames with a fresh 0..n-1 index
model_train_df, model_test_df = (
    frame.reset_index(drop=True) for frame in (actual_training_df, derived_test_df)
)

In [None]:
def overall_preprocessing(df, is_test=False):
    """Turn a lag-feature frame into model inputs (and a target for training).

    Steps, in order:
      * fill missing ``Competition_Metric`` / ``Competition_Metric_t_+60`` with 0
      * one-hot encode ``Course_Type`` and ``Course_Domain``
      * divide every column whose name contains ``User_Traffic`` by 100
      * replace ``Day_No`` with ``Day_No mod 365``
      * drop ``ID``, ``Course_Type`` and ``Course_Domain``
      * remove ``Sales`` from the features

    The dummy columns are derived from the categories present in ``df`` only,
    so two frames with different category sets produce different columns.

    Args:
        df: Frame holding the columns listed above. It is not modified.
        is_test: When true, ``Sales`` is discarded and only features are
            returned.

    Returns:
        ``features`` when ``is_test`` is true, otherwise
        ``(features, target)`` where ``target`` is a one-column ``Sales`` frame.
    """
    # Work on a copy: the steps below assign into the frame, and running them
    # on the caller's object would, for example, rescale traffic a second time
    # if the same frame were ever preprocessed again.
    df = df.copy()
    df['Competition_Metric'] = df['Competition_Metric'].fillna(0)
    df['Competition_Metric_t_+60'] = df['Competition_Metric_t_+60'].fillna(0)
    # Pin the dummy dtype. pandas >= 2.0 defaults to bool, which combined with
    # the float columns makes ``.values`` an object array; uint8 is the
    # pre-2.0 default and keeps the frame purely numeric.
    course_type = pd.get_dummies(df['Course_Type'], dtype='uint8')
    course_domain = pd.get_dummies(df['Course_Domain'], dtype='uint8')

    user_traffic_columns = [col for col in df.columns if 'User_Traffic' in col]

    df[user_traffic_columns] = df[user_traffic_columns]/100
    df_processed = pd.concat([df, course_type, course_domain], axis=1)
    df_processed['Day_No'] = df_processed['Day_No'].mod(365)
    df_processed = df_processed.drop(columns = ['ID','Course_Type','Course_Domain'])
    if is_test:
        del df_processed['Sales']
        print("Test shape: " + str(df_processed.shape))
        return df_processed

    target = df_processed[['Sales']]
    del df_processed['Sales']
    print("Train shape: "+str(df_processed.shape))
    return df_processed, target

In [None]:
# model_encoded_train_df, model_target_df = overall_preprocessing(model_train_df)
# model_encoded_test_df = overall_preprocessing(model_test_df, True)
# model_encoded_train_df.head(2)
# model_encoded_test_df.head(2)
# model_encoded_test_df.isnull().any().value_counts()

In [None]:
# Create train, test, cross validation splits.
# For every course the last 60 rows form the holdout set and all earlier rows
# form the train/cross-validation pool. The per-course slices are collected in
# lists and concatenated once, because DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on each call.
train_cross_val_parts = []
holdout_parts = []
for course_id in actual_training_df.Course_ID.unique():
    course_rows = actual_training_df.loc[actual_training_df.Course_ID == course_id]
    train_cross_val_parts.append(course_rows[:-60])
    holdout_parts.append(course_rows[-60:])
train_cross_val_df = (pd.concat(train_cross_val_parts, ignore_index=True)
                      if train_cross_val_parts else pd.DataFrame())
holdout_df = (pd.concat(holdout_parts, ignore_index=True)
              if holdout_parts else pd.DataFrame())

model_train_df = actual_training_df.reset_index(drop = True)
model_train_cross_val_df = train_cross_val_df.reset_index(drop=True)
model_holdout_df = holdout_df.reset_index(drop=True)
model_test_df = derived_test_df.reset_index(drop= True)

# Encode each split separately; every call prints the resulting feature shape
model_encoded_train_df, model_target_df = overall_preprocessing(model_train_df)
model_encoded_train_cross_val_df, model_target_train_cross_val_df = overall_preprocessing(model_train_cross_val_df)
model_encoded_holdout_df, model_target_holdout_df = overall_preprocessing(model_holdout_df)
model_encoded_test_df = overall_preprocessing(model_test_df, True)

X = model_encoded_train_cross_val_df
y = model_target_train_cross_val_df.values


In [None]:
# Persist the encoded features together with their targets:
# first the full training set, then the train/cross-validation pool
full_train_export = pd.concat([model_encoded_train_df, model_target_df], axis=1)
full_train_export.to_csv('model_encoded_train_df.csv', index=False)

train_cross_val_export = pd.concat([model_encoded_train_cross_val_df, model_target_train_cross_val_df], axis=1)
train_cross_val_export.to_csv('model_encoded_train_cross_val_df.csv', index=False)

In [None]:
# Peek at the first holdout target values
model_target_holdout_df.head()

In [None]:
# pd.concat([model_encoded_holdout_df, model_target_holdout_df], axis=1).to_csv('model_encoded_holdout_df.csv', index=False)
# model_encoded_test_df.to_csv('model_encoded_test_df.csv', index=False)

In [None]:
# Holdout features stay a DataFrame; the targets become a NumPy array
X_holdout = model_encoded_holdout_df
y_holdout = model_target_holdout_df.values
print('model_encoded_train_cross_val set shape', model_encoded_train_cross_val_df.shape)
print('model_encoded_holdout set shape', model_encoded_holdout_df.shape)
X_holdout.head()

In [None]:
import gc

# Drop the names of frames that are no longer needed, then run the garbage
# collector. X_holdout / y_holdout were bound earlier, so the holdout data
# itself stays reachable through those names.
del (
    train_df, actual_training_df, train_cross_val_df, holdout_df, derived_test_df,
    model_train_cross_val_df,
    model_holdout_df,
    model_train_df,
    model_encoded_holdout_df, model_target_holdout_df,
)
gc.collect()

In [None]:
# Count how many feature columns do / do not contain a null value
X.isnull().any().value_counts()

### LSTM for Time Series Forecasting

* The LSTM model sees the input data as a sequence, so it can learn patterns from sequential data (assuming such patterns exist) better than the other models, especially patterns from long sequences.
* Input shape **[samples, timesteps, features]**.

In [None]:
# The model inputs for the test period come from the derived test frame, which
# is ordered by (Course_ID, Day_No). Take the submission IDs in that same order
# with a fresh 0..n-1 index so each ID lines up with its prediction by position
# even when the test file itself is stored in a different row order.
df_pred_ID = test_df.sort_values(by=['Course_ID','Day_No'])['ID'].reset_index(drop=True)
#save_submission(df_pred_ID, model_mlp.predict(model_encoded_test_df).flatten(),'MLP_Time_series_70_lag_traffic_Sales_feature_added_leaky_relu')


In [None]:
def save_submission(df_pred_ID, prediction, filename):
    """Write a submission CSV pairing each ID with its predicted ``Sales``.

    IDs and predictions are matched by position: the first ID goes with the
    first prediction, and so on.

    Args:
        df_pred_ID: pandas Series or DataFrame holding the submission IDs.
        prediction: Iterable of predicted values, in the same order as the IDs.
        filename: Output file name without extension; the file is written to
            ``./<filename>.csv`` without the index column.
    """
    # concat(axis=1) aligns on index labels. Give the IDs a fresh 0..n-1 index
    # so they match the freshly built prediction frame row for row; IDs that
    # were filtered or sorted beforehand would otherwise yield NaN-padded rows.
    ids = df_pred_ID.reset_index(drop=True)
    sales = pd.DataFrame({'Sales': list(prediction)})
    result = pd.concat([ids, sales], axis=1)
    result.to_csv('./' + filename + '.csv', index=False)

In [None]:
from keras.layers import CuDNNLSTM

In [None]:
# Shared hyper-parameters. The fitting cells assign their own epochs / batch
# values before calling fit, so the two set here are only initial defaults.
leaky_relu_alpha = 0.01
lr = 0.0002
epochs, batch = 10, 64
adam = optimizers.Adam(lr)

In [None]:
# Create train, validation model dataframes
X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=0.3, random_state=5)
print('Train set shape', X_train.shape)
print('Validation set shape', X_valid.shape)
X_train_series = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
X_valid_series = X_valid.values.reshape((X_valid.shape[0], X_valid.shape[1], 1))
print('Train series shape', X_train_series.shape)
print('Validation series shape', X_valid_series.shape)
X_series = model_encoded_train_cross_val_df.values.reshape((model_encoded_train_cross_val_df.shape[0],
                                                 model_encoded_train_cross_val_df.shape[1], 1))
X_full_train = model_encoded_train_df.values.reshape((model_encoded_train_df.shape[0],
                                                 model_encoded_train_df.shape[1], 1))
test_2d = model_encoded_test_df.values.reshape((model_encoded_test_df.shape[0], model_encoded_test_df.shape[1], 1))

In [None]:
# Init model layers: one recurrent layer followed by three dense layers, each
# followed by a LeakyReLU, and a single-unit linear output
model_lstm = Sequential()
model_lstm.add(CuDNNLSTM(512, input_shape=X_train_series.shape[1:]))
model_lstm.add(LeakyReLU(alpha=leaky_relu_alpha))
for dense_units in (512, 128, 32):
    model_lstm.add(Dense(dense_units, kernel_initializer='normal'))
    model_lstm.add(LeakyReLU(alpha=leaky_relu_alpha))
model_lstm.add(Dense(1))

# Optimise mean squared error and report mean squared log error alongside it
model_lstm.compile(loss='mse', optimizer=adam, metrics=['msle'])
model_lstm.summary()

In [None]:
# Fit the model on the training split, monitoring the validation split
epochs = 35
batch = 60
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)
callback = [early_stopping]
lstm_history = model_lstm.fit(
    X_train_series,
    Y_train,
    batch_size=batch,
    epochs=epochs,
    verbose=1,
    callbacks=callback,
    validation_data=(X_valid_series, Y_valid),
)

In [None]:
# Score the current model on the holdout set (evaluation only, no fitting)
holdout_rows, holdout_cols = X_holdout.shape
X_holdout_series = X_holdout.values.reshape((holdout_rows, holdout_cols, 1))
model_lstm.evaluate(X_holdout_series, y_holdout)

In [None]:
# Continue training on the whole train/cross-validation pool (the training and
# validation splits together), using the holdout set for validation
epochs = 10
batch = 128
lstm_history = model_lstm.fit(
    X_series,
    model_target_train_cross_val_df.values,
    batch_size=batch,
    epochs=epochs,
    verbose=1,
    callbacks=callback,
    validation_data=(X_holdout_series, y_holdout),
)

In [None]:
# Get holdout score again, now that the model has also been trained on the
# full train/cross-validation pool. X_holdout_series is reused from the earlier
# evaluation cell, so the reshape below stays commented out.
#X_holdout_series = X_holdout.values.reshape((X_holdout.shape[0], X_holdout.shape[1], 1))
model_lstm.evaluate(X_holdout_series,y_holdout)

In [None]:
# Fit on the full data available before predicting.
# NOTE(review): the full training frame appears to include the holdout rows, so
# the validation loss reported here is presumably not an out-of-sample
# estimate - confirm.
epochs = 2
batch = 256
lstm_history = model_lstm.fit(
    X_full_train,
    model_target_df.values,
    batch_size=batch,
    epochs=epochs,
    verbose=1,
    callbacks=callback,
    validation_data=(X_holdout_series, y_holdout),
)

In [None]:
# Predict the test rows, flatten the predictions to 1-D and write the submission file
test_predictions = model_lstm.predict(test_2d).flatten()
save_submission(df_pred_ID, test_predictions, 'LSTM_2d_Time_series_60days_lag_with_long_shhort_promotion')