# 0- Import the packages

In [None]:
# data manipulation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)
import datetime
import time

# plotting
'''from matplotlib import style
style.use('ggplot')
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
'''
# modeling
import statsmodels.formula.api as smf
'''import statsmodels.api as sm
import statsmodels.tsa.arima_process as sta
import statsmodels.graphics.tsaplots as sgt
from statsmodels.tsa.stattools import acf, pacf
import statsmodels.tsa.statespace as sts
'''
from sklearn import preprocessing,metrics , model_selection
import warnings
warnings.filterwarnings(action='once')
import sys

In [None]:
# baseline solution 1 : mean (score : 1.69158)
'''
sales_train_validation, calendar, sell_prices, sample_submission = read_data('data')
sales_train_validation = reduce_mem_usage(sales_train_validation)

mean_sales=sales_train_validation.mean(axis=1,numeric_only=True)
mean=mean_sales.append(mean_sales).reset_index(drop=True)
for col in ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
          'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 
          'F20', 'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28']:
    sample_submission[col]=mean  
    
sample_submission.to_csv("submission_mean.csv",index=False)
'''
# baseline solution 2 : zeros before trimmed + mean (score : 1.20619)
'''
sales_train_validation, calendar, sell_prices, sample_submission = read_data('data')
sales_train_validation = reduce_mem_usage(sales_train_validation)

my_array=sales_train_validation.iloc[:,6:].values
sub=[np.mean(np.trim_zeros(row)) for row in my_array]
for col in ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
          'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 
          'F20', 'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28']:
    sample_submission[col]=sub+sub  
    
sample_submission.to_csv("submission_mean_wo_zero.csv",index=False)
'''

# 1- Reading data and data manipulation (melt) 


**WE WILL FIRST TAKE ONLY THE FIRST 10000 TIME SERIES TO EASE THE LOAD ON CPU/RAM**


In [2]:
def read_data(path):
    """Load the four input CSV files from a single directory.

    Parameters
    ----------
    path : str
        Directory that contains ``sales_train_validation_last_year.csv``,
        ``calendar.csv``, ``sell_prices.csv`` and ``sample_submission.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sales, calendar, sell_prices, sample_submission)``, in that order.
    """
    # The sales file is a trimmed copy that keeps only the last year of day columns:
    # sales_train_validation_last_year = sales_train_validation.drop(columns=['d_'+str(i) for i in range(1,1547)])
    df1 = pd.read_csv(path + '/sales_train_validation_last_year.csv')
    # The first calendar column is parsed as dates.
    df2 = pd.read_csv(path + '/calendar.csv', parse_dates=[0])
    df3 = pd.read_csv(path + '/sell_prices.csv')
    # Read from `path` like the other files instead of a hard-coded 'data/' directory.
    df4 = pd.read_csv(path + '/sample_submission.csv')
    return df1, df2, df3, df4

def reduce_memory(dfs, verbose=False):
    """Downcast the numeric columns of every DataFrame in ``dfs``.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to shrink; each one is passed to ``reduce_memory_usage``.
    verbose : bool, default False
        Forwarded to ``reduce_memory_usage`` so the per-frame memory report
        is printed when requested (it was previously accepted but ignored).

    Returns
    -------
    list of pandas.DataFrame
        The processed frames, in the same order as ``dfs``.
    """
    return [reduce_memory_usage(df, verbose=verbose) for df in dfs]
    
def reduce_memory_usage(df, verbose=False):
    """Downcast integer and float columns to the smallest dtype that fits.

    The frame is modified in place (columns are reassigned) and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``int`` and ``float`` columns are downcast.
    verbose : bool, default False
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after downcasting.
    """
    bytes_per_mb = 1024 ** 2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    # Both column selections are taken before any column is rewritten.
    downcast_plan = (
        ("integer", df.select_dtypes(include=["int"]).columns),
        ("float", df.select_dtypes(include=["float"]).columns),
    )
    for target_kind, column_names in downcast_plan:
        for name in column_names:
            df[name] = pd.to_numeric(df[name], downcast=target_kind)

    mem_after = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        saving_pct = 100 * (mem_before - mem_after) / mem_before
        print("Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(mem_after, saving_pct))
    return df

def data_manipulation(sales_train_validation, n_samples=10000, calendar_df=None):
    """Reshape the wide sales table to long format and attach calendar data.

    Parameters
    ----------
    sales_train_validation : pandas.DataFrame
        Wide table with the six identifier columns listed in ``id_vars``
        plus one column per day.
    n_samples : int, default 10000
        Number of leading rows (time series) to keep before melting.
    calendar_df : pandas.DataFrame, optional
        Calendar table with a ``d`` column to join on. When omitted, the
        module-level ``calendar`` frame is used, which is what the original
        implementation always did.

    Returns
    -------
    pandas.DataFrame
        One row per (series, day) with the day label in ``d``, the value in
        ``sales_count`` and the calendar columns left-joined on ``d``.
    """
    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    if calendar_df is None:
        # Backward-compatible fallback to the notebook-level global.
        calendar_df = calendar

    subset = sales_train_validation[:n_samples]
    # Every non-identifier column is a day column to unpivot (derived from
    # id_vars instead of relying on a positional [6:] offset).
    value_vars = [col for col in subset.columns if col not in id_vars]
    sales = pd.melt(subset, id_vars=id_vars, value_vars=value_vars,
                    var_name='d', value_name='sales_count')

    # join with calendar
    sales = pd.merge(sales, calendar_df, left_on='d', right_on='d', how="left")
    # Joining the selling prices and separating the validation / evaluation
    # test frames are not implemented here.
    return sales

def feature_engineering(sales):
    """Add lag, rolling and calendar features to the long sales table.

    The frame is modified in place (columns dropped and added, rows with
    missing values removed) and also returned.

    Parameters
    ----------
    sales : pandas.DataFrame
        Long-format table with at least ``id``, ``sales_count``, a datetime
        ``date`` column, the four event columns and the columns listed in
        ``useless_cols``. Lags and rolling windows follow row order within
        each ``id``, so rows are assumed to be chronological per series.

    Returns
    -------
    pandas.DataFrame
        The same frame with the engineered features, after ``dropna`` and
        numeric downcasting via ``reduce_memory_usage``.
    """
    # drop useless columns
    useless_cols = ["d", "wm_yr_wk", "wday", "weekday"]
    sales.drop(columns=useless_cols, inplace=True)

    # fill nan
    # Assign the result back: `sales[col].fillna(..., inplace=True)` is chained
    # assignment, which warns on recent pandas and has no effect under
    # Copy-on-Write.
    nan_cols = ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
    sales[nan_cols] = sales[nan_cols].fillna('Nothing')

    # feature engineering
    # source : https://kanoki.org/2019/09/09/how-to-shift-a-column-in-pandas/
    # => group by id so every feature is computed separately for each series.
    per_series = sales.groupby('id')['sales_count']

    # sales series features: sales_count of the same series N rows earlier
    lag_rows = {'lag_1d': 1, 'lag_2d': 2, 'lag_3d': 3,
                'lag_1w': 7, 'lag_2w': 14, 'lag_4w': 28}
    for name, rows_back in lag_rows.items():
        sales[name] = per_series.shift(rows_back)

    # NOTE(review): the rolling windows below end at, and include, the current
    # row's sales_count. If these columns are used to predict sales_count of
    # the same row, that leaks the target; confirm whether a shift is wanted.
    mean_windows = {'rolling_avg_1w': 7, 'rolling_avg_1m': 30,
                    'rolling_avg_6m': 180, 'rolling_avg_1y': 365}
    for name, window in mean_windows.items():
        sales[name] = per_series.transform(
            lambda x, w=window: x.rolling(w, min_periods=1).mean())

    std_windows = {'rolling_std_1w': 7, 'rolling_std_1m': 30}
    for name, window in std_windows.items():
        sales[name] = per_series.transform(
            lambda x, w=window: x.rolling(w, min_periods=1).std())

    # time features
    sales['dayofweek'] = sales.date.dt.dayofweek
    sales['dayofmonth'] = sales.date.dt.day

    # drop nan due to the lag and rolling functions
    initial_len = len(sales)
    sales.dropna(inplace=True)
    print('Dropped', initial_len - len(sales), 'rows out of', initial_len, 'initially')

    sales = reduce_memory_usage(sales, verbose=True)
    return sales

def model_fitting(y, feature_set, sales):
    """Fit an ordinary least squares model described by a formula string.

    Parameters
    ----------
    y : str
        Name of the response column in ``sales``.
    feature_set : iterable of str
        Right-hand-side terms; they are joined with ``+``.
    sales : pandas.DataFrame
        Data the formula is evaluated against.

    Returns
    -------
    The fitted results object returned by ``smf.ols(...).fit()``.
    """
    right_hand_side = '+'.join(feature_set)
    # Resulting formula has the shape "<y>~<term1>+<term2>+..."
    ols_spec = smf.ols(formula='~'.join([y, right_hand_side]), data=sales)
    return ols_spec.fit()
def model_eval(target, feature_set, sales):
    """Fit the regression on a random 67% split and report RMSE on both splits.

    Parameters
    ----------
    target : str
        Name of the response column in ``sales``.
    feature_set : iterable of str
        Formula terms passed on to ``model_fitting``.
    sales : pandas.DataFrame
        Frame holding the target and every column used by ``feature_set``.
        It is not modified.

    Returns
    -------
    tuple
        ``(model, train_score, test_score)`` where both scores are root mean
        squared errors; they are also printed.
    """
    # Split the whole frame once. For a given row count and random_state the
    # partition is the same as splitting X and y separately, and nothing is
    # written into a slice afterwards (the old `X_train[target] = y_train`
    # raised SettingWithCopyWarning).
    train_df, test_df = model_selection.train_test_split(
        sales, test_size=0.33, random_state=42)
    model = model_fitting(target, feature_set, train_df)

    train_score = np.sqrt(
        metrics.mean_squared_error(train_df[target], model.predict(train_df)))
    print("train :", train_score)
    test_score = np.sqrt(
        metrics.mean_squared_error(test_df[target], model.predict(test_df)))
    print("test :", test_score)
    return model, train_score, test_score

def predict(test, submission_df=None, output_path='submission.csv'):
    """Write a submission CSV from long-format predictions.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``id``, ``date`` and ``demand``. After pivoting there
        must be exactly 28 distinct dates, because the date columns are
        renamed to ``F1`` .. ``F28``.
    submission_df : pandas.DataFrame, optional
        Submission template with an ``id`` column. When omitted, the
        module-level ``submission`` frame is used, as before.
    output_path : str, default 'submission.csv'
        Destination of the CSV file.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    if submission_df is None:
        # Backward-compatible fallback to the notebook-level global.
        submission_df = submission

    # One row per id, one column per date.
    predictions = pd.pivot(test[['id', 'date', 'demand']],
                           index='id', columns='date', values='demand').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    # Rows whose id contains 'evaluation' are copied from the template unchanged.
    is_evaluation = submission_df['id'].str.contains('evaluation', regex=False)
    evaluation = submission_df[is_evaluation]

    # Inner merge: template ids that have predictions, in template order.
    validation = submission_df[['id']].merge(predictions, on='id')
    final = pd.concat([validation, evaluation])
    final.to_csv(output_path, index=False)

    

In [24]:
# Load the sample submission file on its own (read_data() also returns it as its fourth value).
df4 = pd.read_csv('data/sample_submission.csv')

In [None]:
###########################################################
# Step 1: read the CSV files from 'data' and downcast the numeric columns of
# the three large frames (sample_submission is left untouched).
start = time.time()
sales_train_validation, calendar, sell_prices, sample_submission = read_data('data')
sales_train_validation, calendar, sell_prices = reduce_memory([sales_train_validation, calendar, sell_prices])
end = time.time()
print("it has taken",end-start,"seconds for data loading ") # 210 s  - 9s
###########################################################
# Step 2: melt the wide sales table to long format and join the calendar.
# n_samples is the full row count, so no series is dropped here.
# `calendar` must exist as a module-level name at this point because
# data_manipulation reads it as a global.
start = time.time()
sales = data_manipulation(sales_train_validation,n_samples=len(sales_train_validation))    
end = time.time()
print("it has taken",end-start,"seconds for data manipulation") # 15 s - 25 s
###########################################################
# Step 3: add the lag / rolling / calendar features (this print does not
# name the step).
start = time.time()
sales = feature_engineering(sales)
end = time.time()
print("it has taken",end-start,"seconds") #  235 s

In [3]:
# Load a feature table from disk; this rebinds `sales`.
# NOTE(review): the file name ends in '.csv.csv' — confirm this matches the
# file on disk and is not a typo.
path= 'data'
sales = pd.read_csv(path+'/features_sales_half_last_year.csv.csv')

In [None]:
# Categorical predictors, each wrapped as a C(...) formula term.
categorical_features = [
    'C(' + column + ')'
    for column in (
        'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'month',
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
        'dayofweek',
    )
]

# Numeric predictors, used as plain formula terms.
# eventually move to categorical :  dayofmonth, year
numerical_features = [
    'snap_CA', 'snap_TX', 'snap_WI',
    'lag_1d', 'lag_2d', 'lag_3d', 'lag_1w', 'lag_2w', 'lag_4w',
    'rolling_avg_1w', 'rolling_avg_1m', 'rolling_avg_6m', 'rolling_avg_1y',
    'rolling_std_1w', 'rolling_std_1m',
    'dayofmonth', 'year',
]

# Every candidate term: categorical first, then numeric.
feature_set = categorical_features + numerical_features

# Time-related subset used for the experiments below.
# Left out on purpose: 'C(store_id)', 'C(dept_id)', 'C(month)', 'year'
time_features = [
    'lag_1d', 'lag_2d', 'lag_3d', 'lag_1w', 'lag_2w', 'lag_4w',
    'rolling_avg_1w', 'rolling_avg_1m', 'rolling_avg_6m',
    'rolling_std_1w', 'rolling_std_1m', 'rolling_avg_1y',
    'C(dayofweek)', 'dayofmonth',
    'snap_CA', 'snap_TX', 'snap_WI',
]
# Recorded results:
# R2 = 0.575 for time_features and n_samples=1000
# R2 = 0.572 for numerical_features and n_samples=1000
# R2 = 0.731 for time_features and sales[:int(len(sales)/10)] wo month, year, avg_1y,
# R2 = 0.731 for time_features and sales[:int(len(sales)/10)] wo month, year, avg_1y, dept_id, store_id
# R2 = 0.759 for time_features and sales[:int(len(sales)/5)] wo month, year, avg_1y
# R2 = 0.759 for time_features and sales[:int(len(sales)/5)] wo month, year, avg_1y, dept_id, store_id
# R2 = 0.770 for time_features and sales[:int(len(sales)/3)] wo month, year, avg_1y, dept_id, store_id
# R2 = 0.771 for time_features and sales[:int(len(sales)/3)] wo month, year, avg_1y, dept_id, store_id
# cpu not enough for 1 y
#model = model_fitting('sales_count', time_features, sales[:int(len(sales)/3)])
#model.summary()

In [11]:
# Empty score table with one column per recorded field of an experiment.
results = pd.DataFrame(
    columns=[
        'method',
        "train_score",
        "test_score",
        'feature_set',
        'datasets',
        'addi_details',
    ],
    data=[],
)

In [17]:
# Scratch example of a results row with placeholder values.
# NOTE(review): `i` is not assigned in this cell; it only works if `i` is
# still defined from an earlier run of a loop — confirm or remove.
row=['Linear Regression',12,12,['fd','fd'],100/i,'']


In [33]:
# Row count per value of cat_id in the feature table.
sales.cat_id.value_counts()

FOODS        2270460
HOUSEHOLD    1654260
HOBBIES       892700
Name: cat_id, dtype: int64

In [29]:
# Reduced term list for this run; the commented-out names are the lag and
# rolling terms that are deliberately left out.
time_features = [#lag_1d', 'lag_2d', 'lag_3d', 'lag_1w', 'lag_2w','lag_4w', 'rolling_avg_1w','rolling_avg_1m',
                 'rolling_avg_6m', 'rolling_std_1w', 'rolling_std_1m',#'rolling_avg_1y',#'C(store_id)',#'C(dept_id)', #'C(month)','year',
                 'C(dayofweek)','dayofmonth','snap_CA', 'snap_TX', 'snap_WI']

# Full list of categorical columns, kept for reference only. It used to be a
# bare, indented list literal here, which is an IndentationError.
# ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','month', 'event_name_1',
#  'event_type_1','event_name_2', 'event_type_2','dayofweek']

# Categorical terms without cat_id, wrapped as C(...) formula terms.
# They are defined here but not used by the loop below.
categorical_features = ['item_id', 'dept_id', 'store_id', 'state_id','month', 'event_name_1',
                        'event_type_1','event_name_2', 'event_type_2','dayofweek']
categorical_features = ['C('+feature+')' for feature in categorical_features]

# Fit on the first 1/10, 1/5 and 1/3 of the rows and record the scores.
for i in [10,5,3] :
    for feature_set in [time_features]:
        print(100/i,"% of the datasets")
        model,train_score,test_score = model_eval('sales_count', feature_set, sales[:int(len(sales)/i)])
        row=['Linear Regression',train_score,test_score,feature_set,100/i,'']
        # Append at the next free position. The former `loc` counter was never
        # initialised in the visible notebook and raised NameError on a fresh run.
        results.loc[len(results)]=row


10.0 % of the datasets


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


train : 2.044346243901079
test : 1.9901372066602743
20.0 % of the datasets


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


train : 2.1344193379556895
test : 2.072108830536073
33.333333333333336 % of the datasets


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


train : 2.157164052000654
test : 2.145317692173323


In [28]:
# Display the score table collected so far.
results

Unnamed: 0,method,train_score,test_score,feature_set,datasets,addi_details
0,Linear Regression,1.80557,1.78015,"[lag_1d, lag_2d, lag_3d, lag_1w, lag_2w, lag_4...",10.0,
1,Linear Regression,1.80557,1.78015,"[lag_1d, lag_2d, lag_3d, lag_1w, lag_2w, lag_4...",10.0,
2,Linear Regression,1.83034,1.76586,"[lag_1d, lag_2d, lag_3d, lag_1w, lag_2w, lag_4...",20.0,
3,Linear Regression,1.83034,1.76586,"[lag_1d, lag_2d, lag_3d, lag_1w, lag_2w, lag_4...",20.0,
4,Linear Regression,1.86322,1.81045,"[lag_1d, lag_2d, lag_3d, lag_1w, lag_2w, lag_4...",33.333333,
5,Linear Regression,1.86322,1.81045,"[lag_1d, lag_2d, lag_3d, lag_1w, lag_2w, lag_4...",33.333333,


In [None]:
#TEST
''' 
test=pd.DataFrame([2,3,4,5,7])
test['id']=['a','b','b','b','a']

print(test)
print("--------")
print(test.groupby(['id']).rolling(2,min_periods=1).mean())
print("--------")
print("with transform: ")
print(test.groupby(['id'])[0].transform(lambda x: x.rolling(2,min_periods=1).mean()))
'''  

In [None]:
# Display train_score; presumably the value left by the most recent model_eval call.
train_score

In [None]:
'''function needed for calculating interval of prediction
    fit = modal 
    exog = new dataframe'''
    
def transform_exog_to_model(fit, exog):
    """Convert new explanatory data into the array layout of a fitted model.

    Adapted from ``statsmodels.base.model.Results.predict()``.

    Parameters
    ----------
    fit : fitted results object
        Must expose ``model``; ``model.exog`` is read to decide whether a
        1-D input should become a single column.
    exog : DataFrame, array-like or None
        New data. ``None`` is returned unchanged.

    Returns
    -------
    numpy.ndarray or None
        An array with at least two dimensions, or ``None`` when ``exog`` is
        ``None``.
    """
    fitted_model = fit.model
    built_from_formula = hasattr(fitted_model, 'formula')

    if exog is None:
        return exog

    if built_from_formula:
        # Rebuild the design matrix with the design info stored on the model.
        from patsy import dmatrix
        exog = dmatrix(fitted_model.data.orig_exog.design_info.builder, exog)

    design = np.asarray(exog)
    single_regressor = fitted_model.exog.ndim == 1 or fitted_model.exog.shape[1] == 1
    if design.ndim == 1 and single_regressor:
        # A flat vector for a one-regressor model is a column, not a row.
        design = design[:, None]
    return np.atleast_2d(design)  # needed in count model shape[1]

In [None]:
# Build 500 hourly future timestamps, predict with `model`, compute prediction
# intervals and plot them next to recent observations.
# NOTE(review): `plt` and `train1` are not defined anywhere in the visible
# part of this file (the matplotlib import near the top sits inside a disabled
# string block). The columns used here (index_no, monthofyear, hour,
# TrafficVolume) also do not appear in the sales feature lists. This cell
# looks carried over from another analysis — confirm before running.
lasttime=pd.Timestamp('2017-12-22 16:00:00')

# Index numbers and matching hourly timestamps for the forecast horizon.
x_pred_index_no = range(40000,40500)
x_pred_time = [lasttime+i*pd.Timedelta('1:00:00') for i in range (1, len(x_pred_index_no)+1)]

newdf = pd.DataFrame(index=x_pred_time,columns=['index_no'], data= x_pred_index_no)

# Calendar features derived from the timestamp index; year is stored as an
# offset from 2012.
newdf['year']=newdf.index.year-2012
newdf['monthofyear']=newdf.index.month
newdf['dayofmonth']=newdf.index.day
newdf['dayofweek']=newdf.index.dayofweek
newdf['hour']=newdf.index.hour

# Point predictions, then standard error and lower / upper interval bounds.
y_pred = model.predict(newdf)
transformed_exog = transform_exog_to_model(model, newdf)
from statsmodels.sandbox.regression.predstd import wls_prediction_std
prstd, iv_l, iv_u = wls_prediction_std(model, transformed_exog, weights=[1])

# Observations from '2017-12' onwards, drawn as a line plus markers.
train1_partial=train1['2017-12':]
fig, ax = plt.subplots(figsize=(24, 6))
ax.plot(train1_partial['index_no'], train1_partial['TrafficVolume'])
ax.scatter(train1_partial['index_no'], train1_partial['TrafficVolume'])
fig.suptitle('Prediction Intervals')
ax.grid(True)
ax.plot(list(x_pred_index_no), y_pred, '-', color='red', linewidth=2)
# interval for observations
ax.fill_between(x_pred_index_no, iv_l, iv_u, color='#888888', alpha=0.3)
ax.axis('tight')
plt.show()