In [1]:
import sys
import os
from os import P_ALL
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
import seaborn as sns
import logging
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer
from scipy.stats import zscore
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import importlib

# **import scripts**

In [2]:
sys.path.append('/content/drive/MyDrive/store-sales-analysis/scripts')
import Feature_engineering as Fe
#import model_building as model
importlib.reload(Fe)

<module 'Feature_engineering' from '/content/drive/MyDrive/store-sales-analysis/scripts/Feature_engineering.py'>

In [4]:
data = pd.read_csv('/content/drive/MyDrive/store-sales-analysis/data/data.csv')
test = pd.read_csv('/content/drive/MyDrive/store-sales-analysis/data/test.csv')
data.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,HolidayPeriod
0,1,c,a,1270.0,9.0,2008.0,0,,,,5,2015-07-31,0.315269,555,1,1,0,1,Regular Day
1,1,c,a,1270.0,9.0,2008.0,0,,,,4,2015-07-30,0.286714,546,1,1,0,1,Regular Day
2,1,c,a,1270.0,9.0,2008.0,0,,,,3,2015-07-29,0.257572,523,1,1,0,1,Regular Day
3,1,c,a,1270.0,9.0,2008.0,0,,,,2,2015-07-28,0.285634,560,1,1,0,1,Regular Day
4,1,c,a,1270.0,9.0,2008.0,0,,,,1,2015-07-27,0.405853,612,1,1,0,1,Regular Day


# **Handling NaN values**
Promo2SinceWeek and Promo2SinceYear are missing, it might mean that the store never participated in Promo2.

Approach: Fill with 0 (Assume No Promotion)
Since missing values indicate no promotion, fill with 0:

PromoInterval is missing, it could mean the store has no recurring promotions.

In [5]:
data['Promo2SinceWeek'].fillna(0, inplace=True)
data['Promo2SinceYear'].fillna(0, inplace=True)
data['PromoInterval'].fillna('None', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Promo2SinceWeek'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Promo2SinceYear'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [6]:
data.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,HolidayPeriod
0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,5,2015-07-31,0.315269,555,1,1,0,1,Regular Day
1,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,4,2015-07-30,0.286714,546,1,1,0,1,Regular Day
2,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,3,2015-07-29,0.257572,523,1,1,0,1,Regular Day
3,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,2,2015-07-28,0.285634,560,1,1,0,1,Regular Day
4,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,1,2015-07-27,0.405853,612,1,1,0,1,Regular Day


# **Extract datetime features**

In [8]:
Fe.Extract_datetime_features(data)
data.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,...,Open,Promo,StateHoliday,SchoolHoliday,HolidayPeriod,Weekday,IsWeekend,DaysToNextHoliday,DaysSinceLastHoliday,MonthPart
0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,...,1,1,0,1,Regular Day,4,0,0,1,End
1,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,...,1,1,0,1,Regular Day,3,0,0,1,End
2,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,...,1,1,0,1,Regular Day,2,0,0,1,End
3,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,...,1,1,0,1,Regular Day,1,0,0,1,End
4,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,...,1,1,0,1,Regular Day,0,0,0,1,End


# **one Hotencoding**

In [11]:
# Convert all boolean columns to integer (0 or 1)
for col in data.select_dtypes(include=['bool']).columns:
    data[col] = data[col].astype(int)
catagorical_columns = ['StoreType', 'Assortment', 'PromoInterval']
data = Fe.one_hot_encode(data, catagorical_columns)
data.head()

Unnamed: 0,Store,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,DayOfWeek,Date,Sales,...,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",PromoInterval_None
0,1,1270.0,9.0,2008.0,0,0.0,0.0,5,2015-07-31,0.315269,...,0,1,0,1,0,0,0,0,0,1
1,1,1270.0,9.0,2008.0,0,0.0,0.0,4,2015-07-30,0.286714,...,0,1,0,1,0,0,0,0,0,1
2,1,1270.0,9.0,2008.0,0,0.0,0.0,3,2015-07-29,0.257572,...,0,1,0,1,0,0,0,0,0,1
3,1,1270.0,9.0,2008.0,0,0.0,0.0,2,2015-07-28,0.285634,...,0,1,0,1,0,0,0,0,0,1
4,1,1270.0,9.0,2008.0,0,0.0,0.0,1,2015-07-27,0.405853,...,0,1,0,1,0,0,0,0,0,1
