This notebook imports the three source datasets, merges them into a single dataframe, moves the target column to the end, fills missing markdown values with 0, and writes the final dataframe to a CSV file so that it can be used in other notebooks.

In [27]:
import pandas as pd

In [28]:
#load the features dataset from the project's GitHub repository
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/features_data.csv'
features = pd.read_csv(filepath_or_buffer=url)
print(f'Shape of features data: {features.shape}')
features.head(n=5)

Shape of features data: (8190, 12)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False
1,1,12/02/2010,38.51,2.548,,,,,,211.24217,8.106,True
2,1,19/02/2010,39.93,2.514,,,,,,211.289143,8.106,False
3,1,26/02/2010,46.63,2.561,,,,,,211.319643,8.106,False
4,1,05/03/2010,46.5,2.625,,,,,,211.350143,8.106,False


In [29]:
#load the sales dataset from the project's GitHub repository
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/sales_data.csv'
sales = pd.read_csv(filepath_or_buffer=url)
print(f'Shape of sales data: {sales.shape}')
sales.head(n=5)

Shape of sales data: (421570, 5)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,05/02/2010,24924.5,False
1,1,1,12/02/2010,46039.49,True
2,1,1,19/02/2010,41595.55,False
3,1,1,26/02/2010,19403.54,False
4,1,1,05/03/2010,21827.9,False


In [30]:
#load the stores dataset from the project's GitHub repository
url = 'https://raw.githubusercontent.com/jamesdinardo/Retail-Forecasting/master/stores_data.csv'
stores = pd.read_csv(filepath_or_buffer=url)
print(f'Shape of stores data: {stores.shape}')
stores.head(n=5)

Shape of stores data: (45, 3)


Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [31]:
#merge the three dataframes into one
#the (Store, Date, IsHoliday) key is expected to be unique in features, so each
#features row may match many sales rows; validate makes pandas raise a
#MergeError instead of silently duplicating rows if that expectation is broken
df = pd.merge(features, sales, how='inner', on=['Store', 'Date', 'IsHoliday'], validate='one_to_many')
#an inner join silently drops sales rows with no matching features row, so check none were lost
assert len(df) == len(sales), 'features merge lost {} sales rows'.format(len(sales) - len(df))

#Store is expected to be unique in stores, so this is a many-to-one lookup
df = pd.merge(df, stores, how='inner', on='Store', validate='many_to_one')
assert len(df) == len(sales), 'stores merge lost {} sales rows'.format(len(sales) - len(df))

print('Shape of final dataframe: {}'.format(df.shape))
df.head()

Shape of final dataframe: (421570, 16)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Type,Size
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,1,24924.5,A,151315
1,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,2,50605.27,A,151315
2,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,3,13740.12,A,151315
3,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,4,39954.04,A,151315
4,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,5,32229.38,A,151315


In [32]:
#get basic info about df: column names, non-null counts, dtypes and memory usage
#(the non-null counts show which columns still contain missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421570 entries, 0 to 421569
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Date          421570 non-null  object 
 2   Temperature   421570 non-null  float64
 3   Fuel_Price    421570 non-null  float64
 4   MarkDown1     150681 non-null  float64
 5   MarkDown2     111248 non-null  float64
 6   MarkDown3     137091 non-null  float64
 7   MarkDown4     134967 non-null  float64
 8   MarkDown5     151432 non-null  float64
 9   CPI           421570 non-null  float64
 10  Unemployment  421570 non-null  float64
 11  IsHoliday     421570 non-null  bool   
 12  Dept          421570 non-null  int64  
 13  Weekly_Sales  421570 non-null  float64
 14  Type          421570 non-null  object 
 15  Size          421570 non-null  int64  
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 51.9+ MB


In [33]:
#move the target column "Weekly_Sales" to the end
#the new order is built from column names rather than hard-coded positions, so
#the cell keeps working if upstream columns change and gives the same result
#when it is re-run on an already reordered dataframe
target = 'Weekly_Sales'
columns = [name for name in df.columns if name != target] + [target]
print(columns)

#selecting with the reordered list returns a new dataframe in that column order
df = df[columns]

['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday', 'Dept', 'Type', 'Size', 'Weekly_Sales']


In [34]:
#replace missing values in the five markdown columns with 0
markdown_columns = ['MarkDown{}'.format(number) for number in range(1, 6)]
df = df.fillna(value=dict.fromkeys(markdown_columns, 0))

In [35]:
#preview the first five rows to confirm the markdown columns no longer contain NaN
df.head(n=5)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales
0,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,1,A,151315,24924.5
1,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,2,A,151315,50605.27
2,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,3,A,151315,13740.12
3,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,4,A,151315,39954.04
4,1,05/02/2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,5,A,151315,32229.38


In [36]:
#preview the last five rows of the dataframe
df.tail(n=5)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Type,Size,Weekly_Sales
421565,45,26/10/2012,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,False,93,B,118221,2487.8
421566,45,26/10/2012,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,False,94,B,118221,5203.31
421567,45,26/10/2012,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,False,95,B,118221,56017.47
421568,45,26/10/2012,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,False,97,B,118221,6817.48
421569,45,26/10/2012,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,False,98,B,118221,1076.8


In [37]:
#save the finished dataframe to csv (without the index) so that other notebooks can load it
output_path = 'final_df.csv'
df.to_csv(output_path, index=False)