Sketch for prediction of SalesAmount
===

# Setup

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_validate, train_test_split
from sklearn.ensemble import RandomForestRegressor, BaggingClassifier, BaggingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from os import path

# Loading Data

In [3]:
### For now, let's work with the dataset prepared by Nishi
# Load the merged sales table, index it by SalesKey and drop the columns that
# are not used further on. Built as one chained expression (no inplace=True)
# so that re-running the cell always starts from the CSV on disk.
data = (
    pd.read_csv("../merged_data/data_ml_merged.csv")
    .set_index("SalesKey")
    .drop(columns=["CurrencyKey", "Status_x", "Status_y"])
)

# `infer_datetime_format` is deprecated since pandas 2.0 (consistent format
# inference is the default there), so it is no longer passed.
data["DateKey"] = pd.to_datetime(data["DateKey"])

print(data.shape)
data.head()

(3406089, 43)


Unnamed: 0_level_0,DateKey,channelKey,StoreKey,ProductKey,PromotionKey,CurrencyKey,UnitCost,UnitPrice,SalesQuantity,ReturnQuantity,...,GeographyKey,StoreType,Status_y,EmployeeCount,SellingAreaSize,GeographyType,ContinentName,CityName,StateProvinceName,RegionCountryName
SalesKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2007-01-02,1,209,956,10,1,91.05,198.0,8,0,...,738,Store,On,32,680,City,Europe,Baildon,England,United Kingdom
2,2007-02-12,4,308,766,2,1,10.15,19.9,4,0,...,693,Reseller,On,15,450,City,North America,Seattle,Washington,United States
3,2008-01-24,1,156,1175,11,1,209.03,410.0,9,0,...,449,Store,On,26,680,City,Europe,Cambridge,England,United Kingdom
4,2008-01-13,2,306,1429,10,1,132.9,289.0,8,0,...,586,Online,On,7,1000,City,Europe,Berlin,Berlin,Germany
5,2008-01-22,2,306,1133,10,1,144.52,436.2,24,0,...,586,Online,On,7,1000,City,Europe,Berlin,Berlin,Germany


In [8]:
# Distinct quarter labels present in the CalendarQuarterLabel column.
data["CalendarQuarterLabel"].unique()

array(['Q1', 'Q3', 'Q4', 'Q2'], dtype=object)

In [4]:
# Column inventory of the loaded frame.
# NOTE(review): the saved output below still lists CurrencyKey, Status_x and
# Status_y, which the loading cell drops -- re-run the notebook to refresh it.
data.columns

Index(['DateKey', 'channelKey', 'StoreKey', 'ProductKey', 'PromotionKey',
       'CurrencyKey', 'UnitCost', 'UnitPrice', 'SalesQuantity',
       'ReturnQuantity', 'ReturnAmount', 'DiscountQuantity', 'DiscountAmount',
       'TotalCost', 'SalesAmount', 'DiscountPercent', 'CalendarYear',
       'CalendarQuarterLabel', 'CalendarWeekLabel', 'IsWorkDay', 'IsHoliday',
       'MonthNumber', 'CalendarDayOfWeekNumber', 'ProductSubcategoryKey',
       'BrandName', 'ClassID', 'StyleID', 'ColorID', 'Weight',
       'WeightUnitMeasureID', 'StockTypeID', 'Status_x', 'ProductCategoryKey',
       'GeographyKey', 'StoreType', 'Status_y', 'EmployeeCount',
       'SellingAreaSize', 'GeographyType', 'ContinentName', 'CityName',
       'StateProvinceName', 'RegionCountryName'],
      dtype='object')

# Pre-processing

## Transform selected features (e.g., unify the Weight feature using the WeightUnitMeasureID information)

## Scaling and encoding of selected features

In [None]:
# Scaler for the scaling step of this section; it is only instantiated here,
# not fitted or applied to any columns in this cell.
preprocessor = StandardScaler()

## Seasonal data - aggregating by DateKey

In [12]:
### "full" feature set expects data to be preprocessed already, otherwise stick with simple feature set

# Column -> aggregation function applied when rows are collapsed into one period.
aggfeatures_simple = {"UnitCost": "mean", "UnitPrice": "mean", "ReturnQuantity": "sum",
                      "ReturnAmount": "sum", "DiscountQuantity": "sum", "DiscountAmount": "sum",
                      "TotalCost": "sum", "SalesQuantity": "sum", "SalesAmount": "sum"}
aggfeatures = {"UnitCost": "mean", "UnitPrice": "mean", "ReturnQuantity": "sum", "ReturnAmount": "sum",
               "DiscountQuantity": "sum", "DiscountAmount": "sum", "DiscountPercent": "mean", "IsWorkDay": "mean",
               "IsHoliday": "mean", "Weight": "sum", "EmployeeCount": "mean", "SellingAreaSize": "mean",
               "TotalCost": "sum", "SalesQuantity": "sum", "SalesAmount": "sum"}


def aggregate_sales(frame, keys, features=None):
    """Collapse ``frame`` to one row per unique combination of ``keys``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Row-level sales data containing the grouping and feature columns.
    keys : str or list of str
        Column name(s) to group by.
    features : dict, optional
        Mapping of column name to aggregation function. Defaults to
        ``aggfeatures_simple``; pass ``aggfeatures`` once the data has been
        preprocessed.

    Returns
    -------
    pandas.DataFrame
        Aggregated frame indexed by ``keys`` with one column per feature.
    """
    if features is None:
        features = aggfeatures_simple
    return frame.groupby(keys).agg(features)


# One aggregated frame per time granularity, all using the simple feature set.
data_daily = aggregate_sales(data, "DateKey")
data_weekly = aggregate_sales(data, ["CalendarYear", "CalendarWeekLabel"])
data_monthly = aggregate_sales(data, ["CalendarYear", "MonthNumber"])
data_quarterly = aggregate_sales(data, ["CalendarYear", "CalendarQuarterLabel"])
data_yearly = aggregate_sales(data, "CalendarYear")

# Report the shapes in the same order as before: daily -> yearly.
for aggregated in (data_daily, data_weekly, data_monthly, data_quarterly, data_yearly):
    print(aggregated.shape)

(1096, 9)
(159, 9)
(36, 9)
(12, 9)
(3, 9)
