# First Effort: Preprocessing Data

# Setup

##NOTE: Add outliers to preprocessing steps

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from os import path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

# Loading data

In [2]:
ind_ratio = 0.8

ind_size = int(3406088 * ind_ratio)
sampleindices = np.random.choice(np.arange(1,3406088), (ind_size), replace = False)
sampleindices

array([1458800, 1619043, 2884104, ..., 3355393,  581113, 1806215],
      shape=(2724870,))

In [3]:
df = pd.read_csv("../processed_data/data_ml_merged_v2.csv", skiprows=sampleindices, header = 0)

df.head()

Unnamed: 0,SalesKey,DateKey,channelKey,StoreKey,ProductKey,PromotionKey,UnitCost,UnitPrice,SalesQuantity,ReturnQuantity,...,ProductCategoryKey,GeographyKey,StoreType,EmployeeCount,SellingAreaSize,GeographyType,ContinentName,CityName,StateProvinceName,RegionCountryName
0,6,2007-07-02,3,200,2365,3,183.94,399.99,36,0,...,8.0,894,Catalog,120,462,City,North America,North Harford,Maryland,United States
1,11,2007-02-04,2,199,1825,2,16.31,32.0,4,0,...,,800,Online,325,500,City,North America,Bethesda,Maryland,United States
2,13,2007-07-25,1,171,739,3,78.19,236.0,12,0,...,3.0,932,Store,25,700,City,North America,South Portland,Maine,United States
3,25,2007-06-06,1,108,2351,1,183.94,399.99,10,0,...,8.0,860,Store,17,460,City,North America,Lakeland,Florida,United States
4,26,2007-12-10,1,144,2226,4,61.17,119.99,6,0,...,8.0,882,Store,47,1125,City,North America,Morristown,New Jersey,United States


In [4]:
df.describe()

Unnamed: 0,SalesKey,channelKey,StoreKey,ProductKey,PromotionKey,UnitCost,UnitPrice,SalesQuantity,ReturnQuantity,ReturnAmount,...,ProductSubcategoryKey,ClassID,StyleID,ColorID,Weight,StockTypeID,ProductCategoryKey,GeographyKey,EmployeeCount,SellingAreaSize
count,681219.0,681219.0,681219.0,681219.0,681219.0,681219.0,681219.0,681219.0,681219.0,681219.0,...,613428.0,613428.0,613428.0,613428.0,613428.0,613428.0,613428.0,681219.0,681219.0,681219.0
mean,1702623.0,1.659493,198.958447,1274.392988,8.470335,137.106687,320.849081,15.677155,0.144205,45.690282,...,28.92866,1.751048,2.232983,6.675667,21.925142,1.52866,4.964997,773.022218,55.730131,9477.941032
std,982869.8,1.029981,94.854407,712.142735,8.227628,167.842974,429.783323,33.981653,0.362036,201.39273,...,12.549658,0.654001,1.574596,3.689715,37.501961,0.706144,2.291755,114.094442,79.865276,27359.234829
min,6.0,1.0,1.0,1.0,1.0,0.48,0.95,2.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.17,1.0,1.0,424.0,7.0,450.0
25%,851499.0,1.0,126.0,663.0,1.0,32.19,68.0,9.0,0.0,0.0,...,20.0,1.0,1.0,2.0,4.0,1.0,3.0,710.0,17.0,480.0
50%,1702292.0,1.0,200.0,1267.0,4.0,84.12,190.0,10.0,0.0,0.0,...,28.0,2.0,1.0,7.0,6.9,1.0,4.0,790.0,25.0,500.0
75%,2552612.0,2.0,300.0,1902.0,14.0,166.2,369.0,13.0,0.0,0.0,...,42.0,2.0,3.0,8.0,23.2,2.0,8.0,871.0,47.0,700.0
max,3406089.0,4.0,310.0,2517.0,28.0,1060.22,3199.99,2600.0,5.0,6399.98,...,48.0,3.0,16.0,16.0,239.0,3.0,8.0,952.0,325.0,93800.0


In [5]:
df.shape

(681219, 41)

In [6]:
df.isna().sum()

SalesKey                       0
DateKey                        0
channelKey                     0
StoreKey                       0
ProductKey                     0
PromotionKey                   0
UnitCost                       0
UnitPrice                      0
SalesQuantity                  0
ReturnQuantity                 0
ReturnAmount                   0
DiscountQuantity               0
DiscountAmount                 0
TotalCost                      0
SalesAmount                    0
DiscountPercent                0
CalendarYear                   0
CalendarQuarterLabel           0
CalendarWeekLabel              0
IsWorkDay                      0
IsHoliday                      0
MonthNumber                    0
CalendarDayOfWeekNumber        0
ProductSubcategoryKey      67791
BrandName                  67791
ClassID                    67791
StyleID                    67791
ColorID                    67791
Weight                     67791
WeightUnitMeasureID        67791
StockTypeI

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681219 entries, 0 to 681218
Data columns (total 41 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   SalesKey                 681219 non-null  int64  
 1   DateKey                  681219 non-null  object 
 2   channelKey               681219 non-null  int64  
 3   StoreKey                 681219 non-null  int64  
 4   ProductKey               681219 non-null  int64  
 5   PromotionKey             681219 non-null  int64  
 6   UnitCost                 681219 non-null  float64
 7   UnitPrice                681219 non-null  float64
 8   SalesQuantity            681219 non-null  int64  
 9   ReturnQuantity           681219 non-null  int64  
 10  ReturnAmount             681219 non-null  float64
 11  DiscountQuantity         681219 non-null  int64  
 12  DiscountAmount           681219 non-null  float64
 13  TotalCost                681219 non-null  float64
 14  Sale

In [8]:
df.isnull().sum()

SalesKey                       0
DateKey                        0
channelKey                     0
StoreKey                       0
ProductKey                     0
PromotionKey                   0
UnitCost                       0
UnitPrice                      0
SalesQuantity                  0
ReturnQuantity                 0
ReturnAmount                   0
DiscountQuantity               0
DiscountAmount                 0
TotalCost                      0
SalesAmount                    0
DiscountPercent                0
CalendarYear                   0
CalendarQuarterLabel           0
CalendarWeekLabel              0
IsWorkDay                      0
IsHoliday                      0
MonthNumber                    0
CalendarDayOfWeekNumber        0
ProductSubcategoryKey      67791
BrandName                  67791
ClassID                    67791
StyleID                    67791
ColorID                    67791
Weight                     67791
WeightUnitMeasureID        67791
StockTypeI

In [9]:
df.dtypes

SalesKey                     int64
DateKey                     object
channelKey                   int64
StoreKey                     int64
ProductKey                   int64
PromotionKey                 int64
UnitCost                   float64
UnitPrice                  float64
SalesQuantity                int64
ReturnQuantity               int64
ReturnAmount               float64
DiscountQuantity             int64
DiscountAmount             float64
TotalCost                  float64
SalesAmount                float64
DiscountPercent            float64
CalendarYear                 int64
CalendarQuarterLabel        object
CalendarWeekLabel           object
IsWorkDay                   object
IsHoliday                   object
MonthNumber                  int64
CalendarDayOfWeekNumber      int64
ProductSubcategoryKey      float64
BrandName                   object
ClassID                    float64
StyleID                    float64
ColorID                    float64
Weight              

In [10]:
# Instantiate a SimpleImputer object with your strategy of choice
imputer = SimpleImputer(strategy="constant", fill_value=0)
# Call the "fit" method on the object
imputer.fit(df[['ProductSubcategoryKey']])
df['ProductSubcategoryKey'] = imputer.transform(df[['ProductSubcategoryKey']])
imputer.fit(df[['ClassID']])
df['ClassID'] = imputer.transform(df[['ClassID']])
imputer.fit(df[['StyleID']])
df['StyleID'] = imputer.transform(df[['StyleID']])  # Fixed this line
imputer.fit(df[['ColorID']])
df['ColorID'] = imputer.transform(df[['ColorID']])

imputer.fit(df[['Weight']])
df['Weight'] = imputer.transform(df[['Weight']])
imputer.fit(df[['StockTypeID']])
df['StockTypeID'] = imputer.transform(df[['StockTypeID']])
imputer.fit(df[['ProductCategoryKey']])
df['ProductCategoryKey'] = imputer.transform(df[['ProductCategoryKey']])

In [11]:
df['BrandName'] = df['BrandName'].fillna('N/A')

In [12]:
#Basic column transforms
#1. Promo key 1 means no discount. ALl other promo keys indicate some discount. We change promo keys to 1 and 0. 1 means no disc, 0 means disc
#2. Stock type ID 1 means High, 3 means low. THis is opposite of how the ML model learns. So, we switch
#3. Weights are in different units. Convert all to grams.
#4. Convert quarter values from Q1, Q2...to 1,2...This is neeeded for cyclical engg. later
#5. Set Sales Key as index of dataframe
#6. Ensure DateKey column is in DateTime format

#1. 
df['PromotionKey'] = np.where(df['PromotionKey'] == 1, 1, 0)

#2. 
df['StockTypeID'] = df['StockTypeID'].replace({1: 3, 3: 1})

#3. 
conversion_factors = {
    'pounds': 453.592,    
    'ounces': 28.3495,   
    'grams': 1.0        
}

# Convert weights to grams
df['Weight'] = df['Weight'] * df['WeightUnitMeasureID'].map(conversion_factors)

# Rename the Weight column
df = df.rename(columns={'Weight': 'Weight(grams)'})

# Delete the WeightUnitMeasureID column
df = df.drop('WeightUnitMeasureID', axis=1)

#4.
df['CalendarQuarterLabel'] = df['CalendarQuarterLabel'].str.extract('(\d+)').astype(int)

#5.
df = df.set_index('SalesKey')

#6.
df['DateKey'] = pd.to_datetime(df['DateKey'])

In [13]:
#Cyclical engineering to ensure proximity of dec to jan usw.
months_in_year = 12
df['sin_MonthNumber'] = np.sin(2*np.pi*df.MonthNumber/months_in_year)
df['cos_MonthNumber'] = np.cos(2*np.pi*df.MonthNumber/months_in_year)
df.drop(columns=['MonthNumber'], inplace=True)
df.columns

Index(['DateKey', 'channelKey', 'StoreKey', 'ProductKey', 'PromotionKey',
       'UnitCost', 'UnitPrice', 'SalesQuantity', 'ReturnQuantity',
       'ReturnAmount', 'DiscountQuantity', 'DiscountAmount', 'TotalCost',
       'SalesAmount', 'DiscountPercent', 'CalendarYear',
       'CalendarQuarterLabel', 'CalendarWeekLabel', 'IsWorkDay', 'IsHoliday',
       'CalendarDayOfWeekNumber', 'ProductSubcategoryKey', 'BrandName',
       'ClassID', 'StyleID', 'ColorID', 'Weight(grams)', 'StockTypeID',
       'ProductCategoryKey', 'GeographyKey', 'StoreType', 'EmployeeCount',
       'SellingAreaSize', 'GeographyType', 'ContinentName', 'CityName',
       'StateProvinceName', 'RegionCountryName', 'sin_MonthNumber',
       'cos_MonthNumber'],
      dtype='object')

In [14]:
#Cyclical engineering to ensure proximity of sat to sun usw.
days_in_week = 7
df['sin_CalendarDayOfWeekNumber'] = np.sin(2*np.pi*df.CalendarDayOfWeekNumber/days_in_week)
df['cos_CalendarDayOfWeekNumber'] = np.cos(2*np.pi*df.CalendarDayOfWeekNumber/days_in_week)
df.drop(columns=['CalendarDayOfWeekNumber'], inplace=True)
df.columns

Index(['DateKey', 'channelKey', 'StoreKey', 'ProductKey', 'PromotionKey',
       'UnitCost', 'UnitPrice', 'SalesQuantity', 'ReturnQuantity',
       'ReturnAmount', 'DiscountQuantity', 'DiscountAmount', 'TotalCost',
       'SalesAmount', 'DiscountPercent', 'CalendarYear',
       'CalendarQuarterLabel', 'CalendarWeekLabel', 'IsWorkDay', 'IsHoliday',
       'ProductSubcategoryKey', 'BrandName', 'ClassID', 'StyleID', 'ColorID',
       'Weight(grams)', 'StockTypeID', 'ProductCategoryKey', 'GeographyKey',
       'StoreType', 'EmployeeCount', 'SellingAreaSize', 'GeographyType',
       'ContinentName', 'CityName', 'StateProvinceName', 'RegionCountryName',
       'sin_MonthNumber', 'cos_MonthNumber', 'sin_CalendarDayOfWeekNumber',
       'cos_CalendarDayOfWeekNumber'],
      dtype='object')

In [15]:
#Cyclical engineering to ensure proximity of Q1 to Q4 usw.
quarters_in_year = 4
df['sin_CalendarQuarterLabel'] = np.sin(2*np.pi*df.CalendarQuarterLabel/quarters_in_year)
df['cos_CalendarQuarterLabel'] = np.cos(2*np.pi*df.CalendarQuarterLabel/quarters_in_year)
df.drop(columns=['CalendarQuarterLabel'], inplace=True)
df.columns

Index(['DateKey', 'channelKey', 'StoreKey', 'ProductKey', 'PromotionKey',
       'UnitCost', 'UnitPrice', 'SalesQuantity', 'ReturnQuantity',
       'ReturnAmount', 'DiscountQuantity', 'DiscountAmount', 'TotalCost',
       'SalesAmount', 'DiscountPercent', 'CalendarYear', 'CalendarWeekLabel',
       'IsWorkDay', 'IsHoliday', 'ProductSubcategoryKey', 'BrandName',
       'ClassID', 'StyleID', 'ColorID', 'Weight(grams)', 'StockTypeID',
       'ProductCategoryKey', 'GeographyKey', 'StoreType', 'EmployeeCount',
       'SellingAreaSize', 'GeographyType', 'ContinentName', 'CityName',
       'StateProvinceName', 'RegionCountryName', 'sin_MonthNumber',
       'cos_MonthNumber', 'sin_CalendarDayOfWeekNumber',
       'cos_CalendarDayOfWeekNumber', 'sin_CalendarQuarterLabel',
       'cos_CalendarQuarterLabel'],
      dtype='object')

In [16]:
print(df.columns)

Index(['DateKey', 'channelKey', 'StoreKey', 'ProductKey', 'PromotionKey',
       'UnitCost', 'UnitPrice', 'SalesQuantity', 'ReturnQuantity',
       'ReturnAmount', 'DiscountQuantity', 'DiscountAmount', 'TotalCost',
       'SalesAmount', 'DiscountPercent', 'CalendarYear', 'CalendarWeekLabel',
       'IsWorkDay', 'IsHoliday', 'ProductSubcategoryKey', 'BrandName',
       'ClassID', 'StyleID', 'ColorID', 'Weight(grams)', 'StockTypeID',
       'ProductCategoryKey', 'GeographyKey', 'StoreType', 'EmployeeCount',
       'SellingAreaSize', 'GeographyType', 'ContinentName', 'CityName',
       'StateProvinceName', 'RegionCountryName', 'sin_MonthNumber',
       'cos_MonthNumber', 'sin_CalendarDayOfWeekNumber',
       'cos_CalendarDayOfWeekNumber', 'sin_CalendarQuarterLabel',
       'cos_CalendarQuarterLabel'],
      dtype='object')


In [17]:
print(df['sin_MonthNumber'].head(10))

SalesKey
6    -5.000000e-01
11    8.660254e-01
13   -5.000000e-01
25    1.224647e-16
26   -2.449294e-16
31    8.660254e-01
32   -2.449294e-16
33    5.000000e-01
41   -8.660254e-01
51    5.000000e-01
Name: sin_MonthNumber, dtype: float64


In [18]:
#List of columns divided by numerical and text so that I know whether to scale or encode

'''
Numerical:'channelKey', 'StoreKey', 'ProductKey', 'PromotionKey', 'UnitCost', 'UnitPrice', 'TotalCost',
'ReturnQuantity','ReturnAmount', 'DiscountQuantity', 'DiscountAmount',
'DiscountPercent', 'CalendarYear', 'ProductSubcategoryKey', 'ClassID', 'StyleID', 'ColorID', 'Weight(grams)', 
'StockTypeID', 'ProductCategoryKey', 'GeographyKey',  'EmployeeCount', 'SellingAreaSize',
'sin_MonthNumber', 'cos_MonthNumber', 'sin_CalendarDayOfWeekNumber','cos_CalendarDayOfWeekNumber', 'sin_CalendarQuarterLabel',
'cos_CalendarQuarterLabel'




Text:
Categories: 'IsWorkDay', 'IsHoliday', 'BrandName', 'StoreType', 'GeographyType', 'ContinentName', 
'CityName', 'StateProvinceName', 'RegionCountryName'

'''

"\nNumerical:'channelKey', 'StoreKey', 'ProductKey', 'PromotionKey', 'UnitCost', 'UnitPrice', 'TotalCost',\n'ReturnQuantity','ReturnAmount', 'DiscountQuantity', 'DiscountAmount',\n'DiscountPercent', 'CalendarYear', 'ProductSubcategoryKey', 'ClassID', 'StyleID', 'ColorID', 'Weight(grams)', \n'StockTypeID', 'ProductCategoryKey', 'GeographyKey',  'EmployeeCount', 'SellingAreaSize',\n'sin_MonthNumber', 'cos_MonthNumber', 'sin_CalendarDayOfWeekNumber','cos_CalendarDayOfWeekNumber', 'sin_CalendarQuarterLabel',\n'cos_CalendarQuarterLabel'\n\n\n\n\nText:\nCategories: 'IsWorkDay', 'IsHoliday', 'BrandName', 'StoreType', 'GeographyType', 'ContinentName', \n'CityName', 'StateProvinceName', 'RegionCountryName'\n\n"

In [19]:
rb_scaler = RobustScaler()

cols_to_scale = [
    'channelKey', 'StoreKey', 'ProductKey', 'PromotionKey', 
    'UnitCost', 'UnitPrice', 'TotalCost', 'ReturnQuantity', 'ReturnAmount', 
    'DiscountQuantity', 'DiscountAmount', 'DiscountPercent', 'CalendarYear', 
    'ProductSubcategoryKey', 'ClassID', 'StyleID', 'ColorID', 'Weight(grams)', 
    'StockTypeID', 'ProductCategoryKey', 'GeographyKey', 'EmployeeCount', 
    'SellingAreaSize', 'sin_MonthNumber', 'cos_MonthNumber', 
    'sin_CalendarDayOfWeekNumber', 'cos_CalendarDayOfWeekNumber', 
    'sin_CalendarQuarterLabel', 'cos_CalendarQuarterLabel'
]

rb_scaler = RobustScaler()
df[cols_to_scale] = rb_scaler.fit_transform(df[cols_to_scale])

In [None]:
ohe = OneHotEncoder(sparse_output=False)

cols_to_encode = [
   'IsWorkDay', 'IsHoliday', 'BrandName', 'StoreType', 'GeographyType', 
   'ContinentName', 'CityName', 'StateProvinceName', 'RegionCountryName'
]
ohe.fit(df[cols_to_encode])
df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
df = data.drop(columns=cols_to_encode)

  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature_names_out()] = ohe.transform(df[cols_to_encode])
  df[ohe.get_feature