# Pre-processing data
Following along with the [scikit-learn preprocessing guide](https://scikit-learn.org/stable/modules/preprocessing.html#).

In [5]:
import pandas as pd
from sklearn import preprocessing

In [3]:
# Dataset source: https://www.kaggle.com/kyanyoga/sample-sales-data
# The file is decoded with the 'latin' codec rather than pandas' default.
df = pd.read_csv(
    'sales_data_sample.csv',
    encoding='latin',
)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


 ## Pre pre-processing
 Separate categorical and numerical features
 Impute missing values

In [8]:
# Show each column's dtype; the categorical/numerical split below is based on these.
df.dtypes

ORDERNUMBER           int64
QUANTITYORDERED       int64
PRICEEACH           float64
ORDERLINENUMBER       int64
SALES               float64
ORDERDATE            object
STATUS               object
QTR_ID                int64
MONTH_ID              int64
YEAR_ID               int64
PRODUCTLINE          object
MSRP                  int64
PRODUCTCODE          object
CUSTOMERNAME         object
PHONE                object
ADDRESSLINE1         object
ADDRESSLINE2         object
CITY                 object
STATE                object
POSTALCODE           object
COUNTRY              object
TERRITORY            object
CONTACTLASTNAME      object
CONTACTFIRSTNAME     object
DEALSIZE             object
dtype: object

In [23]:
# First pass: columns with dtype object or int64 are treated as categorical,
# every other column as numerical.
categorical_cols = df.select_dtypes(include=['object', 'int64']).columns.tolist()
numerical_cols = df.select_dtypes(exclude=['object', 'int64']).columns.tolist()

# MSRP has an int64 dtype but is a price, so move it from the categorical
# group to the end of the numerical group.
numerical_cols += [categorical_cols.pop(categorical_cols.index('MSRP'))]

print(categorical_cols, numerical_cols, sep='\n')

# Sanity check: the two groups together account for every column in df.
assert len(df.columns) == len(categorical_cols) + len(numerical_cols)

['ORDERNUMBER', 'QUANTITYORDERED', 'ORDERLINENUMBER', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID', 'PRODUCTLINE', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE', 'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME', 'DEALSIZE']
['PRICEEACH', 'SALES', 'MSRP']


## Standardization & Scaling

In [37]:
# Keep only the columns classified as numerical; these are what gets scaled.
df_numerical = df.loc[:, numerical_cols]

In [36]:
# Fit a StandardScaler on the numerical columns.
scaler = preprocessing.StandardScaler().fit(df_numerical)

# Wrap the transformed values back into a DataFrame, carrying over both the
# column names and the row index of df_numerical. Without index=, the result
# would get a fresh RangeIndex and would no longer align with `df` whenever
# its index is not the default 0..n-1 (e.g. after filtering rows).
df_standardized = pd.DataFrame(
    scaler.transform(df_numerical),
    columns=df_numerical.columns,
    index=df_numerical.index,
)
df_standardized.head()

Unnamed: 0,PRICEEACH,SALES,MSRP
0,0.596978,-0.370825,-0.142246
1,-0.11445,-0.427897,-0.142246
2,0.549384,0.179443,-0.142246
3,-0.019759,0.104701,-0.142246
4,0.810158,0.89674,-0.142246


In [39]:
# Fit a MinMaxScaler on the same numerical columns. This rebinds `scaler`,
# replacing the StandardScaler instance assigned in the earlier cell.
scaler = preprocessing.MinMaxScaler().fit(df_numerical)