# Preprocessing with Scikit-Learn

#### Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#### Load dataset

In [2]:
data = pd.read_csv('Walmart.csv')

In [3]:
data.head()

Unnamed: 0,Order ID,Order Date,Ship Date,Customer Name,Country,City,State,Category,Product Name,Sales,Quantity,Profit
0,CA-2013-138688,13-06-2013,17-06-2013,Darrin Van Huff,United States,Los Angeles,California,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0,6.87
1,CA-2011-115812,09-06-2011,14-06-2011,Brosina Hoffman,United States,Los Angeles,California,Furnishings,Eldon Expressions Wood and Plastic Desk Access...,48.86,7.0,14.17
2,CA-2011-115812,09-06-2011,14-06-2011,Brosina Hoffman,United States,Los Angeles,California,Art,Newell 322,7.28,4.0,1.97
3,CA-2011-115812,09-06-2011,14-06-2011,Brosina Hoffman,United States,Los Angeles,California,Phones,Mitel 5320 IP Phone VoIP phone,907.15,4.0,90.72
4,CA-2011-115812,09-06-2011,14-06-2011,Brosina Hoffman,United States,Los Angeles,California,Binders,DXL Angle-View Binders with Locking Rings by S...,18.5,3.0,5.78


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Order ID       3203 non-null   object 
 1   Order Date     3203 non-null   object 
 2   Ship Date      3203 non-null   object 
 3   Customer Name  3203 non-null   object 
 4   Country        3203 non-null   object 
 5   City           3203 non-null   object 
 6   State          3203 non-null   object 
 7   Category       3203 non-null   object 
 8   Product Name   3203 non-null   object 
 9   Sales          3203 non-null   float64
 10  Quantity       3203 non-null   float64
 11  Profit         3203 non-null   float64
dtypes: float64(3), object(9)
memory usage: 300.4+ KB


In [5]:
data.shape

(3203, 12)

In [6]:
for col in data.columns:
    print(col, data[col].unique().shape)

Order ID (1611,)
Order Date (845,)
Ship Date (911,)
Customer Name (686,)
Country (1,)
City (169,)
State (11,)
Category (17,)
Product Name (1494,)
Sales (2295,)
Quantity (14,)
Profit (2164,)


In [7]:
data.dtypes

Order ID          object
Order Date        object
Ship Date         object
Customer Name     object
Country           object
City              object
State             object
Category          object
Product Name      object
Sales            float64
Quantity         float64
Profit           float64
dtype: object

### Encoding

#### Ordinal Encoder

In [8]:
ordinal = OrdinalEncoder()

#### Data with categorical values

In [9]:
cat_data = data[['City', 'Category', 'City', 'State']]

In [10]:
cat_data.head()

Unnamed: 0,City,Category,City.1,State
0,Los Angeles,Labels,Los Angeles,California
1,Los Angeles,Furnishings,Los Angeles,California
2,Los Angeles,Art,Los Angeles,California
3,Los Angeles,Phones,Los Angeles,California
4,Los Angeles,Binders,Los Angeles,California


#### Fit encoder and Transform data

In [11]:
ordinal_data = ordinal.fit_transform(cat_data)

In [12]:
pd.DataFrame(ordinal_data, columns=cat_data.columns)

Unnamed: 0,City,Category,City.1,State
0,80.0,10.0,80.0,1.0
1,80.0,9.0,80.0,1.0
2,80.0,2.0,80.0,1.0
3,80.0,13.0,80.0,1.0
4,80.0,3.0,80.0,1.0
...,...,...,...,...
3198,80.0,0.0,80.0,1.0
3199,31.0,9.0,31.0,1.0
3200,31.0,13.0,31.0,1.0
3201,31.0,12.0,31.0,1.0


#### Encoding values of 'Category' column

In [13]:
ordinal.categories_[1]

array(['Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases',
       'Chairs', 'Copiers', 'Envelopes', 'Fasteners', 'Furnishings',
       'Labels', 'Machines', 'Paper', 'Phones', 'Storage', 'Supplies',
       'Tables'], dtype=object)

#### OneHot Encoder

In [14]:
onehot = OneHotEncoder()

#### Data with categorical values

In [15]:
cat_data = data[['Category', 'State']]

#### Fit encoder and Transform data

In [16]:
onehot_data = onehot.fit_transform(cat_data).toarray()

#### Features after transforming data

In [17]:
onehot.get_feature_names_out()

array(['Category_Accessories', 'Category_Appliances', 'Category_Art',
       'Category_Binders', 'Category_Bookcases', 'Category_Chairs',
       'Category_Copiers', 'Category_Envelopes', 'Category_Fasteners',
       'Category_Furnishings', 'Category_Labels', 'Category_Machines',
       'Category_Paper', 'Category_Phones', 'Category_Storage',
       'Category_Supplies', 'Category_Tables', 'State_Arizona',
       'State_California', 'State_Colorado', 'State_Idaho',
       'State_Montana', 'State_Nevada', 'State_New Mexico',
       'State_Oregon', 'State_Utah', 'State_Washington', 'State_Wyoming'],
      dtype=object)

### Scaling

In [18]:
pd.DataFrame(onehot_data, columns=onehot.get_feature_names_out()).loc[:10]

Unnamed: 0,Category_Accessories,Category_Appliances,Category_Art,Category_Binders,Category_Bookcases,Category_Chairs,Category_Copiers,Category_Envelopes,Category_Fasteners,Category_Furnishings,...,State_California,State_Colorado,State_Idaho,State_Montana,State_Nevada,State_New Mexico,State_Oregon,State_Utah,State_Washington,State_Wyoming
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [19]:
data[['Category', 'State']].loc[:10]

Unnamed: 0,Category,State
0,Labels,California
1,Furnishings,California
2,Art,California
3,Phones,California
4,Binders,California
5,Appliances,California
6,Tables,California
7,Phones,California
8,Binders,Washington
9,Storage,Utah


In [20]:
data.head()

Unnamed: 0,Order ID,Order Date,Ship Date,Customer Name,Country,City,State,Category,Product Name,Sales,Quantity,Profit
0,CA-2013-138688,13-06-2013,17-06-2013,Darrin Van Huff,United States,Los Angeles,California,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0,6.87
1,CA-2011-115812,09-06-2011,14-06-2011,Brosina Hoffman,United States,Los Angeles,California,Furnishings,Eldon Expressions Wood and Plastic Desk Access...,48.86,7.0,14.17
2,CA-2011-115812,09-06-2011,14-06-2011,Brosina Hoffman,United States,Los Angeles,California,Art,Newell 322,7.28,4.0,1.97
3,CA-2011-115812,09-06-2011,14-06-2011,Brosina Hoffman,United States,Los Angeles,California,Phones,Mitel 5320 IP Phone VoIP phone,907.15,4.0,90.72
4,CA-2011-115812,09-06-2011,14-06-2011,Brosina Hoffman,United States,Los Angeles,California,Binders,DXL Angle-View Binders with Locking Rings by S...,18.5,3.0,5.78


In [21]:
data.select_dtypes('float').describe()

Unnamed: 0,Sales,Quantity,Profit
count,3203.0,3203.0,3203.0
mean,226.493266,3.82891,33.849138
std,524.876911,2.260947,174.109155
min,0.99,1.0,-3399.98
25%,19.44,2.0,3.85
50%,60.84,3.0,11.17
75%,215.81,5.0,33.0
max,13999.96,14.0,6719.98


#### MinMax Scaling

In [22]:
minmax = MinMaxScaler()

#### Data with numerical values

In [23]:
num_data = data.select_dtypes('float')
num_data.head()

Unnamed: 0,Sales,Quantity,Profit
0,14.62,2.0,6.87
1,48.86,7.0,14.17
2,7.28,4.0,1.97
3,907.15,4.0,90.72
4,18.5,3.0,5.78


#### Fit scaler and Transform data

In [24]:
minmax_data = minmax.fit_transform(num_data)

In [25]:
pd.DataFrame(minmax_data, columns=num_data.columns)

Unnamed: 0,Sales,Quantity,Profit
0,0.000974,0.076923,0.336647
1,0.003420,0.461538,0.337368
2,0.000449,0.230769,0.336162
3,0.064730,0.230769,0.344932
4,0.001251,0.153846,0.336539
...,...,...,...
3198,0.002518,0.000000,0.337472
3199,0.006498,0.076923,0.337512
3200,0.018401,0.076923,0.337884
3201,0.002044,0.230769,0.337284


#### Standard Scaler

In [26]:
standard = StandardScaler()

#### Fit scaler and Transform data

In [27]:
standard_data = standard.fit_transform(num_data)

In [28]:
pd.DataFrame(standard_data, columns=num_data.columns)

Unnamed: 0,Sales,Quantity,Profit
0,-0.403726,-0.809040,-0.154980
1,-0.338481,1.402768,-0.113045
2,-0.417712,0.075683,-0.183127
3,1.296996,0.075683,0.326690
4,-0.396332,-0.366678,-0.161241
...,...,...,...
3198,-0.362529,-1.251401,-0.107014
3199,-0.256354,-0.809040,-0.104658
3200,0.061141,-0.809040,-0.083059
3201,-0.375181,0.075683,-0.117928
