# E-Commerce Project

In [None]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw e-commerce dataset into a DataFrame so it can be inspected.
df = pd.read_csv('ecommerce_data.csv')

In [3]:
# Preview the first rows to check the column layout before preprocessing.
df.head()

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
0,1,0,0.65751,0,3,0
1,1,1,0.568571,0,2,1
2,1,0,0.042246,1,1,0
3,1,1,1.659793,1,1,2
4,0,1,2.014745,1,1,2


## Preprocessing the data

In [4]:
def get_data():
    """Load ``ecommerce_data.csv`` and return preprocessed features and labels.

    The last CSV column is treated as the label; every other column is a
    feature. Preprocessing steps:

    * feature columns 1 and 2 (``n_products_viewed`` and ``visit_duration``)
      are standardized to zero mean and unit variance, z = (x - mu) / sigma;
    * the last feature column (``time_of_day``, categorical with the four
      values 0, 1, 2, 3) is replaced by four one-hot indicator columns.

    Returns:
        tuple: ``(X2, Y)`` where ``X2`` is an ``(N, D + 3)`` float array
        (``D`` being the number of raw feature columns) and ``Y`` is the
        length-``N`` label vector.
    """
    df = pd.read_csv('ecommerce_data.csv')  # load the data

    # Convert the DataFrame to a NumPy array. ``pd.as_matrix`` does not exist
    # and ``DataFrame.as_matrix`` was removed from pandas, so use ``to_numpy``.
    # Forcing float64 keeps the in-place standardization below from being
    # truncated if every column happens to be parsed as an integer.
    data = df.to_numpy(dtype=np.float64)

    X = data[:, :-1]  # features
    Y = data[:, -1]   # label

    # normalize the numerical columns (z = (x - mu) / sigma)
    for col in (1, 2):  # n_products_viewed, visit_duration
        X[:, col] = (X[:, col] - X[:, col].mean()) / X[:, col].std()

    # One-hot encode the categorical time_of_day column. It is dropped and
    # replaced by 4 indicator columns, so the width grows by 3 overall.
    N, D = X.shape
    X2 = np.zeros((N, D + 3))
    X2[:, :D - 1] = X[:, :D - 1]   # copy everything except the categorical column

    # For row n with category t, set column (D - 1) + t to 1.
    categories = X[:, D - 1].astype(np.int32)
    X2[np.arange(N), categories + D - 1] = 1

    return X2, Y
    

In [None]:
def get_binary_data():
    """Return only the samples whose label is 0 or 1.

    The logistic model is binary, so it does not want the full dataset:
    rows with any larger label value are dropped.

    Returns:
        tuple: ``(X2, Y2)`` -- the rows of the preprocessed feature matrix
        and the matching labels for which ``Y <= 1``.
    """
    features, labels = get_data()
    keep = labels <= 1   # boolean mask selecting the rows labelled 0 or 1
    return features[keep], labels[keep]