In [1]:
'''Import necessary packages'''
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from pandas.api.types import is_string_dtype,is_numeric_dtype
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
%pwd

'/home/huixiang/6015/week4'

In [3]:
path = '/home/huixiang/6015/week4'

In [4]:
df = pd.read_csv(f'{path}/BlackFriday.csv')
print(df.shape)
df.head(2)

(537577, 12)


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200


In [5]:
def str_to_cat(my_df):
    for key, value in my_df.items():
        if is_string_dtype(value):
            my_df[key] = value.astype('category') # check what does .cat.asordered() do
    return my_df

In [6]:
# Now comes the hardest part: define the imputer method
def mydf_to_nums(df, label, value):#convert Nan. to 0 by add 1
    if not is_numeric_dtype(value): # if this is not a numeric type, we add it by one
        df[label] = value.cat.codes + 1
        
def my_imputer(df, label, value, imputed_table):
    if is_numeric_dtype(value): #only impute numeric columns
        if pd.isnull(value).sum() or (label in imputed_table): # if there's null value in this column or this column
                                                               # has been imputed
            df[label + '_na'] = pd.isnull(value) #add another column to mark if this is a imputed value or not
            filler = imputed_table[label] if label in imputed_table else value.median() #check table first, otherwise
                                                                                        # we calculate median
            df[label] = value.fillna(filler)
            imputed_table[label] = filler # add the imputed number in the table
    return imputed_table

def my_preprocessor(df, imputed_table):
    
    if imputed_table is None:
        imputed_table = dict()
    for label, value in df.items():
        imputed_table = my_imputer(df, label, value, imputed_table)
    for label, value in df.items():
        mydf_to_nums(df, label, value)
    df = pd.get_dummies(df, dummy_na = True) #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html?highlight=get_dummies#pandas.get_dummies
    result = [df, imputed_table]
    
    return result
    
    

In [7]:
df = str_to_cat(df)

In [8]:
df, impute_table = my_preprocessor(df, None)

In [9]:
impute_table

{'Product_Category_2': 9.0, 'Product_Category_3': 14.0}

In [10]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1:]
print(X.shape, y.shape)

(537577, 13) (537577, 1)


In [11]:
X.head(2)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Product_Category_2_na
0,1000001,671,1,1,10,1,3,0,3,9.0,14.0,8370,True
1,1000001,2375,1,1,10,1,3,0,1,6.0,14.0,15200,False


In [12]:
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537577 entries, 0 to 537576
Data columns (total 13 columns):
User_ID                       537577 non-null int64
Product_ID                    537577 non-null int16
Gender                        537577 non-null int8
Age                           537577 non-null int8
Occupation                    537577 non-null int64
City_Category                 537577 non-null int8
Stay_In_Current_City_Years    537577 non-null int8
Marital_Status                537577 non-null int64
Product_Category_1            537577 non-null int64
Product_Category_2            537577 non-null float64
Product_Category_3            537577 non-null float64
Purchase                      537577 non-null int64
Product_Category_2_na         537577 non-null bool
dtypes: bool(1), float64(2), int16(1), int64(5), int8(4)
memory usage: 32.3 MB
None


In [13]:
impute_table

{'Product_Category_2': 9.0, 'Product_Category_3': 14.0}

In [14]:
X_scaled = pd.DataFrame(preprocessing.scale(X))

  """Entry point for launching an IPython kernel.


In [15]:
X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-1.744554,-1.019705,-1.751138,-1.844439,0.293877,-1.369841,0.108962,-0.831545,-0.612032,-0.136864,0.172177,-0.193507,1.489729
1,-1.744554,0.679918,-1.751138,-1.844439,0.293877,-1.369841,0.108962,-0.831545,-1.145266,-0.84411,0.172177,1.177699,-0.671263
2,-1.744554,-0.840167,-1.751138,-1.844439,0.293877,-1.369841,0.108962,-0.831545,1.787522,-0.136864,0.172177,-1.588402,1.489729
3,-1.744554,-0.864106,-1.751138,-1.844439,0.293877,-1.369841,0.108962,-0.831545,1.787522,1.041879,0.172177,-1.661681,-0.671263
4,-1.74397,1.036998,0.571057,2.591521,1.213543,1.263032,1.659557,-0.831545,0.721053,-0.136864,0.172177,-0.274012,1.489729


In [16]:
X_scaled.columns = X.columns

In [17]:
X_scaled.describe()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Product_Category_2_na
count,537577.0,537577.0,537577.0,537577.0,537577.0,537577.0,537577.0,537577.0,537577.0,537577.0,537577.0,537577.0,537577.0
mean,1.449724e-14,-1.926964e-16,8.682448e-15,-5.272989e-15,-1.187239e-15,8.572032e-16,-9.861743e-16,2.249424e-15,-1.770383e-15,1.259792e-15,1.742834e-14,-2.559215e-16,-8.675725e-15
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-1.744554,-1.687984,-1.751138,-1.844439,-1.238898,-1.369841,-1.441634,-0.831545,-1.145266,-1.787104,-4.487181,-1.836745,-0.6712628
25%,-0.8731071,-0.7633651,0.5710572,-0.3657857,-0.9323427,-1.369841,-0.6663361,-0.831545,-1.145266,-0.3726127,0.1721773,-0.6962152,-0.6712628
50%,0.02283783,-0.04621225,0.5710572,-0.3657857,-0.1659551,-0.05340416,0.1089617,-0.831545,-0.07879774,-0.1368641,0.1721773,-0.2553414,-0.6712628
75%,0.8312881,0.8395065,0.5710572,0.373541,0.9069876,1.263032,0.8842595,1.202581,0.7210535,1.041879,0.1721773,0.5499158,1.489729
max,1.77798,1.924711,0.5710572,2.591521,1.826653,1.263032,1.659557,1.202581,3.387224,1.984873,1.866489,2.936577,1.489729


In [18]:
# Build the deep neural network
def base_model():
    model = Sequential()
    model.add(Dense(15, input_dim=13, kernel_initializer='uniform', activation='relu'))
    model.add(Dense(6, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    return model

In [19]:
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X_scaled, y, test_size = 0.15)

In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.25)

In [21]:
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(342705, 13) (114235, 13) (342705, 1) (114235, 1)


In [22]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342705 entries, 51076 to 139002
Data columns (total 13 columns):
User_ID                       342705 non-null float64
Product_ID                    342705 non-null float64
Gender                        342705 non-null float64
Age                           342705 non-null float64
Occupation                    342705 non-null float64
City_Category                 342705 non-null float64
Stay_In_Current_City_Years    342705 non-null float64
Marital_Status                342705 non-null float64
Product_Category_1            342705 non-null float64
Product_Category_2            342705 non-null float64
Product_Category_3            342705 non-null float64
Purchase                      342705 non-null float64
Product_Category_2_na         342705 non-null float64
dtypes: float64(13)
memory usage: 36.6 MB
None


In [23]:
X_valid.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Product_Category_2_na
199945,-1.226585,-0.081122,0.571057,1.112868,1.826653,-1.369841,-0.666336,-0.831545,-0.878649,-1.079858,0.172177,0.071098,-0.671263
55124,-0.313725,1.075898,0.571057,-1.105112,-1.238898,-0.053404,-0.666336,-0.831545,-0.078798,-0.372613,0.172177,-1.183063,-0.671263
380870,0.915866,0.050539,0.571057,0.373541,0.293877,-1.369841,-0.666336,1.202581,-1.145266,-0.372613,1.442911,1.255796,-0.671263
310414,1.656654,1.347199,-1.751138,-0.365786,-1.238898,1.263032,1.659557,-0.831545,0.721053,-0.136864,0.172177,-0.31557,1.489729
362936,-0.660204,0.458488,0.571057,2.591521,1.213543,1.263032,-0.666336,-0.831545,1.520905,1.277627,0.172177,-1.569531,-0.671263


In [24]:
estimator = KerasRegressor(build_fn=base_model, epochs=10, batch_size=100, verbose=True)
#kfold = KFold(n_splits=10, random_state=42)

In [25]:
#estimator.fit(X_train, y_train, epochs=100, batch_size=32, verbose=True)
estimator.fit(X_train, y_train)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa3890fd940>

In [34]:
y_pred = estimator.predict(X_valid)
print("The MSE result for validation dataset is {:.3f}".format(mean_squared_error(y_valid, y_pred)))

The result for validation dataset is 0.024


In [27]:
X_valid.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Product_Category_2_na
199945,-1.226585,-0.081122,0.571057,1.112868,1.826653,-1.369841,-0.666336,-0.831545,-0.878649,-1.079858,0.172177,0.071098,-0.671263
55124,-0.313725,1.075898,0.571057,-1.105112,-1.238898,-0.053404,-0.666336,-0.831545,-0.078798,-0.372613,0.172177,-1.183063,-0.671263
380870,0.915866,0.050539,0.571057,0.373541,0.293877,-1.369841,-0.666336,1.202581,-1.145266,-0.372613,1.442911,1.255796,-0.671263
310414,1.656654,1.347199,-1.751138,-0.365786,-1.238898,1.263032,1.659557,-0.831545,0.721053,-0.136864,0.172177,-0.31557,1.489729
362936,-0.660204,0.458488,0.571057,2.591521,1.213543,1.263032,-0.666336,-0.831545,1.520905,1.277627,0.172177,-1.569531,-0.671263


In [32]:
y_pred_test = estimator.predict(X_test)
print("The MSE result for test dataset is {:.3f}".format(mean_squared_error(y_pred_test, y_test)))

The result for test dataset is 0.024
