In [0]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from collections import defaultdict
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from keras.models import Model
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout
from keras.layers.embeddings import Embedding

In [0]:
# Load the dataset from BlackFriday.csv (path relative to the working directory).
data = pd.read_csv('./BlackFriday.csv')

In [0]:
# Show the dtype pandas inferred for each column.
data.dtypes

In [0]:
# Preview the first rows of the raw frame.
data.head()

In [0]:
# Count missing values per column.
data.isnull().sum()

In [0]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [0]:
# Remove the columns that the rest of the notebook does not use.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError for columns that were already removed.
data = data.drop(
    columns=['User_ID', 'Product_Category_2', 'Product_Category_3'],
    errors='ignore',
)

In [0]:
# Preview the frame again after the column drop in the previous cell.
data.head()

In [0]:
# Columns that are integer-encoded before being fed to the embedding model.
categorical_cols = [
    'Product_ID',
    'Gender',
    'Age',
    'Occupation',
    'City_Category',
    'Stay_In_Current_City_Years',
    'Marital_Status',
    'Product_Category_1',
]

# One LabelEncoder per column, created lazily on first access and kept so the
# original labels can be recovered later with inverse_transform.
label_dict = defaultdict(LabelEncoder)


def _encode_column(column):
    """Fit the encoder registered under the column's name and return its integer codes."""
    return label_dict[column.name].fit_transform(column)


data[categorical_cols] = data[categorical_cols].apply(_encode_column)

In [0]:
# Separate the regression target from the features without mutating `data`.
# Previously `X = data` was only an alias, so `X.pop('Purchase')` also removed
# the column from `data` and a second run of this cell raised KeyError.
# Later cells only read the categorical columns of `data`, so leaving
# 'Purchase' in place there does not affect them.
y = data['Purchase']
X = data.drop(columns='Purchase')

In [0]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=9,
)

In [0]:
# For every categorical column, record its distinct (encoded) values.
# Series.unique() returns them in order of first appearance, not sorted.
categorical_cols = [
    'Product_ID',
    'Gender',
    'Age',
    'Occupation',
    'City_Category',
    'Stay_In_Current_City_Years',
    'Marital_Status',
    'Product_Category_1',
]

cat_cols_dict = {}
for col in categorical_cols:
    cat_cols_dict[col] = list(data[col].unique())

In [0]:
# Build the per-column input arrays expected by the multi-input model:
# one array per categorical column, in the iteration order of cat_cols_dict.
train_input_list = []
test_input_list = []

for col in cat_cols_dict:
    # np.unique returns the distinct values in sorted order; each value is
    # mapped to its position in that ordering.
    raw_values = np.unique(data[col])
    value_map = dict(zip(raw_values, range(len(raw_values))))

    train_input_list.append(X_train[col].map(value_map).values)
    # Values missing from value_map would become NaN and are replaced with 0;
    # the map is built from the full `data`, so this should be a no-op here.
    test_input_list.append(X_test[col].map(value_map).fillna(0).values)

In [0]:
# Embedding output dimension chosen for each categorical column.
cols_out_dict = dict(
    Product_ID=20,
    Gender=1,
    Age=2,
    Occupation=6,
    City_Category=1,
    Stay_In_Current_City_Years=2,
    Marital_Status=1,
    Product_Category_1=9,
)

In [0]:
# Build one Input -> Embedding -> Reshape branch per categorical column,
# concatenate the branches, and regress the target through a small dense head.
inputs = []
embeddings = []

for col in cat_cols_dict:
    vocab_size = len(cat_cols_dict[col])   # number of distinct values in the column
    embed_dim = cols_out_dict[col]         # embedding width configured for the column

    col_input = Input(shape=(1,), name = 'input_' + col)
    col_embedding = Embedding(vocab_size, embed_dim, input_length=1, name = 'embedding_' + col)(col_input)
    # Flatten the length-1 sequence axis so each branch is a plain vector of
    # size embed_dim before concatenation.
    col_embedding = Reshape(target_shape=(embed_dim,))(col_embedding)

    inputs.append(col_input)
    embeddings.append(col_embedding)

x = Concatenate()(embeddings)
# Two hidden layers (4 then 2 units) followed by a single-unit output.
for units in (4, 2):
    x = Dense(units, activation='relu')(x)
output = Dense(1, activation='relu')(x)

model = Model(inputs, output)

# Mean absolute error loss, Adam optimizer with default settings.
model.compile(loss='mae', optimizer='adam')

In [0]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [0]:
# Train for 20 epochs with mini-batches of 128, reporting the loss on the
# held-out split after every epoch.
model.fit(
    train_input_list,
    y_train,
    validation_data=(test_input_list, y_test),
    epochs=20,
    batch_size=128,
)

In [0]:
# Compute the compiled loss on the held-out inputs.
model.evaluate(test_input_list, y_test)

In [0]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the model's predictions on the held-out split.
y_pred = model.predict(test_input_list)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

In [0]:
# Pull the learned embedding matrix (first weight array) out of three of the
# named embedding layers for inspection.
embedding_Occupation, embedding_Product_ID, embedding_Product_Category_1 = (
    model.get_layer(layer_name).get_weights()[0]
    for layer_name in (
        'embedding_Occupation',
        'embedding_Product_ID',
        'embedding_Product_Category_1',
    )
)

In [0]:
# Project the first 40 Product_ID embedding vectors onto their two principal
# components and annotate each point with the original product ID.
n_products_shown = 40
pca = PCA(n_components=2)
Y = pca.fit_transform(embedding_Product_ID[:n_products_shown])

# Row i of the embedding matrix belongs to encoded value i, and LabelEncoder
# keeps the original label of code i at classes_[i]. The previous labels were
# derived from cat_cols_dict, which is in order of first appearance in the
# data, so points were annotated with the wrong product IDs.
product_labels = label_dict['Product_ID'].classes_[:n_products_shown]

plt.figure(figsize=(8,8))
plt.scatter(-Y[:, 0], -Y[:, 1])
for i, txt in enumerate(product_labels):
    plt.annotate(str(txt), (-Y[i, 0], -Y[i, 1]), xytext = (-20, 8), textcoords = 'offset points')
plt.title('Product_ID embeddings (first 40 codes, PCA)')
plt.show()

In [0]:
# Project the Product_Category_1 embedding vectors onto their two principal
# components and annotate each point with the original category value.
pca = PCA(n_components=2)
Y = pca.fit_transform(embedding_Product_Category_1)

# Row i of the embedding matrix belongs to encoded value i, and LabelEncoder
# keeps the original label of code i at classes_[i]. The previous labels were
# derived from cat_cols_dict, which is in order of first appearance in the
# data, so points were annotated with the wrong categories.
category_labels = label_dict['Product_Category_1'].classes_

plt.figure(figsize=(8,8))
plt.scatter(-Y[:, 0], -Y[:, 1])
for i, txt in enumerate(category_labels):
    plt.annotate(str(txt), (-Y[i, 0], -Y[i, 1]), xytext = (-20, 8), textcoords = 'offset points')
plt.title('Product_Category_1 embeddings (PCA)')
plt.show()

In [0]:
# Project the Occupation embedding vectors onto their two principal
# components and annotate each point with the original occupation value.
pca = PCA(n_components=2)
Y = pca.fit_transform(embedding_Occupation)

# Row i of the embedding matrix belongs to encoded value i, and LabelEncoder
# keeps the original label of code i at classes_[i]. The previous labels were
# derived from cat_cols_dict, which is in order of first appearance in the
# data, so points were annotated with the wrong occupations.
occupation_labels = label_dict['Occupation'].classes_

plt.figure(figsize=(8,8))
plt.scatter(-Y[:, 0], -Y[:, 1])
for i, txt in enumerate(occupation_labels):
    plt.annotate(str(txt), (-Y[i, 0], -Y[i, 1]), xytext = (-20, 8), textcoords = 'offset points')
plt.title('Occupation embeddings (PCA)')
plt.show()

In [0]:
# Persist the trained model to disk under 'black-friday.model'.
model.save('black-friday.model')