In [1]:
import pandas as pd
import math
import numpy as np


from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler  


from keras.models import Model
from keras.layers.convolutional import Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Input
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
train = pd.read_csv('train.csv')

In [3]:
product = pd.read_csv('product_attributes.csv')

In [4]:
test = pd.read_csv('../test_nFNPSyV.csv')

In [5]:
sample = pd.read_csv('../sample_submission_qfCnaKZ.csv')

In [6]:
train.head()

Unnamed: 0,UserId,productid,Quantity,OrderDate
0,18075,12322648,1,01/04/18
1,6820,12371370,1,01/04/18
2,6820,12973004,1,01/04/18
3,6820,12657560,1,01/04/18
4,6820,11659914,1,01/04/18


In [7]:
train['OrderDate'] = pd.to_datetime(train['OrderDate'])

In [8]:
# Total quantity of products bought by each user, used as a user-similarity feature
user_transactions = (
    train.groupby('UserId')['Quantity']
    .sum()
    .rename('User_transactions')
    .reset_index()
)

In [9]:
user_transactions.head()

Unnamed: 0,UserId,User_transactions
0,0,1
1,1,1
2,2,1
3,3,18
4,4,2


In [10]:
train = train.sort_values(['UserId','OrderDate'])

In [11]:
# Time elapsed since each user's previous order row. A user's first row has no
# predecessor, so its gap is set to zero. The fill value is a Timedelta (not the
# integer 0) so the column stays timedelta64; recent pandas versions do not accept
# an integer fill value for timedelta data.
train['days_diff'] = train.groupby(['UserId'])['OrderDate'].diff().fillna(pd.Timedelta(0))

In [12]:
train.head()

Unnamed: 0,UserId,productid,Quantity,OrderDate,days_diff
21155,0,11659624,1,2018-05-16,0 days
100672,1,12406904,1,2018-08-20,0 days
27942,2,11660064,1,2018-05-06,0 days
66578,3,12658228,1,2018-07-14,0 days
66579,3,12360440,1,2018-07-14,0 days


In [13]:
train['days_diff'] = train['days_diff'].dt.days

In [14]:
# Creating avg time delta between transactions of users as feature for similarity
user_days = train.groupby('UserId')['days_diff'].mean().reset_index()

In [15]:
user_days.head()

Unnamed: 0,UserId,days_diff
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.055556
4,4,22.0


In [16]:
# Add each user's total purchased quantity to the dataframe that holds their average
# time gap between transactions. Values are matched on UserId rather than copied by
# row position, so the result does not depend on both frames sharing one row order.
user_days['User_transactions'] = user_days['UserId'].map(
    user_transactions.set_index('UserId')['User_transactions']
)
                                

In [17]:
train = train.drop('days_diff',axis=1)

In [18]:
products = list(product['productid'].unique())

In [19]:
images = []
# reading all images
def preprocess_input(x):
    """Rescale an array in place: divide by 255, subtract 0.5, then multiply by 2.

    A value of 0 maps to -1 and a value of 255 maps to 1. The array passed in
    is modified and returned; no copy is made.
    """
    np.divide(x, 255., out=x)
    np.subtract(x, 0.5, out=x)
    np.multiply(x, 2., out=x)
    return x

# Load every product image resized to 28x28 and store it as a preprocessed batch of one
for product_id in products:
    image_path = './images/{}.jpg'.format(product_id)
    # decode the file and convert the pixels to a numpy array
    pixels = img_to_array(load_img(image_path, target_size=(28, 28)))
    # add a leading batch axis, then rescale the pixel values for the model
    batch = np.expand_dims(pixels, axis=0)
    images.append(preprocess_input(batch))

In [20]:
images[0].shape

(1, 28, 28, 3)

In [21]:
# Stack the per-image (1, 28, 28, 3) batches into a single (n_images, 28, 28, 3)
# array. Concatenating along the batch axis avoids hard-coding the product count.
images1 = np.concatenate(images, axis=0)

In [22]:
# Convolutional autoencoder used to learn a compact representation of each product image
input_img = Input(shape=(28,28,3))

# Encoder: three conv + 2x2 max-pool stages ('same' padding) reduce 28x28 -> 14x14 -> 7x7 -> 4x4
x = Conv2D(16,(3,3), activation='relu', padding='same')(input_img)
x = MaxPooling2D((2,2), padding='same')(x)
x = Conv2D(8,(3,3), activation='relu', padding='same')(x)
x = MaxPooling2D((2,2), padding='same')(x)
x = Conv2D(8,(3,3), activation='relu', padding='same')(x)
# Bottleneck layer, named so it can be looked up later: 4 x 4 x 8 = 128 values per image
encoded = MaxPooling2D((2,2), padding='same', name='encoder')(x)

# Decoder: upsample 4x4 -> 8x8 -> 16x16
x = Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = UpSampling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
# With the default 'valid' padding this 3x3 conv shrinks 16x16 to 14x14, so the
# final upsampling restores the 28x28 input size.
x = Conv2D(16, (3, 3), activation='relu')(x)
x = UpSampling2D((2, 2))(x)
# The training images are rescaled to [-1, 1] by preprocess_input, so the output
# layer uses tanh. A sigmoid output is confined to (0, 1) and can never reproduce
# the negative half of the target range.
decoded = Conv2D(3, (3, 3), activation='tanh', padding='same')(x)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

In [23]:
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('encoder').output)


In [24]:
autoencoder.fit(images1,images1,epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a1b934390>

In [25]:
# Flatten each image's encoder output into one feature row. The row count comes from
# the input array instead of a hard-coded product count.
encode_images = encoder.predict(images1).reshape(len(images1), -1)

In [26]:
image_features_encoded = pd.DataFrame(encode_images)

image_features_encoded['productid'] = products

In [27]:
image_features_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,productid
0,4.54239,2.478498,3.516694,0.516474,3.854292,3.014205,2.798985,4.115569,3.358993,2.595816,...,1.984623,4.647944,0.0,4.255222,0.0,1.956323,0.0,0.764129,0.227381,11145600
1,4.917471,2.477316,3.509499,0.496301,4.086509,3.427862,2.699298,4.138956,4.107854,1.412087,...,1.702601,4.703337,0.0,4.289351,0.0,1.831033,0.0,0.74186,0.130798,11145602
2,4.806842,2.494856,3.612464,0.513181,4.229758,3.550334,2.654141,4.14697,4.03482,1.648726,...,1.691626,4.668511,0.0,4.304811,0.0,1.861156,0.0,0.77353,0.14415,11145612
3,4.750402,2.485854,3.593747,0.532218,4.212662,3.646115,2.555592,4.116335,3.805065,1.605781,...,1.729888,4.634759,0.0,4.28022,0.0,1.931842,0.0,0.789063,0.188124,11145614
4,4.62769,2.475378,3.446891,0.513855,3.99258,3.367963,2.643557,4.123085,3.630613,2.286406,...,2.304986,4.638753,0.0,4.278831,0.0,1.929774,0.0,0.778772,0.204336,11145620


In [28]:
# Per-product share of all units sold: the product's summed Quantity divided by the
# total Quantity across the training data, stored in a 'weight' column
total_quantity = train['Quantity'].sum()

product_sold = (
    train.groupby('productid')['Quantity']
    .sum()
    .div(total_quantity)
    .rename('weight')
    .reset_index()
)

In [29]:
product_sold.shape

(3026, 2)

In [30]:
baseline = train.groupby('productid')['UserId'].count().reset_index().sort_values('UserId',ascending=False).head(10)

In [31]:
# Top 10 products having most transactions
top10 =  baseline['productid'].tolist()

In [32]:
train.head()

Unnamed: 0,UserId,productid,Quantity,OrderDate
21155,0,11659624,1,2018-05-16
100672,1,12406904,1,2018-08-20
27942,2,11660064,1,2018-05-06
66578,3,12658228,1,2018-07-14
66579,3,12360440,1,2018-07-14


In [33]:
product.head()

Unnamed: 0,productid,attribute_name,attributevalue
0,11145600,Fit,37
1,11145600,Sleeve Length,23
2,11145600,Fabric,16
3,11145600,Color,25
4,11145600,Neckline,51


In [34]:
# Reshape the long attribute table to one row per product and one column per attribute.
# NOTE(review): 'mean' is pivot_table's default aggregation, so if a product lists the
# same attribute more than once its coded values are averaged — confirm no duplicates.
product_wide = product.pivot_table(
    values='attributevalue',
    index='productid',
    columns='attribute_name',
    aggfunc='mean',
).reset_index()

In [35]:
product_wide = product_wide.merge(image_features_encoded, on='productid')

In [36]:
product_wide.head()

Unnamed: 0,productid,Category,Collection,Color,Fabric,Fit,Material,Neckline,Season,Sleeve Length,...,118,119,120,121,122,123,124,125,126,127
0,11139192,1.0,,41.0,2.0,0.0,,,42.0,,...,0.853717,2.620972,4.697667,0.0,4.276981,0.0,1.84552,0.0,0.742682,0.148138
1,11139194,1.0,,25.0,2.0,0.0,,,42.0,,...,1.88395,2.99263,4.671412,0.0,4.268265,0.0,1.884751,0.0,0.753026,0.178028
2,11139524,1.0,,95.0,16.0,14.0,,4.0,42.0,23.0,...,2.107737,2.589474,4.637054,0.0,4.22733,0.0,1.985649,0.0,0.757315,0.201884
3,11139560,1.0,,41.0,2.0,14.0,,4.0,42.0,127.0,...,1.632451,2.067833,4.644006,0.0,4.242846,0.0,1.959817,0.0,0.761835,0.20678
4,11139588,1.0,,25.0,58.0,0.0,,,42.0,,...,2.33401,2.915866,4.604879,0.0,4.063735,0.0,2.159934,0.0,0.620177,0.281061


In [37]:
product_wide = product_wide.merge(product_sold, on= 'productid',how='inner')

In [38]:
train_agg = train.groupby(['UserId','productid'])['Quantity'].sum().reset_index()

In [39]:
train_agg.shape

(74915, 3)

In [40]:
train_agg.describe()

Unnamed: 0,UserId,productid,Quantity
count,74915.0,74915.0,74915.0
mean,14493.035173,12328770.0,1.793539
std,8248.096978,558379.3,4.705072
min,0.0,11139190.0,1.0
25%,7354.5,11659940.0,1.0
50%,14580.0,12407460.0,1.0
75%,21813.5,12658330.0,1.0
max,27777.0,14129480.0,275.0


In [41]:
train_agg.head()

Unnamed: 0,UserId,productid,Quantity
0,0,11659624,1
1,1,12406904,1
2,2,11660064,1
3,3,12360440,3
4,3,12371354,3


In [42]:
product_wide.head()

Unnamed: 0,productid,Category,Collection,Color,Fabric,Fit,Material,Neckline,Season,Sleeve Length,...,119,120,121,122,123,124,125,126,127,weight
0,11139192,1.0,,41.0,2.0,0.0,,,42.0,,...,2.620972,4.697667,0.0,4.276981,0.0,1.84552,0.0,0.742682,0.148138,0.000186
1,11139194,1.0,,25.0,2.0,0.0,,,42.0,,...,2.99263,4.671412,0.0,4.268265,0.0,1.884751,0.0,0.753026,0.178028,0.000186
2,11139524,1.0,,95.0,16.0,14.0,,4.0,42.0,23.0,...,2.589474,4.637054,0.0,4.22733,0.0,1.985649,0.0,0.757315,0.201884,0.000804
3,11139560,1.0,,41.0,2.0,14.0,,4.0,42.0,127.0,...,2.067833,4.644006,0.0,4.242846,0.0,1.959817,0.0,0.761835,0.20678,7.4e-05
4,11139588,1.0,,25.0,58.0,0.0,,,42.0,,...,2.915866,4.604879,0.0,4.063735,0.0,2.159934,0.0,0.620177,0.281061,2.2e-05


In [43]:
final_train = train_agg.merge(product_wide, on='productid', how='left').fillna(-1)

In [44]:
final_train = pd.get_dummies(final_train,columns = ['Category','Fit','Material','Season','Collection'])

In [45]:
final_user_mean = final_train.groupby('UserId').mean().reset_index()


In [46]:
final_user_mean = final_user_mean.drop('productid',axis=1)

In [47]:
final_user_mean = final_user_mean.merge(user_days, on ='UserId', how ='inner')

In [48]:
final_user_mean.head()

Unnamed: 0,UserId,Quantity,Color,Fabric,Neckline,Sleeve Length,0,1,2,3,...,Collection_105.0,Collection_109.0,Collection_110.0,Collection_135.0,Collection_143.0,Collection_168.0,Collection_178.0,Collection_212.0,days_diff,User_transactions
0,0,1.0,32.0,27.0,8.0,3.0,4.921899,2.478848,3.714604,0.517505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,1.0,18.0,2.0,8.0,3.0,4.191393,2.669297,5.228006,0.296683,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,1.0,84.0,55.0,4.0,23.0,4.665867,2.47418,3.574455,0.503062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,3,3.0,36.833333,29.333333,3.0,1.666667,4.49297,2.387988,4.461617,0.568939,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,18
4,4,1.0,31.5,69.0,1.5,1.0,4.617154,2.45144,3.816949,0.521133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,2


In [49]:
scaler = StandardScaler()  

In [50]:
final_user_mean.shape

(27778, 187)

In [51]:
# Fit a 25-nearest-neighbour index (cosine distance) on the standardized user features.
# The first column of final_user_mean is left out of the feature matrix.
user_features_scaled = scaler.fit_transform(final_user_mean.iloc[:, 1:])
nbrs = NearestNeighbors(n_neighbors=25, algorithm='auto', metric='cosine').fit(user_features_scaled)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [52]:
# Look up the 25 nearest neighbours of every user. The scaler was already fitted on
# this same matrix when the neighbour index was built, so transform() produces the
# same standardized values as refitting would.
distances, indices = nbrs.kneighbors(scaler.transform(final_user_mean.iloc[:, 1:]))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [53]:
indices.shape

(27778, 25)

In [54]:
# kneighbors() returns row positions in the feature matrix, not user ids. Translate
# each position to the UserId of that row, and index the frame by the querying row's
# own UserId, so later joins on UserId do not rely on ids being 0..n-1 in row order.
user_ids = final_user_mean['UserId'].values
nearest_n = pd.DataFrame(user_ids[indices], index=user_ids)

In [55]:
nearest_n['UserId'] = nearest_n.index

In [56]:
nearest_n.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,UserId
0,17813,22179,8063,8474,1015,10420,517,23480,7117,253,...,11328,3382,18362,0,21502,21339,26712,10085,25880,0
1,9810,20206,11106,10189,1,2146,23257,4647,2223,30,...,15897,14689,4033,11667,23102,13968,2351,26254,21817,1
2,16608,2819,10630,13554,7661,27499,2,17556,14575,5577,...,668,6992,10605,13784,9256,23149,8638,4901,26746,2
3,3,768,19459,11762,7388,23332,22510,12335,20087,25231,...,2520,1978,4906,12057,25390,4443,10088,11691,4455,3
4,4,12231,19833,25383,17326,12921,1707,26545,10560,11777,...,559,19895,12412,3000,181,27,3448,10765,12466,4


In [57]:
nearest_n_long = nearest_n.melt(id_vars='UserId').sort_values(['UserId','variable'])

In [58]:
final_merged = nearest_n_long.merge(train_agg,left_on=['value'], right_on='UserId').sort_values(['UserId_x','variable'])



In [59]:
top10

[12658512,
 12407154,
 12407730,
 12371378,
 11659624,
 12407400,
 12406904,
 12407742,
 12407398,
 12360424]

In [60]:
final_merged.head()

Unnamed: 0,UserId_x,variable,value,UserId_y,productid,Quantity
0,0,0,17813,17813,11659624,1
52,0,1,22179,22179,11659624,1
98,0,2,8063,8063,11659624,1
149,0,3,8474,8474,11659624,1
201,0,4,1015,1015,11659624,1


In [61]:
# Recommend, for each user, the distinct products bought by their nearest-neighbour
# users, kept in the order they first appear in final_merged.
# NOTE(review): neighbours are searched on the same rows the index was fitted on, so a
# user's neighbour list normally contains the user themself and the recommendations can
# include products they already bought — confirm this is intended.
final_pred = final_merged.groupby('UserId_x')['productid'].unique().reset_index()

In [62]:
final_pred.head()

Unnamed: 0,UserId_x,productid
0,0,[11659624]
1,1,[12406904]
2,2,"[11660064, 11714456, 12407198, 11481104, 12407..."
3,3,"[12360440, 12371354, 12407298, 12407522, 12407..."
4,4,"[11659934, 12437468, 12407010, 12437314, 12437..."


In [63]:
top10[:2]
     

[12658512, 12407154]

In [64]:
# Pad with the top-10 most bought products for users having fewer than 10 recommendations
def append_list(row, fallback=None, size=10):
    """Return a recommendation list of at most ``size`` products for one user.

    The user's own recommendations come first, truncated to ``size``. If there
    are fewer than ``size``, the list is padded with products from ``fallback``
    in order, skipping any product already present so that no product is
    recommended twice.

    Parameters
    ----------
    row : iterable
        Product ids recommended for the user.
    fallback : iterable, optional
        Products used for padding; defaults to the module-level ``top10``.
    size : int, optional
        Maximum length of the returned list (default 10).

    Returns
    -------
    list
        Up to ``size`` product ids; shorter only when ``fallback`` does not
        hold enough products that are not already in the list.
    """
    if fallback is None:
        fallback = top10
    recs = list(row)[:size]
    seen = set(recs)
    for candidate in fallback:
        if len(recs) >= size:
            break
        # skip popular products the user's list already contains
        if candidate not in seen:
            recs.append(candidate)
            seen.add(candidate)
    return recs

In [65]:
final_pred['productid'] = final_pred['productid'].apply(append_list)

In [66]:
final_pred.head()

Unnamed: 0,UserId_x,productid
0,0,"[11659624, 12658512, 12407154, 12407730, 12371..."
1,1,"[12406904, 12658512, 12407154, 12407730, 12371..."
2,2,"[11660064, 11714456, 12407198, 11481104, 12407..."
3,3,"[12360440, 12371354, 12407298, 12407522, 12407..."
4,4,"[11659934, 12437468, 12407010, 12437314, 12437..."


In [67]:
final_pred = final_pred.rename({'UserId_x':'UserId','productid':'product_list'},axis=1)

In [68]:
# Keep the predictions of users that appear in the test file and write the submission
submission = final_pred.merge(test, how='inner', on='UserId')
submission.to_csv('capillary_nearest_neighbour25_image128_encode_cosine_user_feat.csv', index=False)


