In [1]:
import pandas as pd
from surprise import Reader, Dataset
from surprise import SVD, evaluate
from surprise import NMF
import matplotlib.pyplot as plt
import math
import numpy as np


In [2]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler  


In [3]:
from keras.models import Model
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model
from keras.layers.convolutional import Conv2D, MaxPooling2D, UpSampling2D
from keras.layers.convolutional import Conv3D, MaxPooling3D, UpSampling3D
from keras.models import Input
from os import listdir
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

Using TensorFlow backend.
  return f(*args, **kwds)


In [4]:
train = pd.read_csv('train.csv')

In [5]:
product = pd.read_csv('product_attributes.csv')

In [6]:
test = pd.read_csv('../test_nFNPSyV.csv')

In [7]:
sample = pd.read_csv('../sample_submission_qfCnaKZ.csv')

In [8]:
train.head()

Unnamed: 0,UserId,productid,Quantity,OrderDate
0,18075,12322648,1,01/04/18
1,6820,12371370,1,01/04/18
2,6820,12973004,1,01/04/18
3,6820,12657560,1,01/04/18
4,6820,11659914,1,01/04/18


In [9]:
train['OrderDate'] = pd.to_datetime(train['OrderDate'])

In [10]:
user_transactions = train.groupby('UserId')['Quantity'].sum().reset_index()

user_transactions = user_transactions.rename({'Quantity':'User_transactions'},axis=1)

In [11]:
user_transactions.head()

Unnamed: 0,UserId,User_transactions
0,0,1
1,1,1
2,2,1
3,3,18
4,4,2


In [12]:
train = train.sort_values(['UserId','OrderDate'])

In [13]:
train['days_diff'] =  train.groupby(['UserId'])['OrderDate'].diff().fillna(0)

In [14]:
train.head()

Unnamed: 0,UserId,productid,Quantity,OrderDate,days_diff
21155,0,11659624,1,2018-05-16,0 days
100672,1,12406904,1,2018-08-20,0 days
27942,2,11660064,1,2018-05-06,0 days
66578,3,12658228,1,2018-07-14,0 days
66579,3,12360440,1,2018-07-14,0 days


In [15]:
train['days_diff'] = train['days_diff'].dt.days

In [16]:
user_days = train.groupby('UserId')['days_diff'].mean().reset_index()

In [17]:
user_days.head()

Unnamed: 0,UserId,days_diff
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.055556
4,4,22.0


In [18]:
# user transactions to same df as avg time delta between transactions of users
user_days['User_transactions'] = user_transactions['User_transactions'].values 
                                

In [19]:
train = train.drop('days_diff',axis=1)

In [20]:
products = list(product['productid'].unique())

In [21]:
images = []
# reading all images
def preprocess_input(x):
    x /= 255.
    x -= 0.5
    x *= 2.
    return x

for i in products:
    image = load_img('./images/{}.jpg'.format(i), target_size=(28, 28))
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    # reshape data for the model
    image = np.expand_dims(image, axis=0)
    # prepare the image for the  model
    image = preprocess_input(image)
    #image = list(image.reshape(192))
    
    images.append(image)

In [22]:
images[0].shape

(1, 28, 28, 3)

In [23]:
images1 = np.array(images).reshape(3015,28,28,3)

In [24]:
# training auto encoder to get better representation of image feature
input_img = Input(shape=(28,28,3))
x = Conv2D(16,(3,3), activation='relu', padding='same')(input_img)
x = MaxPooling2D((2,2), padding='same')(x)
x = Conv2D(8,(3,3), activation='relu', padding='same')(x)
x = MaxPooling2D((2,2), padding='same')(x)
x = Conv2D(8,(3,3), activation='relu', padding='same')(x)
encoded = MaxPooling2D((2,2), padding='same', name='encoder')(x)

x = Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = UpSampling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(16, (3, 3), activation='relu')(x)
x = UpSampling2D((2, 2))(x)
decoded = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(x)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

In [25]:
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('encoder').output)


In [26]:
autoencoder.fit(images1,images1,epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a24caa128>

In [27]:
encode_images= encoder.predict(images1).reshape(3015,128)

In [28]:
image_features_encoded = pd.DataFrame(encode_images)

image_features_encoded['productid'] = products

In [29]:
image_features_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,productid
0,0.0,0.60892,2.855339,0.6774,0.364869,0.047671,2.543807,5.547992,0.0,3.081552,...,3.378014,1.575334,2.786369,3.362122,0.0,1.250614,0.0,2.23212,0.0,11145600
1,0.0,0.337915,2.863397,0.990387,0.562812,0.192729,2.362145,5.56748,0.0,1.505002,...,2.325492,1.541043,2.687998,3.324111,0.0,1.183185,0.0,2.229527,0.0,11145602
2,0.0,0.577448,2.879786,0.911699,0.724002,0.122264,2.538716,5.458682,0.0,1.061169,...,2.681702,1.594841,2.678375,3.329248,0.0,1.250687,0.0,2.200346,0.0,11145612
3,0.0,0.447089,2.855491,0.950547,0.468308,0.224041,2.484273,5.547804,0.0,1.644658,...,2.656973,1.619471,2.7514,3.334593,0.0,1.265954,0.0,2.21633,0.0,11145614
4,0.0,0.40517,2.857222,0.830051,0.622614,0.130299,2.398987,5.542725,0.0,2.51632,...,4.012929,1.600968,2.652336,3.323912,0.0,1.295138,0.0,2.230615,0.0,11145620


In [30]:
# creating normalized quantity of products sold feature
total_quantity = train['Quantity'].sum()

product_sold = train.groupby('productid')['Quantity'].sum().reset_index()

product_sold['weight'] = product_sold['Quantity']/total_quantity

product_sold = product_sold.drop('Quantity', axis=1)

In [31]:
product_sold.shape

(3026, 2)

In [32]:
baseline = train.groupby('productid')['UserId'].count().reset_index().sort_values('UserId',ascending=False).head(10)

In [33]:
top10 =  baseline['productid'].tolist()

In [34]:
train.head()

Unnamed: 0,UserId,productid,Quantity,OrderDate
21155,0,11659624,1,2018-05-16
100672,1,12406904,1,2018-08-20
27942,2,11660064,1,2018-05-06
66578,3,12658228,1,2018-07-14
66579,3,12360440,1,2018-07-14


In [35]:
product.head()

Unnamed: 0,productid,attribute_name,attributevalue
0,11145600,Fit,37
1,11145600,Sleeve Length,23
2,11145600,Fabric,16
3,11145600,Color,25
4,11145600,Neckline,51


In [36]:
product_wide = product.pivot_table(index='productid',columns='attribute_name',values='attributevalue').reset_index()

In [37]:
product_wide = product_wide.merge(image_features_encoded, on='productid')

In [38]:
product_wide.head()

Unnamed: 0,productid,Category,Collection,Color,Fabric,Fit,Material,Neckline,Season,Sleeve Length,...,118,119,120,121,122,123,124,125,126,127
0,11139192,1.0,,41.0,2.0,0.0,,,42.0,,...,1.906636,3.207873,1.530845,2.685173,3.323173,0.0,1.175831,0.0,2.23359,0.0
1,11139194,1.0,,25.0,2.0,0.0,,,42.0,,...,2.125585,4.649667,1.603436,2.664587,3.354484,0.0,1.303225,0.0,2.197092,0.0
2,11139524,1.0,,95.0,16.0,14.0,,4.0,42.0,23.0,...,1.735821,4.606104,1.595468,2.826864,3.400774,0.0,1.289103,0.0,2.243676,0.0
3,11139560,1.0,,41.0,2.0,14.0,,4.0,42.0,127.0,...,1.739807,3.737457,1.554352,2.757056,3.357213,0.0,1.192951,0.0,2.233353,0.0
4,11139588,1.0,,25.0,58.0,0.0,,,42.0,,...,0.606797,3.789466,1.687436,2.935009,3.309062,0.0,1.262846,0.0,2.289904,0.0


In [39]:
product_wide = product_wide.merge(product_sold, on= 'productid',how='inner')

In [40]:
train_agg = train.groupby(['UserId','productid'])['Quantity'].sum().reset_index()

In [41]:
train_agg.shape

(74915, 3)

In [42]:
train_agg.describe()

Unnamed: 0,UserId,productid,Quantity
count,74915.0,74915.0,74915.0
mean,14493.035173,12328770.0,1.793539
std,8248.096978,558379.3,4.705072
min,0.0,11139190.0,1.0
25%,7354.5,11659940.0,1.0
50%,14580.0,12407460.0,1.0
75%,21813.5,12658330.0,1.0
max,27777.0,14129480.0,275.0


In [43]:
train_agg.head()

Unnamed: 0,UserId,productid,Quantity
0,0,11659624,1
1,1,12406904,1
2,2,11660064,1
3,3,12360440,3
4,3,12371354,3


In [44]:
product_wide.head()

Unnamed: 0,productid,Category,Collection,Color,Fabric,Fit,Material,Neckline,Season,Sleeve Length,...,119,120,121,122,123,124,125,126,127,weight
0,11139192,1.0,,41.0,2.0,0.0,,,42.0,,...,3.207873,1.530845,2.685173,3.323173,0.0,1.175831,0.0,2.23359,0.0,0.000186
1,11139194,1.0,,25.0,2.0,0.0,,,42.0,,...,4.649667,1.603436,2.664587,3.354484,0.0,1.303225,0.0,2.197092,0.0,0.000186
2,11139524,1.0,,95.0,16.0,14.0,,4.0,42.0,23.0,...,4.606104,1.595468,2.826864,3.400774,0.0,1.289103,0.0,2.243676,0.0,0.000804
3,11139560,1.0,,41.0,2.0,14.0,,4.0,42.0,127.0,...,3.737457,1.554352,2.757056,3.357213,0.0,1.192951,0.0,2.233353,0.0,7.4e-05
4,11139588,1.0,,25.0,58.0,0.0,,,42.0,,...,3.789466,1.687436,2.935009,3.309062,0.0,1.262846,0.0,2.289904,0.0,2.2e-05


In [45]:
final_train = train_agg.merge(product_wide, on='productid', how='left').fillna(-1)

In [46]:
#for i in final_train.columns:
 #   print(i,final_train[i].nunique())

In [47]:
final_train = pd.get_dummies(final_train,columns = ['Category','Fit','Material','Season','Collection'])

In [48]:
final_user_mean = final_train.groupby('UserId').mean().reset_index()


In [49]:
final_user_mean = final_user_mean.drop('productid',axis=1)

In [50]:
final_user_mean = final_user_mean.merge(user_days, on ='UserId', how ='inner')

In [51]:
final_user_mean.head()

Unnamed: 0,UserId,Quantity,Color,Fabric,Neckline,Sleeve Length,0,1,2,3,...,Collection_105.0,Collection_109.0,Collection_110.0,Collection_135.0,Collection_143.0,Collection_168.0,Collection_178.0,Collection_212.0,days_diff,User_transactions
0,0,1.0,32.0,27.0,8.0,3.0,0.0,0.631596,2.844937,0.848973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,1.0,18.0,2.0,8.0,3.0,0.0,2.848488,2.566079,0.218537,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,1.0,84.0,55.0,4.0,23.0,0.0,0.433039,2.860235,1.032936,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,3,3.0,36.833333,29.333333,3.0,1.666667,0.0,1.267949,2.64369,0.652305,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,18
4,4,1.0,31.5,69.0,1.5,1.0,0.0,1.230961,2.771392,0.574841,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,2


In [52]:
scaler = StandardScaler()  

In [53]:
final_user_mean.shape

(27778, 187)

In [54]:
nbrs = NearestNeighbors(n_neighbors=17, algorithm='auto',metric='cosine').fit(scaler.fit_transform(final_user_mean.iloc[:,1:]))


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [55]:
distances, indices = nbrs.kneighbors(scaler.fit_transform(final_user_mean.iloc[:,1:]))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [56]:
indices.shape

(27778, 17)

In [57]:
nearest_n = pd.DataFrame(indices)

In [58]:
nearest_n['UserId'] = nearest_n.index

In [59]:
nearest_n.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,UserId
0,8474,18362,17813,6752,21695,356,0,18810,20275,10085,27460,4599,10420,21339,2585,20486,2393,0
1,26254,15897,10594,2714,7602,21618,6692,25221,4205,20825,23102,8830,12046,14689,23257,2223,23883,1
2,17556,3057,2,5577,2819,14575,12420,16608,13554,10630,7661,27499,7181,21067,18243,668,12179,2
3,3,15107,21421,19459,12335,4443,16053,547,18128,17841,18921,5615,19397,7219,26126,10839,7571,3
4,4,12231,19833,25383,17326,12921,21569,181,26545,11777,12466,10560,19648,7796,1707,13929,9439,4


In [60]:
nearest_n_long = nearest_n.melt(id_vars='UserId').sort_values(['UserId','variable'])

In [61]:
final_merged = nearest_n_long.merge(train_agg,left_on=['value'], right_on='UserId').sort_values(['UserId_x','variable'])



In [62]:
top10

[12658512,
 12407154,
 12407730,
 12371378,
 11659624,
 12407400,
 12406904,
 12407742,
 12407398,
 12360424]

In [63]:
final_merged.head()

Unnamed: 0,UserId_x,variable,value,UserId_y,productid,Quantity
0,0,0,8474,8474,11659624,1
43,0,1,18362,18362,11659624,1
86,0,2,17813,17813,11659624,1
127,0,3,6752,6752,11659624,1
166,0,4,21695,21695,11659624,1


In [64]:
final_pred = final_merged.groupby('UserId_x')['productid'].unique().reset_index()

In [65]:
final_pred.head()

Unnamed: 0,UserId_x,productid
0,0,[11659624]
1,1,[12406904]
2,2,"[11660064, 12407198, 11481104, 11714456]"
3,3,"[12360440, 12371354, 12407298, 12407522, 12407..."
4,4,"[11659934, 12437468, 12407010, 12437314, 12437..."


In [66]:
top10[:2]
     

[12658512, 12407154]

In [67]:
# add top10 most bought products for users haveing less than 10 recommendations
def append_list(row):
    ct=0
    row = list(row)
    if len(row) < 10:
        while(len(row)<10):
            row.append(top10[ct])
            ct = ct + 1
        return row
    else:
        return row[:10]

In [68]:
final_pred['productid'] = final_pred['productid'].apply(append_list)

In [69]:
final_pred.head()

Unnamed: 0,UserId_x,productid
0,0,"[11659624, 12658512, 12407154, 12407730, 12371..."
1,1,"[12406904, 12658512, 12407154, 12407730, 12371..."
2,2,"[11660064, 12407198, 11481104, 11714456, 12658..."
3,3,"[12360440, 12371354, 12407298, 12407522, 12407..."
4,4,"[11659934, 12437468, 12407010, 12437314, 12437..."


In [70]:
final_pred = final_pred.rename({'UserId_x':'UserId','productid':'product_list'},axis=1)

In [71]:
#final_pred.merge(test, on= 'UserId', how='inner').to_csv('capillary_nearest_neighbour17_image128_encode_cosine_user_feat.csv',index=False)


