# To do for 07282017:
1. User_filter:
   Filter user based on certain features, e.g., 
   consistent with theme, certain time of viewing, 
   or certain time interval before each item viewing.
2. Recommendation core:
   It will basically be the collaborative filter (CF),
   but instead of using real items, I'd like to use 
   features extracted from CNN and dimension-reduced
   by tSNE to maybe 20 D.
3. Processor:
   Input are
   a. log of user history
   b. item features
   Output are
   a. Top N rank of recommendation item for each user
4. Evaluator:
   Evaluate whether the user buy the item within the top
   N rank of recommended items.

# After trial run:
* tSNE for this amount of sample and the dimension we want may not be feasible. Need to try small portion and time it or try PCA instead

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [2]:
os.chdir('/Users/Walkon302/Desktop/deep-learning-models-master/view2buy')

In [3]:
# Read the preprocessed file, containing the user profile and item features from view2buy folder
df = pd.read_pickle('user_fea_for_eval.pkl')

In [4]:
df.head()

Unnamed: 0,0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features,buy_features
0,2469583035\t4199682998971011301\t10013436\t334...,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."
1,2469583035\t4199682998971011301\t10013436\t334...,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."
2,2469583035\t4199682998971011301\t10013436\t334...,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."
3,1488725183\t4199682998971011301\t10013436\t334...,1488725183,4199682998971011301,10013436,334,235671027621670949,10003862,334,180564,1,22,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."
4,2469583035\t4199682998971011301\t10013436\t334...,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."


In [5]:
# Drop the first column, which is the original data format.
df.drop('0', axis = 1, inplace = True)

In [6]:
# Check the data
df.head()

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features,buy_features
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."
3,1488725183,4199682998971011301,10013436,334,235671027621670949,10003862,334,180564,1,22,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."
4,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757..."


In [7]:
# Remove the item that contains the seam buy and view spu to remove the bias
#df = df.query('buy_spu != view_spu')

In [8]:
# Slice the data into 10k items
df = df.iloc[0:10000, :]

In [9]:
# Calculate the average view features for all view items per user
avg_view_fea = pd.DataFrame(df.groupby(['user_id', 'buy_spu'])['view_secondes'].mean())

In [10]:
# Reset the index and rename the column
avg_view_fea.reset_index(inplace=True)
avg_view_fea.rename(columns = {'view_secondes':'avg_view_fea'}, inplace=True)

In [11]:
# Check the data
avg_view_fea.head()

Unnamed: 0,user_id,buy_spu,avg_view_fea
0,814009,77763563263074335,13.436364
1,1165283,77200616039542809,21.625
2,9873479,77200616039542809,34.863636
3,63236390,292247525162119174,19.736842
4,76700950,95777984703225857,155.0


In [12]:
# Merge avg item view into data
df = pd.merge(df, avg_view_fea, on=['user_id', 'buy_spu'])

In [13]:
# Calculate the weights for view item vec
df['weight_of_view'] = df['view_secondes']/df['avg_view_fea']

In [14]:
df.head()

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features,buy_features,avg_view_fea,weight_of_view
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,1.299352
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.664113
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.317619
3,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.202121
4,2469583035,4199682998971011301,10013436,334,296751124749754369,10005367,334,427866,2,12,"[0.066, 0.328, 0.043, 0.0, 0.062, 0.016, 0.303...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.346494


In [34]:
# Generate view_item_vec and buy_item_vec
view_buy_item_fea = pd.concat([df['view_features'], df['buy_features']], axis = 0)

In [36]:
view_buy_item_fea.shape

(20000,)

## Try TSNE and time it
* It turns out that TSNE is too time consuming even for small set of data. It is also because of how I transformed the data. Thus, in the PCA, I used list in the beginning and then transform all data into numpy array at once, which is much faster.

In [None]:
# Generate TSNE model
model = TSNE(n_components=10, random_state=0)

In [121]:
# Time the tSNE with 250 samples
%%time
a = pd.DataFrame()
for i, j in enumerate(view_item_vec.iloc[0:250]):
    a = pd.concat([a, pd.DataFrame(j).transpose()], axis = 0)
vt = model.fit_transform(a)

CPU times: user 22.3 s, sys: 501 ms, total: 22.8 s
Wall time: 22.8 s


In [114]:
# Time the tSNE with 500 samples
%%time
a = pd.DataFrame()
for i, j in enumerate(view_item_vec.iloc[0:500]):
    a = pd.concat([a, pd.DataFrame(j).transpose()], axis = 0)
vt = model.fit_transform(a)

CPU times: user 1min 23s, sys: 2.57 s, total: 1min 25s
Wall time: 1min 31s


In [113]:
# Time the tSNE with 1000 samples
%%time
a = pd.DataFrame()
for i, j in enumerate(view_item_vec.iloc[0:1000]):
    a = pd.concat([a, pd.DataFrame(j).transpose()], axis = 0)
vt = model.fit_transform(a)

CPU times: user 4min 25s, sys: 6.05 s, total: 4min 31s
Wall time: 4min 33s


## Try PCA instead
* PCA looks resonable. We can process 300k data around 30 secs if it does not blow up my RAM. I will proceed with this setting for first try

In [37]:
# Generate TSNE model
model = PCA(n_components=200, random_state=0)

# Append all view_items for PCA processing

In [38]:
%%time
view_item = []
for i in view_buy_item_fea:
    view_item.append(i)
view_item= np.array(view_item)

CPU times: user 1.17 s, sys: 542 ms, total: 1.72 s
Wall time: 1.77 s


In [39]:
%%time
pca_view_vec = model.fit_transform(view_item)

CPU times: user 11 s, sys: 789 ms, total: 11.8 s
Wall time: 6.75 s


In [40]:
# 200 dimensions of PCA can explain 85% of variables. Beyond that, e.g., 300 D, my computer will run out of memory (8g)
sum(model.explained_variance_ratio_)

0.90980608640901406

# Append all buy_items for PCA processing

In [41]:
# Incert pca result to data
df['pca_view'] = pca_view_vec[0:10000].tolist()
df['pca_buy'] = pca_view_vec[10000:20000].tolist()

In [42]:
# Check the data
df.head()

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features,buy_features,avg_view_fea,weight_of_view,pca_view,pca_buy
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,1.299352,"[-4.18441352754, -4.98522684557, 7.40010898649...","[-2.45874875255, 0.950284632032, 5.98234076728..."
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.664113,"[3.83780037191, 7.88132568231, 0.937903291471,...","[-2.45874875255, 0.950284632032, 5.98234076728..."
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.317619,"[7.77165030033, 7.62261939761, -0.895622345806...","[-2.45874875255, 0.950284632032, 5.98234076728..."
3,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.202121,"[-3.22576025396, -4.16373835223, 5.30225410798...","[-2.45874875255, 0.950284632032, 5.98234076728..."
4,2469583035,4199682998971011301,10013436,334,296751124749754369,10005367,334,427866,2,12,"[0.066, 0.328, 0.043, 0.0, 0.062, 0.016, 0.303...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.346494,"[1.43794255292, 10.9324726458, -2.09193256963,...","[-2.45874875255, 0.950284632032, 5.98234076728..."


In [44]:
# Check the data
df.head()

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features,buy_features,avg_view_fea,weight_of_view,pca_view,pca_buy,weighted_view_pca
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,1.299352,"[-4.18441352754, -4.98522684557, 7.40010898649...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-5.43702523761, -6.47756346168, 9.61534491173..."
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.664113,"[3.83780037191, 7.88132568231, 0.937903291471,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.5487336589, 5.23409195283, 0.6228739007, -1..."
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.317619,"[7.77165030033, 7.62261939761, -0.895622345806...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.4684263476, 2.42109125239, -0.284466967819,..."
3,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.202121,"[-3.22576025396, -4.16373835223, 5.30225410798...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-0.651995148561, -0.841580586219, 1.071698974..."
4,2469583035,4199682998971011301,10013436,334,296751124749754369,10005367,334,427866,2,12,"[0.066, 0.328, 0.043, 0.0, 0.062, 0.016, 0.303...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.346494,"[1.43794255292, 10.9324726458, -2.09193256963,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[0.498238197476, 3.78803412829, -0.72484169177..."


# Save the file for further processing

In [45]:
#df.to_pickle('top10k_user_pca.pkl')

In [46]:
# Define function
def dot(K, L):
    if len(K) != len(L): return 0
    return sum(i[0]*i[1] for i in zip(K, L))

def similarity(item_1, item_2):
    return dot(item_1, item_2) / np.sqrt(dot(item_1, item_1)*dot(item_2, item_2))

def average(lists):
    return [np.mean(i) for i in zip(*[l for l in lists])]

In [109]:
df = pd.read_pickle('top10k_user_pca.pkl')

In [128]:
# Calculate the weighted pca_view
df['weighted_view'] = df.apply(lambda x: [y*x['weight_of_view'] for y in x['buy_features']], axis=1)

In [99]:
df.head()

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features,buy_features,avg_view_fea,weight_of_view,pca_view,pca_buy,weighted_view_pca
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,1.299352,"[-4.18441352754, -4.98522684557, 7.40010898649...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-5.43702523761, -6.47756346168, 9.61534491173..."
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.664113,"[3.83780037191, 7.88132568231, 0.937903291471,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.5487336589, 5.23409195283, 0.6228739007, -1..."
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.317619,"[7.77165030033, 7.62261939761, -0.895622345806...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.4684263476, 2.42109125239, -0.284466967819,..."
3,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.202121,"[-3.22576025396, -4.16373835223, 5.30225410798...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-0.651995148561, -0.841580586219, 1.071698974..."
4,2469583035,4199682998971011301,10013436,334,296751124749754369,10005367,334,427866,2,12,"[0.066, 0.328, 0.043, 0.0, 0.062, 0.016, 0.303...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.346494,"[1.43794255292, 10.9324726458, -2.09193256963,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[0.498238197476, 3.78803412829, -0.72484169177..."


In [110]:
ori_user_fea = df.groupby(['user_id'])['view_features'].apply(lambda x: average(x))

In [111]:
ori_user_fea = pd.DataFrame(ori_user_fea)

In [112]:
ori_user_fea=ori_user_fea.reset_index()

In [113]:
df = pd.merge(df, ori_user_fea, on='user_id')

In [114]:
df.head()

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features_x,buy_features,avg_view_fea,weight_of_view,pca_view,pca_buy,weighted_view_pca,view_features_y
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,1.299352,"[-4.18441352754, -4.98522684557, 7.40010898649...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-5.43702523761, -6.47756346168, 9.61534491173...","[0.195346938776, 0.549204081633, 0.08559183673..."
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.664113,"[3.83780037191, 7.88132568231, 0.937903291471,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.5487336589, 5.23409195283, 0.6228739007, -1...","[0.195346938776, 0.549204081633, 0.08559183673..."
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.317619,"[7.77165030033, 7.62261939761, -0.895622345806...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.4684263476, 2.42109125239, -0.284466967819,...","[0.195346938776, 0.549204081633, 0.08559183673..."
3,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.202121,"[-3.22576025396, -4.16373835223, 5.30225410798...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-0.651995148561, -0.841580586219, 1.071698974...","[0.195346938776, 0.549204081633, 0.08559183673..."
4,2469583035,4199682998971011301,10013436,334,296751124749754369,10005367,334,427866,2,12,"[0.066, 0.328, 0.043, 0.0, 0.062, 0.016, 0.303...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.346494,"[1.43794255292, 10.9324726458, -2.09193256963,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[0.498238197476, 3.78803412829, -0.72484169177...","[0.195346938776, 0.549204081633, 0.08559183673..."


In [115]:
df.rename(columns = {'view_features_y':'user_features'}, inplace = True)

In [106]:
df.head()

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features,buy_features,avg_view_fea,weight_of_view,pca_view,pca_buy,weighted_view_pca_x,user_features
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,1.299352,"[-4.18441352754, -4.98522684557, 7.40010898649...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-5.43702523761, -6.47756346168, 9.61534491173...","[-0.488648716682, 2.68093043688, 0.09369500442..."
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.664113,"[3.83780037191, 7.88132568231, 0.937903291471,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.5487336589, 5.23409195283, 0.6228739007, -1...","[-0.488648716682, 2.68093043688, 0.09369500442..."
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.317619,"[7.77165030033, 7.62261939761, -0.895622345806...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.4684263476, 2.42109125239, -0.284466967819,...","[-0.488648716682, 2.68093043688, 0.09369500442..."
3,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.202121,"[-3.22576025396, -4.16373835223, 5.30225410798...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-0.651995148561, -0.841580586219, 1.071698974...","[-0.488648716682, 2.68093043688, 0.09369500442..."
4,2469583035,4199682998971011301,10013436,334,296751124749754369,10005367,334,427866,2,12,"[0.066, 0.328, 0.043, 0.0, 0.062, 0.016, 0.303...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.346494,"[1.43794255292, 10.9324726458, -2.09193256963,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[0.498238197476, 3.78803412829, -0.72484169177...","[-0.488648716682, 2.68093043688, 0.09369500442..."


In [118]:
df['sim'] = df.apply(lambda x: similarity(x['buy_features'], x['user_features']), axis=1)

In [119]:
df.head()

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features_x,buy_features,avg_view_fea,weight_of_view,pca_view,pca_buy,weighted_view_pca,user_features,sim
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,1.299352,"[-4.18441352754, -4.98522684557, 7.40010898649...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-5.43702523761, -6.47756346168, 9.61534491173...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.664113,"[3.83780037191, 7.88132568231, 0.937903291471,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.5487336589, 5.23409195283, 0.6228739007, -1...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.317619,"[7.77165030033, 7.62261939761, -0.895622345806...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.4684263476, 2.42109125239, -0.284466967819,...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427
3,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.202121,"[-3.22576025396, -4.16373835223, 5.30225410798...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-0.651995148561, -0.841580586219, 1.071698974...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427
4,2469583035,4199682998971011301,10013436,334,296751124749754369,10005367,334,427866,2,12,"[0.066, 0.328, 0.043, 0.0, 0.062, 0.016, 0.303...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.346494,"[1.43794255292, 10.9324726458, -2.09193256963,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[0.498238197476, 3.78803412829, -0.72484169177...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427


In [120]:
df['rank'] = df.groupby('user_id')['sim'].rank(ascending=False)

In [121]:
df

Unnamed: 0,user_id,buy_spu,buy_sn,buy_ct3,view_spu,view_sn,view_ct3,time_interval,view_cnt,view_secondes,view_features_x,buy_features,avg_view_fea,weight_of_view,pca_view,pca_buy,weighted_view_pca,user_features,sim,rank
0,2469583035,4199682998971011301,10013436,334,220189917005230097,10013861,334,37496,7,45,"[0.621, 0.542, 0.0, 0.369, 0.062, 0.039, 0.103...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,1.299352,"[-4.18441352754, -4.98522684557, 7.40010898649...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-5.43702523761, -6.47756346168, 9.61534491173...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
1,2469583035,4199682998971011301,10013436,334,234826617504419925,10003862,334,170826,2,23,"[0.15, 0.98, 0.104, 1.295, 0.111, 0.0, 0.0, 0....","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.664113,"[3.83780037191, 7.88132568231, 0.937903291471,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.5487336589, 5.23409195283, 0.6228739007, -1...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
2,2469583035,4199682998971011301,10013436,334,235671027621670949,10003862,334,426968,2,11,"[0.106, 0.027, 0.0, 1.398, 0.096, 0.021, 0.072...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.317619,"[7.77165030033, 7.62261939761, -0.895622345806...","[-2.45874875255, 0.950284632032, 5.98234076728...","[2.4684263476, 2.42109125239, -0.284466967819,...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
3,2469583035,4199682998971011301,10013436,334,245522675097001998,10026364,334,83993,2,7,"[0.019, 1.415, 0.007, 0.088, 0.055, 0.015, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.202121,"[-3.22576025396, -4.16373835223, 5.30225410798...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-0.651995148561, -0.841580586219, 1.071698974...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
4,2469583035,4199682998971011301,10013436,334,296751124749754369,10005367,334,427866,2,12,"[0.066, 0.328, 0.043, 0.0, 0.062, 0.016, 0.303...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.346494,"[1.43794255292, 10.9324726458, -2.09193256963,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[0.498238197476, 3.78803412829, -0.72484169177...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
5,2469583035,4199682998971011301,10013436,334,317580251858771991,10013436,334,79637,1,2,"[0.001, 1.924, 0.067, 2.464, 0.0, 0.0, 0.157, ...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.057749,"[-0.367162703196, -2.91958595455, 0.5847105535...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-0.0212032674798, -0.168603078106, 0.03376643...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
6,2469583035,4199682998971011301,10013436,334,36105301270949891,10026364,334,84018,3,16,"[0.274, 0.376, 0.0, 0.004, 0.052, 0.074, 0.161...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.461992,"[-1.46307008459, -3.68472813598, 5.32224960831...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-0.675926308966, -1.70231400036, 2.4588354112...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
7,2469583035,4199682998971011301,10013436,334,437770064827043954,10013861,334,80933,2,32,"[0.038, 0.239, 0.0, 0.253, 0.196, 0.081, 0.278...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.923984,"[-1.82415989318, -1.23091214842, 5.06093770573...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-1.6854936432, -1.13734251545, 4.67622293611,...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
8,2469583035,4199682998971011301,10013436,334,452688234602614802,10021072,334,427802,2,6,"[0.006, 0.723, 0.004, 2.523, 0.212, 0.039, 0.3...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.173247,"[-6.22685705728, 2.40710692796, 0.261954686567...","[-2.45874875255, 0.950284632032, 5.98234076728...","[-1.07878372118, 0.417023828415, 0.04538283903...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0
9,2469583035,4199682998971011301,10013436,334,453251182129659946,10013861,334,80226,2,6,"[0.208, 0.378, 0.114, 1.377, 0.022, 0.108, 0.0...","[0.091, 0.805, 0.0, 0.591, 0.981, 0.026, 0.757...",34.632653,0.173247,"[4.00748782169, -0.241093712069, 4.0440555338,...","[-2.45874875255, 0.950284632032, 5.98234076728...","[0.694284867163, -0.0417687397455, 0.700620110...","[0.195346938776, 0.549204081633, 0.08559183673...",0.801427,25.0


In [122]:
float(len(df.query('buy_spu == view_spu & rank <= 6')))/float(len(df.query('buy_spu == view_spu'))) * 100

35.76642335766424

## It seems that PCA did not alter rank, and the similarity is also lower than directly use the original features. Also, weighted PCA features did not alter rank as well.
# Need to proceed to CF and other methods.