In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('df_modcloth.csv',usecols=['user_id','item_id','rating','category'])
df.head()

Unnamed: 0,item_id,user_id,rating,category
0,7443,Alex,4,Dresses
1,7443,carolyn.agan,3,Dresses
2,7443,Robyn,4,Dresses
3,7443,De,4,Dresses
4,7443,tasha,4,Dresses


In [3]:
combine_cloth_rating = df.dropna(axis=0 , subset =['category'])
cloth_ratingCount = (combine_cloth_rating.
                    groupby(by = ['category'])['rating'].
                    count().
                    reset_index().
                    rename(columns = {'rating': 'totalRatingCount'})
                    [['category','totalRatingCount']]
                    )
cloth_ratingCount.head()

Unnamed: 0,category,totalRatingCount
0,Bottoms,23625
1,Dresses,34160
2,Outerwear,7131
3,Tops,34977


In [4]:
rating_with_totalRatingCount = combine_cloth_rating.merge(cloth_ratingCount, left_on ='category', right_on ='category',
                                                         how  ='left')
rating_with_totalRatingCount.head()

Unnamed: 0,item_id,user_id,rating,category,totalRatingCount
0,7443,Alex,4,Dresses,34160
1,7443,carolyn.agan,3,Dresses,34160
2,7443,Robyn,4,Dresses,34160
3,7443,De,4,Dresses,34160
4,7443,tasha,4,Dresses,34160


In [5]:
pd.set_option('display.float_format',lambda x: '%.3f' % x)
print(cloth_ratingCount['totalRatingCount'].describe())

count       4.000
mean    24973.250
std     12969.645
min      7131.000
25%     19501.500
50%     28892.500
75%     34364.250
max     34977.000
Name: totalRatingCount, dtype: float64


In [6]:
popularity_threshold = 50
rating_popular_cloth= rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_cloth.head()

Unnamed: 0,item_id,user_id,rating,category,totalRatingCount
0,7443,Alex,4,Dresses,34160
1,7443,carolyn.agan,3,Dresses,34160
2,7443,Robyn,4,Dresses,34160
3,7443,De,4,Dresses,34160
4,7443,tasha,4,Dresses,34160


In [7]:
rating_popular_cloth.shape

(99893, 5)

In [8]:
cloth_features_df= rating_popular_cloth.pivot_table(index='category',columns='user_id',values='rating').fillna(0)
cloth_features_df.head()

user_id,"""Ferrari"")",#,#1dad,'Chelle,'Tree',(usually),-L,.,..,00erin,...,zuel,zugai01,zulemaphone,zumbafitnesscarly,zumbaneko,zurajohnson,zuzu_zoom,🇦🇺,🐻,😊
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bottoms,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0
Dresses,3.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,3.0,2.0,...,0.0,0.0,5.0,0.0,4.0,3.0,4.0,0.0,0.0,5.0
Outerwear,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
Tops,5.0,5.0,0.0,3.0,4.5,0.0,5.0,0.0,0.0,0.0,...,5.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,0.0


In [10]:
from scipy.sparse import csr_matrix

cloth_features_df_matrix = csr_matrix(cloth_features_df.values)

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm='brute')
model_knn.fit(cloth_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [11]:
cloth_features_df.shape


(4, 44783)

In [12]:
query_index = np.random.choice(cloth_features_df.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(cloth_features_df.iloc[query_index,:].values.reshape(1,-1), n_neighbors = 4)

3


In [13]:
cloth_features_df.head()

user_id,"""Ferrari"")",#,#1dad,'Chelle,'Tree',(usually),-L,.,..,00erin,...,zuel,zugai01,zulemaphone,zumbafitnesscarly,zumbaneko,zurajohnson,zuzu_zoom,🇦🇺,🐻,😊
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bottoms,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0
Dresses,3.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,3.0,2.0,...,0.0,0.0,5.0,0.0,4.0,3.0,4.0,0.0,0.0,5.0
Outerwear,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
Tops,5.0,5.0,0.0,3.0,4.5,0.0,5.0,0.0,0.0,0.0,...,5.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,0.0


In [14]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(cloth_features_df.index[query_index]))
    else:
        print('[0]: {1}, with distance of {2}:'.format(i, cloth_features_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Tops:

[0]: Bottoms, with distance of 0.7170639207300855:
[0]: Dresses, with distance of 0.7424667664499487:
[0]: Outerwear, with distance of 0.7959682573451936:
