# Collaborative Filtering on Grailed Data

## Setup

In [5]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats

from IPython.display import display, Image, HTML
from collections import defaultdict
from multiprocessing.dummy import Pool
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import jaccard_similarity_score
from sklearn.preprocessing import normalize
from scipy.io import mmread, mmwrite
from scipy.sparse import lil_matrix
from scipy.stats import pearsonr

# Draw every matplotlib figure in this notebook with the 'ggplot' style sheet.
plt.style.use('ggplot')

In [2]:
# Load the four scraper exports in one pass; the files are read in the order
# listed and bound to the names on the left in that same order.
listings_df, users_df, listing_user_follow_df, photos_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../scraper/create_dataset/listings.csv',
        '../scraper/create_dataset/users.csv',
        '../scraper/create_dataset/listing_user_follow.csv',
        '../scraper/create_dataset/photos.csv',
    )
)

In [3]:
# Index listings by their id so rows can be looked up by listing id;
# drop=False keeps 'id' available as a regular column too.
listings_df = listings_df.set_index('id', drop=False)

In [4]:
# Per-designer count of non-null values in every column; the 'id' column of
# the result is therefore the number of listings for that designer.
designer_counts = listings_df.groupby('designer_name').count()
listing_totals = designer_counts['id']
# Restrict the analysis to designers with more than 25 listings.
designers_of_interest = set(listing_totals[listing_totals > 25].index.values)

## Item Suggestions Based on Followers

Creating the Item-User matrix

In [11]:
# Only listings with at least one follower get a row in the matrix.
has_followers = listings_df['follower_count'] > 0
listings_with_followers = listings_df.loc[has_followers]
# listing_list maps matrix row -> listing id; listing_map is the inverse.
listing_list = list(listings_with_followers.index)
listing_map = dict(zip(listing_list, range(len(listing_list))))

# Matrix dimensions: one row per followed listing, one column per possible
# user id (ids are used directly as column indices, hence the +1).
max_listing_id = len(listing_list)
max_user_id = max(users_df['id']) + 1

In [12]:
# # save for post, use the stored file for convenience

# user_item_matrix = lil_matrix((max_listing_id, max_user_id))

# for index, row in listing_user_follow_df.iterrows():
#     if row['listing_id'] in listing_map:
#         user_item_matrix[listing_map[row['listing_id']], row['user_id']] = 1

# mmwrite('cached_matrices/user_item_with_followers.mtx', user_item_matrix)

In [13]:
# Load the cached matrix from disk and convert it to LIL format.
# Presumably this is the file written by the commented-out cell above
# (rows = listing_map positions, columns = user ids) -- verify if in doubt.
user_item_matrix = mmread('cached_matrices/user_item_with_followers.mtx').tolil()

Item-Item Collaborative Filtering

In [14]:
def get_suggestions(item_of_interest_id, user_item_matrix, search_space):
    """Rank candidate rows by Pearson correlation with a reference row.

    Parameters
    ----------
    item_of_interest_id : int
        Row index of the reference item in ``user_item_matrix``.
    user_item_matrix : scipy sparse matrix
        One row per item; must support ``matrix[row, :].toarray()``.
    search_space : sequence of int
        Candidate row indices. It is both iterated and indexed by position,
        so a ``range`` or a list works.

    Returns
    -------
    list
        The elements of ``search_space`` ordered from most to least
        correlated with the reference row. The reference row itself and any
        row whose correlation is undefined (NaN) are scored -1, so they sort
        together with the least similar candidates.
    """
    # Densify the reference row once instead of once per candidate.
    reference_row = user_item_matrix[item_of_interest_id, :].toarray()[0]

    def pearson(item_to_compare_id):
        # An item must never be suggested for itself; skip the computation.
        if item_to_compare_id == item_of_interest_id:
            return np.float64(-1)
        correlation, _ = pearsonr(
            user_item_matrix[item_to_compare_id, :].toarray()[0],
            reference_row,
        )
        # pearsonr yields NaN when either vector is constant.
        if np.isnan(correlation):
            return np.float64(-1)
        return correlation

    pool = Pool()
    try:
        pearson_values = pool.map(pearson, search_space)
    finally:
        # Release the worker threads even if a comparison raised.
        pool.close()
        pool.join()

    sorted_suggestions = np.argsort(pearson_values)[::-1]
    return [search_space[idx] for idx in sorted_suggestions]

def print_suggestions(suggestions, num_suggestions=10):
    """Display the leading suggestions as a row of HTML cards.

    Each card shows the listing's first photo (when it has one), its designer
    name and its title, looked up in the module-level ``listings_df`` and
    ``photos_df`` frames.

    Parameters
    ----------
    suggestions : sequence
        Listing ids, best match first.
    num_suggestions : int, optional
        How many of the leading ids to render (default 10).

    Notes
    -----
    Ids that are absent from ``listings_df`` are skipped, and listings with
    no row in ``photos_df`` are rendered without an image, so one incomplete
    record no longer aborts the whole display with an IndexError.
    NOTE(review): designer names and titles are interpolated into the HTML
    unescaped, as before -- escape them if the data may contain markup.
    """
    cards = []
    for listing_id in suggestions[:num_suggestions]:
        suggested_listing = listings_df[listings_df['id'] == listing_id]
        if suggested_listing.empty:
            continue
        suggested_listing_designer = suggested_listing['designer_name'].values[0]
        suggested_listing_title = suggested_listing['title'].values[0]
        photo_urls = photos_df[photos_df['listing_id'] == listing_id]['url'].values

        card = '<div style="display: inline-block; padding: {}px;">'.format(5)
        if len(photo_urls) > 0:
            card += '<img src="{}" style="height: {}px;"/>'.format(photo_urls[0], 200)
        card += '<div>{}</div>'.format(suggested_listing_designer)
        card += '<div>{}</div>'.format(suggested_listing_title)
        card += '</div>'
        cards.append(card)
    display(HTML(''.join(cards)))

In [None]:
# Search space for the comparison. Every matrix row is searched at the moment;
# the commented-out line is the random-sample alternative for when the full
# comparison is too slow.
# listing_sample = np.random.choice(range(max_listing_id), size=100000)
listing_sample = range(max_listing_id)

# Listing 408449 is a Raf Simons fishtail. get_suggestions works on matrix
# rows, so translate id -> row going in and row -> id coming out.
raf_row = listing_map[408449]
ranked_rows = get_suggestions(raf_row, user_item_matrix, listing_sample)
raf_suggestions = [listing_list[row] for row in ranked_rows]

In [18]:
# Show the best matches as photo cards (print_suggestions renders 10 by default).
print_suggestions(raf_suggestions)

## Designer Suggestions Based on Followers, Buyers, and Sellers

Creating the Designer-User matrix

In [19]:
# Sort the names so every designer gets the same matrix row on every run.
# Iterating a set directly gives an order that can change between interpreter
# sessions (string hashing), which would silently misalign these row indices
# with the matrices cached under cached_matrices/.
# NOTE: cached matrices written with a different ordering must be rebuilt.
designer_list = sorted(designers_of_interest)
designer_map = {designer: idx for idx, designer in enumerate(designer_list)}

In [None]:
# Build the designer x user interaction matrix and cache it on disk.
# Entry [designer_row, user_id] counts how many times that user bought, sold
# or followed a listing by that designer (designers of interest only).

max_designer_id = len(designer_list)
max_user_id = max(users_df['id']) + 1
user_designer_matrix = lil_matrix((max_designer_id, max_user_id))

# listing id -> designer name, built in one pass so the follow loop below does
# not rescan listings_df for every row. setdefault keeps the first occurrence
# of a listing id.
designer_by_listing = {}
for listing_id, designer_name in zip(listings_df['id'], listings_df['designer_name']):
    designer_by_listing.setdefault(listing_id, designer_name)

# Buying and selling each count as one interaction with the listing's designer.
for designer_name, buyer_id, seller_id in zip(
        listings_df['designer_name'], listings_df['buyer_id'], listings_df['seller_id']):
    if designer_name not in designers_of_interest:
        continue

    designer_row = designer_map[designer_name]
    for user_id in (buyer_id, seller_id):
        # Missing ids are NaN; a column with NaN holds floats, so cast to int
        # before using the id as a matrix index.
        if pd.notnull(user_id):
            user_designer_matrix[designer_row, int(user_id)] += 1

# Following a listing counts as one more interaction with its designer.
for listing_id, user_id in zip(
        listing_user_follow_df['listing_id'], listing_user_follow_df['user_id']):
    # Follows whose listing is not in listings_df resolve to None and are skipped.
    designer_name = designer_by_listing.get(listing_id)

    if designer_name in designers_of_interest and pd.notnull(user_id):
        user_designer_matrix[designer_map[designer_name], int(user_id)] += 1

mmwrite('cached_matrices/user_designer.mtx', user_designer_matrix)

In [20]:
# Reload the designer x user matrix from its cache file and convert it to LIL
# format; row i corresponds to designer_list[i] as long as designer_list has
# the same order it had when the file was written.
user_designer_matrix = mmread('cached_matrices/user_designer.mtx').tolil()

Designer-Designer Collaborative Filtering

In [21]:
# Rank every designer against Our Legacy's row and list the ten closest names.
our_legacy_row = designer_map['Our Legacy']
designer_suggestions = get_suggestions(our_legacy_row, user_designer_matrix, range(len(designer_list)))
[designer_list[idx] for idx in designer_suggestions[:10]]

['Patrik Ervell',
 'Norse Projects',
 'A.P.C.',
 'Dana Lee',
 'Nonnative',
 'Engineered Garments',
 'Wings + Horns',
 'Acne Studios',
 'Stephan Schneider',
 'Common Projects']

In [22]:
# Rank every designer against Undercover's row and list the ten closest names.
undercover_row = designer_map['Undercover']
designer_suggestions = get_suggestions(undercover_row, user_designer_matrix, range(len(designer_list)))
[designer_list[idx] for idx in designer_suggestions[:10]]

['Number (N)ine',
 'Raf Simons',
 'Helmut Lang',
 'Junya Watanabe',
 'Comme Des Garcons',
 'Undercover X Uniqlo',
 'Dior',
 'Cav Empt',
 'Diet Butcher Slim Skin',
 'Maison Margiela']

## Clustering via Agglomerative Clustering (using 1 - Pearson correlation as distance)

In [23]:
 def print_clusters(labels):
    """Display the designers of each cluster as side-by-side HTML lists.

    Parameters
    ----------
    labels : sequence
        Cluster label per designer; position ``i`` refers to
        ``designer_list[i]``.

    Within each cluster the designers are ordered by the value in the first
    column of ``designer_counts`` (presumably the listing count), largest
    first, and at most 20 are shown. The heading reports the cluster's full
    size. Null designer names are dropped. Clusters are rendered in sorted
    label order.
    """
    members_by_cluster = defaultdict(list)
    for designer_id, cluster in enumerate(labels):
        members_by_cluster[cluster].append(designer_id)

    # Build a separate dict instead of reassigning entries of the dict that is
    # being iterated; .items() works on both Python 2 and 3.
    ranked_clusters = {}
    for cluster, designer_ids in members_by_cluster.items():
        designer_names = [designer_list[designer_id] for designer_id in designer_ids
                          if not pd.isnull(designer_list[designer_id])]
        # .iloc[0] is an explicit positional lookup of the first count column.
        designer_num_listings = [designer_counts.loc[designer_name].iloc[0]
                                 for designer_name in designer_names]
        ranked_clusters[cluster] = [designer_names[idx]
                                    for idx in np.argsort(designer_num_listings)[::-1]]

    html_string = ''
    for cluster, designers in sorted(ranked_clusters.items()):
        # The opening tag must be closed with '>' or the <h1> that follows is
        # parsed as an attribute of the div.
        html_string += '<div style="display: inline-block; vertical-align: top; padding: 5px;">'
        html_string += '<h1>{}: {} designers</h1>'.format(cluster, len(designers))
        html_string += '<ol>'

        for designer in designers[:20]:
            html_string += '<li>{}</li>'.format(designer)

        html_string += '</ol>'
        html_string += '</div>'
    display(HTML(html_string))

In [24]:
# L1-normalise along axis 0, i.e. column by column. Columns are user ids, so
# each user's interaction counts are rescaled to sum to 1 across designers.
normalized_user_designer_matrix = normalize(user_designer_matrix, axis=0, norm='l1')

In [25]:
def pearson_affinity(M):
    """Return the pairwise Pearson distance (1 - correlation) between rows of ``M``.

    Parameters
    ----------
    M : array-like, shape (n_rows, n_features)
        Dense 2-D data; each row is one observation vector.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_rows)
        Symmetric matrix whose ``[i, j]`` entry is ``1 - r(M[i], M[j])``:
        0 for perfectly correlated rows, 2 for perfectly anti-correlated
        rows. Entries involving a constant row are NaN because the
        correlation is undefined there.
    """
    # np.corrcoef treats each row as a variable and returns the whole
    # correlation matrix in one vectorised call, replacing n_rows**2 separate
    # pearsonr calls. atleast_2d keeps the single-row case a 1x1 matrix.
    correlations = np.atleast_2d(np.corrcoef(np.asarray(M, dtype=float)))
    return 1 - correlations

# Densify the normalised matrix, compute the designer-by-designer distance
# matrix (1 - Pearson correlation between designer rows) and cache it on disk.
pearson_matrix = pearson_affinity(normalized_user_designer_matrix.toarray())
mmwrite('cached_matrices/designer_pearson.mtx', pearson_matrix)

In [26]:
# Reload the cached designer-by-designer distance matrix.
pearson_matrix = mmread('cached_matrices/designer_pearson.mtx')

In [27]:
# Average-linkage agglomerative clustering into 5 groups. 'precomputed' tells
# scikit-learn that pearson_matrix already holds pairwise distances rather
# than feature vectors. fit() returns the estimator itself.
agg = AgglomerativeClustering(
    linkage='average',
    affinity='precomputed',
    n_clusters=5,
).fit(pearson_matrix)
print_clusters(agg.labels_)

Within each cluster, designers are ordered by number of listings (at most 20 are shown), NOT by their closeness to the cluster centroid.

**TODO:** Explain that the odd-looking cluster is not a bucket of uncategorized designers: its members are simply peculiar and lie very far away from the other clusters.