In [1]:
from glob import glob
from random import shuffle, seed, randint, choice
from PIL import Image, ImageChops, ImageOps

from sklearn.decomposition import IncrementalPCA, SparsePCA
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import display, HTML, display_html

import os
import numpy as np
import pandas as pd

# NOTE(review): FOLDER is not referenced by any of the visible cells — confirm it is still needed.
FOLDER = './test/'
# Default target size handed to resize()/process(); PIL sizes are (width, height).
STANDARD_SIZE = (180,270)

# Default number of principal components for pca().
N_COMPONENTS = 50
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
N_COMPONENTS_TO_SHOW = 50

%matplotlib inline

In [2]:
def resize(im, size=STANDARD_SIZE):
    """Centre ``im`` on a white canvas that has the aspect ratio of ``size``.

    No scaling happens here: the source pixels are pasted untouched and the
    canvas is grown along one axis until height / width equals
    ``size[1] / size[0]``.

    Parameters
    ----------
    im : PIL.Image.Image
        Source image.
    size : tuple
        (width, height) whose aspect ratio the canvas must have.

    Returns
    -------
    PIL.Image.Image
        New RGB image with ``im`` centred on a white background.
    """
    src_w, src_h = im.size
    aspect = 1. * size[1] / size[0]

    if src_w * aspect < src_h:
        # Source is relatively taller than the target: keep the height, widen.
        canvas_w, canvas_h = round(src_h / aspect), round(src_h)
    else:
        # Source is relatively wider (or equal): keep the width, heighten.
        canvas_w, canvas_h = round(src_w), round(src_w * aspect)

    canvas = Image.new("RGB", (canvas_w, canvas_h), color=(255, 255, 255))
    offset = (round((canvas_w - src_w) / 2),
              round((canvas_h - src_h) / 2))
    canvas.paste(im, offset)

    return canvas

def trim(im):
    """Crop away the uniform border that surrounds the content of ``im``.

    The colour of the top-left pixel is taken as the background colour.

    Parameters
    ----------
    im : PIL.Image.Image
        Image to crop.

    Returns
    -------
    PIL.Image.Image
        The cropped image, or ``im`` itself when nothing stands out from the
        background (previously ``None`` was returned in that case, which made
        ``process`` fail on the following ``resize`` call).
    """
    background = Image.new(im.mode, im.size, im.getpixel((0,0)))
    diff = ImageChops.difference(im, background)
    # add(a, b, scale, offset) computes (a + b) / scale + offset, i.e. diff - 10
    # here, so pixels that differ only slightly from the background are ignored.
    diff = ImageChops.add(diff, diff, 2.0, -10)
    bbox = diff.getbbox()
    if bbox:
        return im.crop(bbox)
    # No bounding box: the image is (almost) uniform, there is nothing to trim.
    return im

def process(im, size=STANDARD_SIZE):
    """Normalise an image: trim its border, pad it to the aspect ratio of
    ``size`` on a white canvas, then scale it to exactly ``size``.

    Parameters
    ----------
    im : PIL.Image.Image
        Source image.
    size : tuple
        Target (width, height).

    Returns
    -------
    PIL.Image.Image
        Image of dimensions ``size``.
    """
    cropped = trim(im)
    padded = resize(cropped, size)
    return padded.resize(size)

def img2arr(im, white_threshold=250, background_value=-999):
    """Flatten an RGB image into a 1-D int16 vector with the background masked.

    Pixels whose R, G and B values are all >= ``white_threshold`` are treated
    as background and overwritten with ``background_value`` so that they lie
    far away from any real colour value.

    Parameters
    ----------
    im : PIL.Image.Image or array-like
        Image with three channels; converted via ``np.array`` to (h, w, c).
    white_threshold : int, optional
        Channel value from which a pixel counts as white.
    background_value : int, optional
        Value written into every channel of a background pixel.

    Returns
    -------
    numpy.ndarray or None
        Vector of length ``3 * width * height``, or ``None`` when the input
        cannot be converted (for example a single-channel image).
    """
    try:
        # Signed 16-bit so that the negative sentinel fits next to 0..255.
        pixels = np.array(im, dtype=np.int16)

        red, green, blue = pixels[:,:,0], pixels[:,:,1], pixels[:,:,2]
        background = ((red >= white_threshold)
                      & (green >= white_threshold)
                      & (blue >= white_threshold))

        pixels[background] = [background_value] * 3

        return pixels.reshape(pixels.size, )
    except Exception as err:
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit
        # propagate so a long batch run can still be interrupted.
        print ("Some error here!", err)
        return None

def item_info(cats, common_brands=True, gbp=76.77):
    """Load item metadata of both sites and attach the picture path of each item.

    For every site in ('tsm', 'nap') the file ``./data/<site>_items`` is read,
    duplicate ids are dropped, rows whose ``nav_menu`` equals one of ``cats``
    are kept, and the files found in ``./<site>_pictures/`` are matched to the
    items through their file name (the name without extension is the item id).

    Parameters
    ----------
    cats : str or list
        Category name(s) compared against the ``nav_menu`` column.
    common_brands : bool, optional
        When true, keep only items whose brand is listed in
        ``./data/common_brands``.
    gbp : float, optional
        Factor applied to the prices of the 'nap' site — presumably a GBP
        exchange rate (TODO confirm).

    Returns
    -------
    pandas.DataFrame
        Items indexed by ``id`` with the added columns ``type`` (site) and
        ``img`` (picture path).
        NOTE: for a ``cats`` value that is neither str nor list a message is
        printed and a pair of empty numpy arrays is returned instead; this
        legacy behaviour is kept unchanged.
    """
    if not isinstance(cats, list):
        if isinstance(cats, str):
            cats = [cats]
        else:
            print ("Bad value for 'cats'")
            return np.array([]), np.array([])

    # Pieces are collected and combined once at the end: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    item_frames = []
    picture_rows = []

    for site in ['tsm', 'nap']:
        items = pd.read_csv('./data/' + site + '_items')
        items = items.drop(['img', 'page'], axis=1).drop_duplicates('id', keep='first')
        items['type'] = site
        if site == 'nap':
            items['price'] *= gbp

        for cat in cats:
            item_frames.append(items[items['nav_menu'] == str(cat)])

        for path in glob('./' + site + '_pictures/*'):
            stem = os.path.splitext(os.path.basename(path))[0]
            picture_rows.append({'id': int(stem), 'img': path})

    df = pd.concat(item_frames) if item_frames else pd.DataFrame()
    pics = pd.DataFrame(picture_rows, columns=['id', 'img'])

    if common_brands:
        cb = pd.read_csv('./data/common_brands')
        df = df.merge(cb, on='brand')

    return df.merge(pics, on='id').set_index('id')

def get_data(cats, common_brands=True):
    """Load, normalise and vectorise the pictures of all items in ``cats``.

    Parameters
    ----------
    cats : str or list
        Category name(s), forwarded to ``item_info``.
    common_brands : bool, optional
        Forwarded to ``item_info``.

    Returns
    -------
    data : numpy.ndarray
        One flattened pixel vector (see ``img2arr``) per successfully
        processed picture, in shuffled order.
    labels : numpy.ndarray
        Item ids aligned with the rows of ``data``.
    df : pandas.DataFrame
        The item table returned by ``item_info``.

    Pictures that cannot be opened, processed or converted are reported and
    skipped. Previously a failure only printed a message and execution fell
    through, so an unprocessed picture of a different size could end up in
    ``data``.
    """
    df = item_info(cats, common_brands)

    print('processing images...')
    print('(this takes a long time if you have a lot of images)')

    raw_data = []
    i, m = 0, df['img'].count()

    print("{} out of {} images processed".format(0, m))

    for ix, row in df.iterrows():
        try:
            source = Image.open(row['img'])
        except Exception:
            print ("\tSome error here at iter " + str(i) + " for index: " + str(ix))
            continue

        try:
            # The context manager releases the file handle once the processed
            # copy has been produced.
            with source:
                # 'tsm' women's shoes are flipped horizontally — presumably so
                # that they face the same way as the 'nap' pictures (TODO confirm).
                if row['type'] == 'tsm' and row['nav_menu'] == 'Женская обувь':
                    img = process(ImageOps.mirror(source))
                else:
                    img = process(source)
        except Exception:
            print (int(ix), row['img'])
            continue

        arr = img2arr(img)

        if arr is not None:
            raw_data.append((arr,ix))
            i += 1

    print("{} out of {} images processed".format(i, m))
    print('finished processing images...')

    shuffle(raw_data)
    data = np.array([vector for (vector, _) in raw_data])
    labels = np.array([label for (_, label) in raw_data])

    return data, labels, df

def pca(data, n_components=N_COMPONENTS, batch_size=None):
    """Project ``data`` with IncrementalPCA and compute pairwise cosine similarity.

    Parameters
    ----------
    data : array-like
        One row per sample.
    n_components : int, optional
        Number of components to keep.
    batch_size : int or None, optional
        Passed through to ``IncrementalPCA``.

    Returns
    -------
    X : numpy.ndarray
        The projected samples.
    sim : numpy.ndarray
        Cosine similarity between every pair of rows of ``X``.
    """
    print ('doing PCA...')
    reducer = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    projected = reducer.fit_transform(data)
    print ('PCA is finished...')

    print ('finding similarities...')
    similarity = cosine_similarity(projected)
    print ('finished...')

    return projected, similarity

def get_item_info(d):
    """Extend a result dict with 'name', 'brand', 'image' and 'price' lists,
    one entry per id in ``d['id']``. Non-dict input is returned unchanged.

    NOTE(review): this function cannot succeed for dict input as written and
    looks like a leftover from an earlier version of ``item_info``:
      * ``item_info`` in this file requires a ``cats`` argument, so the
        zero-argument call below raises ``TypeError``;
      * ``item_info`` returns a single DataFrame on its normal path, not a
        ``(df, pics)`` pair;
      * the picture column produced by ``item_info`` is named 'img', not 'path';
      * the else branch appends to ``d['oldprice']``, a key this function
        never creates.
    No other visible cell calls it — confirm whether it can be removed.
    """
    if not isinstance(d, dict):
        return d
    # NOTE(review): raises TypeError — `cats` is a required parameter of item_info.
    df, pics = item_info()
    d.update({key:[] for key in ['name', 'brand', 'image', 'price']})
    for id in d['id']:
        offer = df.loc[id]
        img = pics.loc[id]['path']
        # NOTE(review): if df is a pandas DataFrame, .loc raises KeyError for a
        # missing id rather than yielding None, so the else branch looks unreachable.
        if offer is not None:
            d['name'].append(offer['name'])
            d['brand'].append(offer['brand'])
            d['image'].append(img)
            d['price'].append(offer['price'])
            # URL
        else:
            d['name'].append('')
            d['brand'].append('')
            d['image'].append('')
            d['price'].append('')
            d['oldprice'].append('')
            # URL
    return d

def get_html(d, info, width = 150):
    """Build the HTML fragments of a one-row picture table for the items in ``d``.

    Parameters
    ----------
    d : dict
        Must hold the parallel lists ``d['id']`` (item ids) and ``d['score']``
        (similarity scores, shown as percentages).
    info : pandas.DataFrame
        Item table indexed by id with the columns 'price', 'brand' and 'img'.
    width : int or str, optional
        Column width; an int is interpreted as pixels, a string is used as is.

    Returns
    -------
    list of str
        Fragments which, joined with ``''``, form a ``<table>``: a header row
        with offer id, price, brand and score, and a row with the linked
        pictures.
    """
    # Imported locally under its own name because `html` is used as a variable
    # name elsewhere in this notebook.
    from html import escape

    if isinstance(width, int): width = "{}px".format(width)
    parts = ["<table align = 'center' style = 'border-style: hidden'><tr>"]

    ids = d['id']
    rows = [info.loc[item_id] for item_id in ids]

    for pos, (item_id, row) in enumerate(zip(ids, rows)):
        parts.append("<th style = 'text-align: center; width: {}'>".format(width))
        # Brand text comes from scraped data, so it is escaped before being
        # embedded. `<br>` replaces the invalid `</br>` end tag.
        parts.append("offer: {}<br>price: {:.2f}<br>{}<br>{:.2f} %<br>".format(
            item_id,
            row['price'],
            escape(str(row['brand'])),
            100 * d['score'][pos]))
        parts.append("</th>")

    parts.append("</tr><tr>")

    for row in rows:
        src = escape(str(row['img']))
        parts.append("<td style = 'width:{}'>".format(width))
        # The malformed `width=/` attribute of the original <img> tag was dropped.
        parts.append("<a href = '{0}' target='_blank'><img src = '{0}'/></a></td>".format(src))

    parts.append("</tr></table>")

    return parts

def show_recos(d, **args):
    """Render the given item and its recommendations as HTML tables in the notebook.

    Parameters
    ----------
    d : dict or other
        Result of ``get_similar_items``: 'id' and 'score' lists plus an
        optional 'given' dict of the same shape. Any non-dict value is wrapped
        in ``HTML`` and returned without being displayed.
    **args
        ``info`` — the item DataFrame handed to ``get_html``. When it is not
        passed, the notebook-global ``info`` is used, as before.

    Returns
    -------
    IPython.display.HTML or None
        ``HTML(d)`` for non-dict input; otherwise ``None`` (the markup is
        displayed as a side effect).
    """
    if not isinstance(d, dict):
        return HTML(d)

    # Use the frame the caller passed. The original ignored the keyword and
    # always read the global `info`, which only worked through hidden state.
    item_frame = args['info'] if 'info' in args else info

    page = '<head><meta charset=\"utf-8\"></head>\n'

    if d.get('given') is not None:
        page += '<p style ="text-align: center">Given items:</p>'
        page += ''.join(get_html(d['given'], item_frame))

    page += '<p style ="text-align: center">Recommendations:</p>'
    page += ''.join(get_html(d, item_frame))
    display(HTML(page))

def get_ix(labels, df, site=None, brand=None):
    """Return the positions in ``labels`` of the items matching the filters.

    Parameters
    ----------
    labels : array-like
        Item ids, in the order of the rows of the similarity matrix.
    df : pandas.DataFrame
        Item table indexed by id with the columns 'type' and 'brand'.
    site : optional
        Keep only items whose 'type' equals this value.
    brand : optional
        Keep only items whose 'brand' equals this value.

    Returns
    -------
    numpy.ndarray
        Integer positions ``p`` such that ``labels[p]`` is the id of an item
        that passed the filters.
    """
    if site is not None:
        df = df.loc[(df['type']==site)]
    if brand is not None:
        df = df.loc[(df['brand']==brand)]
    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0. ravel()
    # keeps the flattening that in1d applied to its first argument.
    member = np.isin(np.ravel(labels), df.index.values)
    return np.flatnonzero(member)

def get_similar_items(sim, labels, df, to='tsm', offer = None, limit = 6, treshold=.75):
    """Pick the items most similar to a query item, preferring the same brand.

    Parameters
    ----------
    sim : numpy.ndarray
        Square similarity matrix whose rows and columns follow ``labels``.
        It is no longer modified: the original added 1 to the same-brand block
        and subtracted it again, which does not round-trip exactly in floating
        point and left the caller's matrix altered.
    labels : numpy.ndarray
        Item ids aligned with ``sim``.
    df : pandas.DataFrame
        Item table indexed by id with the columns 'type' and 'brand'.
    to : str, optional
        Site from which a random query item is drawn when ``offer`` is None.
    offer : optional
        Id of the query item; must be present in ``labels``.
    limit : int, optional
        Maximum number of recommendations.
    treshold : float, optional
        Minimum similarity (exclusive) a recommendation must have.

    Returns
    -------
    dict
        ``{'id': [...], 'score': [...], 'given': {'id': [x], 'score': [s]}}``.
        'given' is the top-ranked entry (normally the query item itself);
        'id' / 'score' hold the following entries whose score exceeds
        ``treshold``, items of the query's brand ranked first.
    """
    if offer is None:
        inputint = choice(get_ix(labels, df, site=to))
    else:
        inputint = labels.tolist().index(offer)

    brand = df.loc[labels[inputint]]['brand']
    same_brand = get_ix(labels, df, brand=brand)

    scores = sim[inputint]
    # Rank on a private copy of the query row; a +1 bonus lifts same-brand
    # items above all others without touching the reported scores.
    ranking = np.array(scores)
    if inputint in same_brand:
        ranking[same_brand] += 1

    # Never request more entries than exist (the original raised when
    # limit + 1 exceeded the number of items).
    top_n = min(limit + 1, ranking.shape[0])
    order = np.argpartition(-ranking, list(range(top_n)))[:top_n]

    top_ids = labels[order].tolist()
    top_scores = scores[order].tolist()

    d = {'id': [], 'score': []}
    d['given'] = {'id': [top_ids[0]], 'score': [top_scores[0]]}
    for item_id, score in zip(top_ids[1:], top_scores[1:]):
        if score > treshold:
            d['id'].append(item_id)
            d['score'].append(score)
    return d

In [None]:
%%time
# ~25 sec for 5k items
# Alternative category pair, kept for switching the run to clothing:
#cats = ['Женская одежда', 'Clothing']
# Category names are matched against the `nav_menu` column of each site's item file.
cats = ['Женская обувь', 'Shoes']
data, labels, info = get_data(cats)

In [None]:
%%time
# ~11 min for 11k items
# ~ 4 min for  5k items
X, p_sim = pca(data, n_components=250, batch_size=2048)

In [None]:
# Sanity check of the projection — expected (number of processed images, 250); confirm.
X.shape

In [None]:
# NOTE(review): plain assignment — n_sim is a second name for the p_sim object,
# so the in-place edits below also change p_sim. Use p_sim.copy() if the
# original matrix is needed later (at the cost of a second full matrix).
n_sim = p_sim
# Positions within `labels` of the items of each site.
tix = get_ix(labels, info, site='tsm')
nix = get_ix(labels, info, site='nap')

# Zero the same-site blocks so that only cross-site pairs keep their similarity.
n_sim[np.ix_(tix, tix)] = 0
n_sim[np.ix_(nix, nix)] = 0
# The block assignments above also cleared diagonal entries; set every item's
# self-similarity to 1.
np.fill_diagonal(n_sim, 1)

In [None]:
# Hand-noted id pairs — presumably "query offer (alternative) > matching offer";
# TODO confirm the notation.
# 914014 (939951) > 2331326
# 800508 > 2009960
# 2365683 > 895576 (895577)
# 2255630 > 887436 (569471)
# 887446 > 1914395 (1914385)
# No `offer` is passed, so get_similar_items draws a random 'tsm' item as the query.
show_recos(get_similar_items(n_sim, labels, info), info=info)