In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import inspect

In [None]:
from pink_is_for_girls import spiders
# Scrape links to images.
# WARNING: the crawler is not included in this repository, in order
# to protect the shops' webpages from excessive scraping,
# so this cell won't work.

# Collect every class that inspect can see in the spiders module.
# NOTE(review): inspect.isclass also matches classes that are merely imported
# into that module (e.g. base classes) - confirm only real spiders live there.
spiderlist=[member for name,member in inspect.getmembers(spiders, inspect.isclass)]

# All spiders share one process and write to the same feed file.
crawler_settings={
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'FEED_URI': 'images.jsonl'
    }
process = CrawlerProcess(crawler_settings)

# Schedule each spider on the shared process before starting it.
for s in spiderlist:
    process.crawl(s)

process.start()

In [None]:
#download images and analyze them

import json
import urllib2
from time import sleep

from PIL import Image
import colorsys

from matplotlib.colors import rgb_to_hsv

import cStringIO
import numpy as np

import matplotlib.pyplot as plt
from background_mask import get_mask

class hsvhistogram(object):
    """3-d histogram over HSV colour space, used for colour analysis of images.

    Its dimensions are hue, saturation and value; each is expected to lie in
    the closed interval [0,1].

    Attributes:
        nbins: numpy array with the number of bins along (hue, saturation, value).
        binwidth: width of one bin along each axis (1/nbins).
        data: array of shape nbins holding the accumulated counts.
        ndata: total weight added so far; normalize() divides by it.
        bincolors: array of shape nbins holding the '#rrggbb' colour of the
            centre of every bin.
    """

    def __init__(self,nbins=[10,5,5]):
        """Create an empty histogram with nbins = [hue, saturation, value] bins."""
        # np.array copies its argument, so the shared list default is never mutated.
        self.nbins=np.array(nbins)
        self.binwidth=1./self.nbins
        self.data=np.zeros(self.nbins)
        self.ndata=0
        # Bin centres in index units (i+0.5, j+0.5, k+0.5), reordered so that the
        # last axis holds the (h,s,v) triple handed to ind_to_hex.
        centres=(np.indices(self.nbins)+0.5).transpose([1,2,3,0])
        self.bincolors=np.apply_along_axis(self.ind_to_hex,3,centres)

    def ind_to_hex(self,hsv):
        """Convert a (fractional) bin index triple to a '#rrggbb' colour string."""
        hsv=np.true_divide(hsv,self.nbins)
        r,g,b=colorsys.hsv_to_rgb(hsv[0],hsv[1],hsv[2])
        # %x needs integers; int() truncates towards zero.
        return '#%02x%02x%02x' % (int(r*255),int(g*255),int(b*255))

    def clean(self):
        """Reset all counts to zero."""
        self.data=np.zeros(self.nbins)
        self.ndata=0

    def get_bin(self,x,i):
        """Return the bin index of value x along axis i.

        x == 1 maps to the last bin (returned as -1).

        Raises:
            IndexError: if x is not inside [0,1].  Without this check a negative
                value would be silently counted in a valid bin.
        """
        if not 0<=x<=1:
            raise IndexError('value %r is outside the interval [0,1]' % (x,))
        if x==1:
            return -1
        return int(x*self.nbins[i])

    def _bin_index(self,hsv):
        """Return the 3-d bin index of an (h,s,v) triple; IndexError if invalid."""
        return (self.get_bin(hsv[0],0),self.get_bin(hsv[1],1),self.get_bin(hsv[2],2))

    def add_data(self,hsv,count=1):
        """Add count to the bin containing hsv; invalid points are reported and skipped."""
        try:
            index=self._bin_index(hsv)
        except IndexError:
            print('data must be tripple of values in the interval [0,1], ignoring this datapoint')
            return
        self.data[index]+=count
        self.ndata+=count

    def raise_bins(self,hsv_counts):
        """Add every (hsv, count) pair of hsv_counts to the histogram."""
        for hsv,count in hsv_counts:
            self.add_data(hsv,count)

    def raise_bins_by_one(self,hsv_counts):
        """Raise every bin hit by hsv_counts exactly once (once per image).

        The counts in hsv_counts are ignored and ndata grows by one per call.
        """
        raise_mask=np.zeros(self.data.shape,dtype='bool')
        for hsv,count in hsv_counts:
            try:
                raise_mask[self._bin_index(hsv)]=True
            except IndexError:
                print('data must be tripple of values in the interval [0,1], ignoring this datapoint')
        self.data[raise_mask]+=1
        self.ndata+=1

    def normalize(self):
        """Divide the counts by ndata; does nothing while the histogram is all zero.

        Calling it a second time divides by ndata again.
        """
        if np.any(self.data):
            self.data=np.true_divide(self.data,self.ndata)

    def save(self,dataname):
        """Save counts to <name>_data.npy and bin colours to <name>_colors.npy.

        <name> is dataname with everything after the first dot of its file name
        removed; the directory part is kept untouched, so dots in directory
        names (e.g. './out/all') no longer destroy the path.
        """
        import os
        directory,filename=os.path.split(dataname)
        base=os.path.join(directory,filename.split('.')[0])
        np.save(base+'_data',self.data)
        # The colour file previously received self.data by mistake.
        np.save(base+'_colors',self.bincolors)

# Number of bins along the (hue, saturation, value) axes, shared by all histograms.
histshape=[10,5,5]

# Percentage of area: these are filled with raise_bins (weighted by pixel count).
hist,hist_men,hist_women=(hsvhistogram(histshape) for _ in range(3))

# Percentage of items: these are filled with raise_bins_by_one (once per image).
hist2,hist2_men,hist2_women=(hsvhistogram(histshape) for _ in range(3))

print('histogram ready, analyzing images')
print('')

# WARNING: this file is produced by the crawler, you will have to provide your own.
# Every line is a JSON object of the form {'img': <image URL>, 'gender': 'M' or 'W'}.
inputfile=open('images.jsonl')

class PinkException(Exception):
    """Error raised when a single scraped record cannot be processed.

    It takes the same arguments as Exception and adds no behaviour of its own.
    """
        
def parseline(line):
    """Decode one line of the input file as JSON.

    Args:
        line: text of a single JSON document.

    Returns:
        The decoded Python object.

    Raises:
        PinkException: if the line cannot be decoded.
    """
    try:
        return json.loads(line)
    except Exception:
        # A bare except would also trap KeyboardInterrupt/SystemExit.
        # %s formatting keeps the message identical for strings and, unlike
        # concatenation, cannot itself fail when line is not a string.
        raise PinkException('Failed to parse line: %s' % (line,))
    
def get_image(scraped):
    """Download the image referenced by scraped['img'] and return its raw content.

    The progress message reads the module-level name ln, which the processing
    loop sets to the current line number; if ln does not exist yet the
    resulting NameError is reported as a PinkException like any other failure.

    Raises:
        PinkException: if the record has no usable 'img' entry or the
            download fails.
    """
    try:
        # sleep(1)  # enable to throttle the requests
        print('loading line  %s  : %s' % (ln, scraped['img']))
        response=urllib2.urlopen(scraped['img'])
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
    except Exception:
        # Not a bare except: interrupting the kernel during a download must
        # stop the run instead of being counted as one more failed image.
        raise PinkException('Failed to load image')

def imgdata_to_array(imgdata):
    """Decode raw image file content into a numpy array of RGB pixels.

    Images in any other mode are converted to 'RGB' first.

    Args:
        imgdata: content of an image file, as returned by get_image.

    Returns:
        numpy array view of the decoded RGB image
        (presumably height x width x 3 - verify against get_mask).

    Raises:
        PinkException: if the data cannot be decoded or converted.
    """
    try:
        pilimg=Image.open(cStringIO.StringIO(imgdata))
        if pilimg.mode!='RGB':
            pilimg=pilimg.convert('RGB')
        return np.asarray(pilimg)
    except Exception:
        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
        raise PinkException('Failed to convert image to array.')

# Counters for the summary printed at the end.
succes=0
failure=0
try:
    # ln must keep this name: get_image reads it for its progress message.
    for ln,line in enumerate(inputfile):
        try:
            scraped=parseline(line)
            imgdata=get_image(scraped)
            img=imgdata_to_array(imgdata)

            #create mask: background is False, object is True
            mask=get_mask(img, threshold=0.1)
            hsv_img=rgb_to_hsv(img[mask]/255.)

            #PIL.getcolors is useless with masking, unique is faster than directly adding to histogram
            # Materialised as a list because the pairs are iterated up to four
            # times below; a one-shot zip iterator would be empty after the first.
            hsv_colors=list(zip(*np.unique(ar=hsv_img,return_counts=True,axis=0)))

            hist.raise_bins(hsv_colors)
            hist2.raise_bins_by_one(hsv_colors)

            gender=scraped['gender']
            if gender=='M':
                hist_men.raise_bins(hsv_colors)
                hist2_men.raise_bins_by_one(hsv_colors)
            elif gender=='W':
                hist_women.raise_bins(hsv_colors)
                hist2_women.raise_bins_by_one(hsv_colors)

            succes+=1

        except PinkException as e:
            # A record that cannot be parsed, downloaded or decoded is skipped.
            print(e)
            failure+=1
finally:
    # The input file was opened before this loop and is not needed afterwards.
    inputfile.close()

for histogram in (hist,hist_men,hist_women,hist2,hist2_men,hist2_women):
    histogram.normalize()

for histogram,name in ((hist,'all'),(hist_men,'men'),(hist_women,'women'),
                       (hist2,'all2'),(hist2_men,'men2'),(hist2_women,'women2')):
    histogram.save(name)


print('')
print('data ready')
print('succesfully analyzed:  %s' % succes)
print('failed:  %s' % failure)

In [None]:
#plot the data
%matplotlib notebook
import mpld3
from matplotlib.ticker import FuncFormatter

def plot_from_getplotable(get_plotable, hists):
    """Draw the women / men / difference bar charts and return the figure.

    Args:
        get_plotable: callable turning one histogram into a pair
            (bar heights, bar colours).
        hists: sequence (men, women, all) of histograms.

    Returns:
        The matplotlib figure holding three stacked axes: women on top, men in
        the middle (sharing the x axis with the top one) and women minus men
        at the bottom.
    """
    def draw_bars(axis, heights, colors, title):
        # One bar per bin, filled and outlined with the colour of that bin.
        for position,(height,color) in enumerate(zip(heights,colors)):
            axis.bar(position, height, width=1, color=color, edgecolor=color)
        axis.set_title(title, x=0.5, y=0.8)

    def as_percent(y, _):
        return '{:.1%}'.format(y)

    f=plt.figure(figsize=(9,8))
    ax0=plt.subplot(311)
    ax1=plt.subplot(312, sharex=ax0)
    ax2=plt.subplot(313)

    menplotable=get_plotable(hists[0])
    womenplotable=get_plotable(hists[1])
    allplotable=get_plotable(hists[2])

    draw_bars(ax1, menplotable[0], menplotable[1], u'Muži')
    draw_bars(ax0, womenplotable[0], womenplotable[1], u'Ženy')
    # The difference chart takes its colours from the combined histogram.
    draw_bars(ax2, womenplotable[0]-menplotable[0], allplotable[1], u'Rozdiel')

    for ax in (ax0,ax1,ax2):
        ax.yaxis.set_major_formatter(FuncFormatter(as_percent))

    return f

def int_to_hex(i):
    """Format a non-negative integer as a '#rrggbb' colour string.

    The value is zero-padded on the left to six hex digits, so that
    hex_to_int(int_to_hex(i)) == i for 0 <= i <= 0xffffff.  (Padding used to be
    appended on the right, which turned 255 into '#ff0000'.)
    """
    return '#%06x' % i

def hex_to_int(h):
    """Parse a colour string such as '#ff0000' into its integer value.

    The first character is dropped and the rest is read as a base-16 number.
    """
    digits=h[1:]
    return int(digits,16)

def sum_chunk(x, chunk_size, axis=-1):
    """Sum consecutive groups of chunk_size elements of x along axis.

    The chosen axis is split into (number of chunks, chunk_size) by a reshape
    and the chunk_size part is summed away, so the length of that axis must be
    a multiple of chunk_size.
    """
    target=axis+x.ndim if axis<0 else axis
    dims=x.shape
    folded=x.reshape(dims[:target]+(-1,chunk_size)+dims[target+1:])
    return folded.sum(axis=target+1)


def get_plotable(hist):
    """Flatten a histogram into parallel 1-d arrays of bar heights and colours.

    The third axis (value) is cut in half: all bins of the lower-value half
    come first, followed by the bins of the higher-value half.

    Args:
        hist: object with 3-d arrays data and bincolors of equal shape.

    Returns:
        Tuple (data, bincolors) of flattened arrays in the same order.
    """
    # Floor division keeps the slice index an integer under true division too.
    border=hist.data.shape[2]//2

    def low_then_high(cube):
        return np.concatenate((cube[:,:,:border].flatten(),cube[:,:,border:].flatten()))

    return low_then_high(hist.data),low_then_high(hist.bincolors)

# Figure for the area-based histograms (men, women, all), exported as HTML.
f=plot_from_getplotable(get_plotable,(hist_men,hist_women,hist,))
# The with block closes the file even if rendering the figure to HTML fails.
with open('fig.html','w') as output:
    output.write(mpld3.fig_to_html(f))
plt.show()

In [None]:
# Figure for the per-item histograms (men, women, all), exported as HTML.
f=plot_from_getplotable(get_plotable, hists=(hist2_men,hist2_women,hist2))
# The with block closes the file even if rendering the figure to HTML fails.
with open('fig2.html','w') as output:
    output.write(mpld3.fig_to_html(f))
plt.show()