In [1]:
#import libraries
from numpy import pi, exp, sqrt
import timeit

#### Using the normal distribution PDF to calculate a size score
___

In ecommerce, the sizes available on a product detail page (PDP) have a significant impact on (potential) add-to-cart and conversion rates. In the example below, we managed to log every available (and unavailable) size for every product on every PDP pageview. This is done by sending the data in the "Items" array of the Enhanced Ecommerce implementation in GA4. The sizes logged are European shoe sizes. The normal distribution PDF is then used to calculate a size score per article: every available size $x$ contributes $f(x)$, where $\mu$ is the mean and $\sigma$ the standard deviation of the sizes sold, and the contributions of all available sizes are summed. The formula used is:
\
\
$f(x) = \frac{1}{\sigma\sqrt{2\pi}} 
  \exp\left( -\frac{1}{2}\left(\frac{x-\mu}{\sigma}\right)^{\!2}\,\right)$


\
**Example 1:**\
Article 21200812-12 for females is available in sizes 36, 38, 39, and 42. The in-between sizes 37, 40, and 41 are not available and therefore contribute 0. The average size for females is 39 with a standard deviation of 1.5. The size score for this specific article is:

36.0: 0.036\
37.0: 0.000\
38.0: 0.213\
39.0: 0.266\
40.0: 0.000\
41.0: 0.000\
42.0: 0.036\
Total size score: 0.5509148126334408


**Example 2:**\
Article 21200812-27 for females is available in all sizes from 35 to (and including) 43. Average size for females is 39 with a standard deviation of 1.5. The size score for this specific article is:

35.0: 0.0076\
36.0: 0.036\
37.0: 0.1093\
38.0: 0.213\
39.0: 0.266\
40.0: 0.213\
41.0: 0.1093\
42.0: 0.036\
43.0: 0.0076\
Total size score: 0.9977548972480639


In [2]:
# Normal-distribution parameters of the shoe sizes sold, per gender:
# mu is the mean and sigma the standard deviation.
male_mu, male_sigma = 43, 1.7      # male sizes sold
female_mu, female_sigma = 39, 1.5  # female sizes sold

# (mu, sigma) pairs: index 0 holds the male, index 1 the female parameters.
formula_input = [
    (male_mu, male_sigma),
    (female_mu, female_sigma),
]

In [3]:
# Sample rows of [article ID, size availability]. The availability string is a
# '|'-separated list of '<size>_<flag>' entries; flag '1' marks an available size.
data = [
    ['21200812-12', '36_1|37_0|38_1|39_1|40_0|41_0|42_1'],
    ['21200812-17', '36_1|37_1|38_0|39_0|40_1|41_0|42_1'],
    ['21200812-19', '36_1|37_1|38_0|39_0|40_1|41_1|42_1'],
    ['21200812-23', '36_1|37_1|38_0|39_0|40_1|41_1|42_1'],
    ['21200812-24', '36_1|37_1|38_0|39_0|40_1|41_1|42_1'],
    ['21200812-25', '36_1|37_1|38_1|39_1|40_1|41_1|42_1'],
    ['21200812-26', '35_1|36_1|37_1|38_1|39_1|40_1|41_1|42_1'],
    ['21200812-27', '35_1|36_1|37_1|38_1|39_1|40_1|41_1|42_1|43_1'],
]

In [4]:
# Memoized size scores, keyed by (gender ID, size availability string).
ss_dict = {}

def calculate_size_score(nelson_id, size_availability):
    """Return the size score of an article from its size availability.

    The score is the sum of the normal distribution PDF evaluated at every
    available size, using the (mean, standard deviation) pair that
    ``formula_input`` holds for the article's gender. Results are memoized in
    ``ss_dict`` per (gender ID, availability string).

    Parameters
    ----------
    nelson_id : str
        Article ID. Its first character is the gender ID; gender ID ``n``
        selects ``formula_input[n - 1]``.
    size_availability : str
        '|'-separated '<size>_<flag>' entries, e.g. '36_1|37_0|38_1'. Only
        entries whose flag is '1' count towards the score. Empty entries
        (an empty string, a trailing '|') are ignored.

    Returns
    -------
    float
        Sum of the PDF values of the available sizes; 0 if no size is available.

    Raises
    ------
    ValueError
        If the first character of ``nelson_id`` is not a digit, or a non-empty
        entry is not of the form '<size>_<flag>'.
    IndexError
        If ``nelson_id`` is empty, or ``formula_input`` has no parameters for
        the gender ID.
    """
    gender_id = int(nelson_id[0])
    idx = gender_id - 1  # gender IDs are 1-based, formula_input is 0-based

    # A gender ID of 0 gives idx == -1, which would silently wrap around to the
    # last entry of formula_input; reject it like any other unknown gender ID.
    if not 0 <= idx < len(formula_input):
        raise IndexError(f'no distribution parameters for gender ID {gender_id}')

    cache_key = (gender_id, size_availability)
    if cache_key in ss_dict:
        return ss_dict[cache_key]

    # Loop-invariant parts of the normal distribution PDF.
    mu, sigma = formula_input[idx]
    peak = 1 / (sigma * sqrt(2 * pi))
    two_variance = 2 * sigma ** 2

    calculated_size_score = 0
    for entry in size_availability.split('|'):
        if not entry:
            continue  # nothing logged for this slot
        size, availability = entry.split('_')
        if availability == '1':
            calculated_size_score += peak * exp(-(float(size) - mu) ** 2 / two_variance)

    # Store the result once, after all sizes have been summed.
    ss_dict[cache_key] = calculated_size_score
    return calculated_size_score

In [5]:
# check dataset input: show the article ID (first field) of the last row in `data`
data[-1][0]

'21200812-27'

In [6]:
# test: score the last row of `data`, passing its article ID and its
# size availability string to calculate_size_score
calculate_size_score(data[-1][0], data[-1][1])

0.9977548972480639

In [7]:
# Time one full pass over `data`.
# NOTE: calculate_size_score memoizes its results in ss_dict, so once every row has
# been scored, the repeated %timeit loops are answered from that cache.
# A microsecond (µs) is an SI unit of time equal to one millionth (0.000001 or 10^-6 or 1/1,000,000) of a second.
%timeit for nelson_id, size_availability in data: calculate_size_score(nelson_id, size_availability)

2.57 µs ± 69.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [8]:
# Print one line per row of `data`: article ID > size availability > size score.
for nelson_id, size_availability in data:
    print(nelson_id, size_availability, calculate_size_score(nelson_id, size_availability), sep=' > ')

21200812-12 > 36_1|37_0|38_1|39_1|40_0|41_0|42_1 > 0.5509148126334408
21200812-17 > 36_1|37_1|38_0|39_0|40_1|41_0|42_1 > 0.39429334214981465
21200812-19 > 36_1|37_1|38_0|39_0|40_1|41_1|42_1 > 0.5036333919338104
21200812-23 > 36_1|37_1|38_0|39_0|40_1|41_1|42_1 > 0.5036333919338104
21200812-24 > 36_1|37_1|38_0|39_0|40_1|41_1|42_1 > 0.5036333919338104
21200812-25 > 36_1|37_1|38_1|39_1|40_1|41_1|42_1 > 0.9825602492163338
21200812-26 > 35_1|36_1|37_1|38_1|39_1|40_1|41_1|42_1 > 0.9901575732321989
21200812-27 > 35_1|36_1|37_1|38_1|39_1|40_1|41_1|42_1|43_1 > 0.9977548972480639
