# Imports

In [1]:
import random

import numpy as np
import pandas as pd

# Instructions

Our metric is the Expected Degrees of Separation, EDS, between Shop A and Shop B. That is, imagine a token is given to a customer of Shop A at random. The customer is told to give the token to a customer in the next shop they visit, again at random, and to mark the token. At each step the customer is told to do the same thing, so that the number of marks on the token records the number of customers the token has passed through. If the token comes to Shop B, it is taken, and the number of marks is recorded. EDS is the expected number of marks on the token at the destination shop.

Suppose we have a transition matrix, the probability that a customer Shop A also shops at Shop B.

From Shop A	  |To Shop B       |P(A -> B)
--------------|----------------|--------
Tesco, Bristol|	Boots, Bath	   |0.1
Tesco, Bristol|	Asda, Bristol  |0.2
Asda, Bristol |	Currys, Bristol|0.1

We are looking for a table of the form:

Source Shop A |	Destination Shop B |EDS(A -> … -> B)
--------------|--------------------|-----------------
Tesco, Bristol|	Boots, Bath|3.5
Tesco, Bristol|Asda, Bristol|2.3
Tesco, Bristol|	Currys, Bristol|4.0


Questions we might ask:

- What assumptions are made in the calculation of EDS?
- How would you calculate EDS if the number of shops was small (say <1,000)?
- What challenges would be faced as the number of shops grows? How would you deal with these?
- What are the properties of EDS as a metric?
- Given a matrix of EDS values, how might you estimate the latent shopping preferences of the customers?


# Transition dict approach

In [293]:
# cities = ['bristol', 'bath', 'torquay', 'falmouth', 'gloucester', 'frome']
# brands = ['tesco', 'boots', 'currys', 'asda', 'specsavers', 'whsmith']

cities = range(4)
brands = range(4)

shops = [(city, brand) for city in cities for brand in brands]
shops[:5]

[(0, 0), (0, 1), (0, 2), (0, 3), (1, 0)]

In [316]:
np.random.seed(1)
tm_dict = {}
idx = 0
for from_shop in range(len(shops)):
    for to_shop in range(len(shops)):
        if from_shop != to_shop:
            transition_probability = np.random.poisson(2)
            if shops[from_shop][0] == shops[to_shop][0]:
                transition_probability *= 10
            if shops[from_shop][1] == shops[to_shop][1]:
                transition_probability *= 3
            tm_dict[idx] = {
                'from_shop': shops[from_shop],
                'to_shop': shops[to_shop],
                'p_also_shops': transition_probability,
            }
            idx += 1

transition_table = pd.DataFrame(tm_dict).T[['from_shop', 'to_shop', 'p_also_shops']]

In [317]:
transition_matrix = transition_table.pivot(index='from_shop', columns='to_shop')
tm_normed = np.array(transition_matrix.apply(lambda x: x/x.sum(), axis=1).fillna(0))

In [318]:
possible_transitions = {}
for shop_A in shops:
    possible_transitions[shop_A] = transition_table[
        transition_table['from_shop'] == shop_A
    ][['to_shop', 'p_also_shops']]

In [319]:
n_trials = 100

stochastic_results = {}

for row in transition_table.index:
    shop_A = transition_table.loc[row, 'from_shop']
    shop_B = transition_table.loc[row, 'to_shop']
    results = []
    for trial in range(n_trials):
        current_shop = shop_A
        marks = 0
        while current_shop != shop_B:
            possible_destinations = possible_transitions[current_shop]
            probs = possible_destinations['p_also_shops'].astype(float)
            probs = probs / probs.sum()
            marks += 1
            current_shop = np.random.choice(
                possible_destinations['to_shop'],
                p=probs
            )
        results.append(marks)
    print(f'From {shop_A} to {shop_B}: mean: {np.array(results).mean()}; stdev: {np.array(results).std()}')

From (0, 0) to (0, 1): mean: 9.23; stdev: 10.85527982135882
From (0, 0) to (0, 2): mean: 15.43; stdev: 16.150080495155432
From (0, 0) to (0, 3): mean: 20.63; stdev: 18.082950533582732
From (0, 0) to (1, 0): mean: 27.91; stdev: 24.356557638549827
From (0, 0) to (1, 1): mean: 17.6; stdev: 15.396103403134186
From (0, 0) to (1, 2): mean: 32.12; stdev: 27.4660808999027
From (0, 0) to (1, 3): mean: 46.36; stdev: 49.01908199874819
From (0, 0) to (2, 0): mean: 14.15; stdev: 17.509069078623227
From (0, 0) to (2, 1): mean: 10.59; stdev: 8.928712113177355
From (0, 0) to (2, 2): mean: 18.78; stdev: 19.486189981625447
From (0, 0) to (2, 3): mean: 14.21; stdev: 13.896974490873903
From (0, 0) to (3, 0): mean: 19.39; stdev: 20.726261119652044
From (0, 0) to (3, 1): mean: 12.94; stdev: 13.265609673136023
From (0, 0) to (3, 2): mean: 18.05; stdev: 15.940122333282138
From (0, 0) to (3, 3): mean: 15.27; stdev: 12.667955636171133
From (0, 1) to (0, 0): mean: 15.19; stdev: 14.804522957528892
From (0, 1) to 

In [320]:
transition_matrix

Unnamed: 0_level_0,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops,p_also_shops
to_shop,"(0, 0)","(0, 1)","(0, 2)","(0, 3)","(1, 0)","(1, 1)","(1, 2)","(1, 3)","(2, 0)","(2, 1)","(2, 2)","(2, 3)","(3, 0)","(3, 1)","(3, 2)","(3, 3)"
from_shop,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
"(0, 0)",,20.0,10.0,0.0,3.0,2.0,2.0,0.0,9.0,3.0,3.0,0.0,6.0,4.0,2.0,4.0
"(0, 1)",0.0,,20.0,10.0,0.0,6.0,1.0,1.0,2.0,6.0,0.0,4.0,1.0,3.0,1.0,5.0
"(0, 2)",20.0,50.0,,30.0,2.0,0.0,6.0,4.0,3.0,1.0,3.0,4.0,0.0,0.0,0.0,2.0
"(0, 3)",20.0,30.0,0.0,,3.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,2.0,9.0
"(1, 0)",9.0,0.0,0.0,3.0,,30.0,20.0,0.0,6.0,1.0,2.0,3.0,0.0,4.0,3.0,3.0
"(1, 1)",5.0,6.0,0.0,1.0,20.0,,10.0,0.0,4.0,15.0,2.0,3.0,2.0,6.0,3.0,5.0
"(1, 2)",2.0,5.0,6.0,2.0,40.0,20.0,,40.0,5.0,3.0,0.0,1.0,3.0,2.0,6.0,4.0
"(1, 3)",0.0,1.0,2.0,9.0,0.0,20.0,30.0,,2.0,1.0,3.0,12.0,4.0,3.0,3.0,9.0
"(2, 0)",18.0,1.0,0.0,1.0,0.0,4.0,3.0,1.0,,20.0,10.0,20.0,12.0,3.0,3.0,1.0
"(2, 1)",1.0,12.0,4.0,0.0,0.0,0.0,1.0,0.0,30.0,,20.0,30.0,1.0,3.0,1.0,4.0


In [306]:
# tm_normed

In [321]:
pd.DataFrame(get_eds_for_all_pairs(tm_normed)).T



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,17.1084,8.724734,14.404906,19.765298,28.865361,19.667837,27.707381,42.626781,14.900051,11.53454,22.127659,15.382831,18.357466,13.883605,17.462764,14.995117
1,16.296443,10.356031,12.352182,16.457315,29.649091,18.81121,28.00161,42.13183,16.140757,11.10091,22.995797,14.815635,19.938313,14.584138,18.089643,14.900185
2,14.003042,6.551512,15.52213,14.952704,29.336606,19.807195,27.322561,41.451752,16.498193,12.104156,23.069136,15.526691,20.477869,15.378075,18.710695,15.649458
3,12.916452,7.012781,15.403464,19.509164,28.82698,19.201069,28.45348,43.214664,16.800679,12.296864,22.932363,16.005313,19.595827,14.308393,17.593615,13.991188
4,15.848272,12.577928,18.276725,21.236424,25.794087,12.580127,20.944933,40.074504,15.392802,11.510075,22.569265,15.184298,20.044094,14.369338,17.507453,15.68135
5,16.609522,11.991132,18.027586,21.410752,22.474986,18.689764,24.027145,41.278358,15.16946,10.062891,21.979284,14.548864,19.585054,14.009836,17.385129,15.348335
6,17.26532,12.494632,17.829242,20.989871,20.117948,14.616276,24.084286,30.333113,16.228196,11.977976,23.279271,15.561749,19.967713,14.753366,17.507606,15.655987
7,17.587571,12.507658,18.079102,19.907525,26.304614,15.35897,19.821141,39.373682,16.310166,11.468405,22.604009,14.254074,19.386818,14.396107,17.373941,14.843276
8,14.925528,12.272454,17.917585,21.774129,30.245755,20.396297,27.975092,42.096845,15.015934,8.486154,19.68456,11.660995,18.066677,14.333597,17.504237,16.143366
9,17.201468,11.646392,17.514821,21.728806,30.96413,21.248655,28.549396,42.337578,11.121902,9.115854,17.940574,10.008397,20.069377,15.218386,18.311053,16.444981


# Matrix test 

Plan:
 - construct transition matrix
 - one step A->Z probability is given by M(A->Z) i.e. `M[0,-1]`
 - calculate 2+ step probabilities using initial state `[1,0,0]` times `M[:-1,:-1]` 1 or more times, times `M[:-1,-1]` for final transition to Z.

In [200]:
def first_n_steps(transition_matrix, n_steps=50, stop_tol=1e-6, stop_steps=5):
    M = transition_matrix
    results = {}
    results[1] = {'prob': M[0,-1], 'cum_prob': M[0,-1], 'expected_steps': M[0,-1], 'prob_ratio': np.nan}
    M_excl_Z = M[:-1,:-1]
    M_to_Z = M[:-1,-1]
    state = M[0,:-1]
    tol_steps = 0
    for i in range(2, n_steps + 1):
        results[i] = {}
        results[i]['prob'] = state @ M_to_Z
        results[i]['cum_prob'] = results[i-1]['cum_prob'] + results[i]['prob']
        results[i]['expected_steps'] = results[i-1]['expected_steps'] + (results[i]['prob'] * i)
        results[i]['prob_ratio'] = (results[i]['prob'] / results[i-1]['prob'])
        
        if abs(results[i]['prob_ratio'] - results[i-1]['prob_ratio']) < stop_tol:
            tol_steps += 1
            if tol_steps == stop_steps:
                results['final'] = {}
                infinite_sum_multiplier = 1 / (1 - results[i]['prob_ratio'])
                results['final']['prob'] = results[i]['prob'] * results[i]['prob_ratio'] * infinite_sum_multiplier
                results['final']['cum_prob'] = results[i]['cum_prob'] + results['final']['prob']
                results['final']['expected_steps'] = (results[i]['expected_steps']
                                                      + i * results['final']['prob']
                                                      + results['final']['prob'] * infinite_sum_multiplier)
                return results
        else:
            tol_steps = 0
        
        state = state @ M_excl_Z
    return results

In [227]:
def cycle_through_matrix_constructors(M):
    assert M.shape[0] == M.shape[1], "Matrix not square"
    idx = list(range(M.shape[0]))
    for a in idx:
        for z in idx:
            idx_excl_z = idx.copy()
            idx_excl_z.pop(z)
            if a != z:
                M_excl_Z = M[idx_excl_z, :][:, idx_excl_z]
                M_to_Z = M[:, z][idx_excl_z]
                state = M[a, :][idx_excl_z]
                print(M_excl_Z)
                print(M_to_Z)
                print(state)
                print('\n')

In [232]:
def get_components(M, a, z):
    idx = list(range(M.shape[0]))
    idx.pop(z)
    M_excl_Z = M[idx, :][:, idx]
    M_to_Z = M[:, z][idx]
    initial_state = M[a, :][idx]
    return M_excl_Z, M_to_Z, initial_state

In [234]:
def get_eds_for_all_pairs(M, max_steps=50, stop_tol=1e-6, stop_steps=5):
    assert M.shape[0] == M.shape[1], "Matrix not square"
    eds = {}
    dim = M.shape[0]
    
    for a in range(dim):
        eds[a] = {}
        
        for z in range(dim):
            M_excl_Z, M_to_Z, state = get_components(M, a, z)
            results = {}
            results[1] = {'prob': M[a, z], 'cum_prob': M[a, z], 'expected_steps': M[a, z], 'prob_ratio': np.nan}
            tol_steps = 0
            
            for i in range(2, max_steps + 1):
                results[i] = {}
                results[i]['prob'] = state @ M_to_Z
                results[i]['cum_prob'] = results[i-1]['cum_prob'] + results[i]['prob']
                results[i]['expected_steps'] = results[i-1]['expected_steps'] + (results[i]['prob'] * i)
                results[i]['prob_ratio'] = (results[i]['prob'] / results[i-1]['prob'])

                if abs(results[i]['prob_ratio'] - results[i-1]['prob_ratio']) < stop_tol:
                    tol_steps += 1
                    if tol_steps == stop_steps:
                        results['final'] = {}
                        infinite_sum_multiplier = 1 / (1 - results[i]['prob_ratio'])
                        results['final']['prob'] = results[i]['prob'] * results[i]['prob_ratio'] * infinite_sum_multiplier
                        results['final']['cum_prob'] = results[i]['cum_prob'] + results['final']['prob']
                        results['final']['expected_steps'] = (results[i]['expected_steps']
                                                              + i * results['final']['prob']
                                                              + results['final']['prob'] * infinite_sum_multiplier)
                        eds[a][z] = results['final']['expected_steps']
                        break
                else:
                    tol_steps = 0

                state = state @ M_excl_Z
                
    return eds

In [241]:
# %%timeit
pd.DataFrame(get_eds_for_all_pairs(transitions)).T



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,14.443942,28.710449,7.549457,11.012544,9.874714,8.596228,10.11815,6.289011,5.75174,5.632527
1,11.252007,29.540637,7.691356,11.957639,10.206575,10.811805,8.592634,7.259456,5.026139,5.586733
2,13.433899,29.022193,8.182971,9.990665,7.559857,11.16819,9.843484,7.811278,6.918509,4.128587
3,14.108981,29.037933,7.507504,12.028705,9.29547,11.139101,10.28122,7.225979,4.782918,4.223497
4,14.031272,28.102385,7.242441,10.120216,10.404618,9.939004,10.049774,6.634502,6.643854,4.661493
5,13.590174,29.024848,7.312602,10.664192,10.266496,11.699195,10.100099,7.764108,5.531668,3.514954
6,11.269977,29.386371,7.355559,11.001095,9.676392,8.035773,10.915762,7.446471,6.974977,5.093724
7,13.172853,25.613987,6.847469,11.098304,9.327003,9.904162,9.92039,8.162063,6.493152,5.039184
8,14.239409,28.943853,5.657365,10.522777,8.401622,11.230875,9.715934,6.268096,7.562209,5.599404
9,11.669278,27.28568,7.027284,11.860818,10.197254,10.381638,8.400442,6.756422,6.294091,6.111597


In [233]:
get_components(transitions, 2, 3)

(array([[0. , 0.2, 0.3, 0.1],
        [0.5, 0. , 0.1, 0.2],
        [0.1, 0.5, 0. , 0.1],
        [0.2, 0.2, 0.2, 0.2]]),
 array([0.4, 0.2, 0.3, 0.2]),
 array([0.1, 0.5, 0. , 0.1]))

In [238]:
transitions

array([[0.        , 0.01322084, 0.04075287, 0.06848171, 0.05015461,
        0.26229575, 0.08131108, 0.26005575, 0.21802029, 0.00570708],
       [0.20441002, 0.        , 0.        , 0.        , 0.01079509,
        0.        , 0.20736317, 0.05383926, 0.36745164, 0.15614082],
       [0.04474023, 0.00430093, 0.        , 0.18142681, 0.32928149,
        0.        , 0.06757087, 0.        , 0.03699898, 0.33568069],
       [0.00248358, 0.        , 0.        , 0.        , 0.11151293,
        0.04845721, 0.        , 0.05458342, 0.40081245, 0.38215042],
       [0.        , 0.02935697, 0.11679737, 0.17261359, 0.        ,
        0.14286508, 0.06104638, 0.22139781, 0.06383898, 0.19208382],
       [0.01909868, 0.00345776, 0.06494329, 0.14233525, 0.01666774,
        0.        , 0.00638546, 0.        , 0.25155595, 0.49555588],
       [0.23352083, 0.        , 0.12992722, 0.07026483, 0.09568944,
        0.28859237, 0.        , 0.09620371, 0.        , 0.0858016 ],
       [0.05834531, 0.14039206, 0.1849157

In [217]:
idx = list(range(transitions.shape[0]))
idx.pop(0)
idx

[1, 2, 3, 4]

In [220]:
transitions[[1,2,3]; [1,2,3]]

SyntaxError: invalid syntax (<ipython-input-220-f9357c9bcdf3>, line 1)

In [228]:
cycle_through_matrix_constructors(transitions)

[[0.  0.3 0.4 0.1]
 [0.1 0.  0.3 0.1]
 [0.2 0.6 0.  0.2]
 [0.2 0.2 0.2 0.2]]
[0.2 0.5 0.  0.2]
[0.  0.3 0.4 0.1]


[[0.  0.2 0.4 0.1]
 [0.5 0.  0.2 0.2]
 [0.2 0.  0.  0.2]
 [0.2 0.2 0.2 0.2]]
[0.3 0.1 0.6 0.2]
[0.  0.2 0.4 0.1]


[[0.  0.2 0.3 0.1]
 [0.5 0.  0.1 0.2]
 [0.1 0.5 0.  0.1]
 [0.2 0.2 0.2 0.2]]
[0.4 0.2 0.3 0.2]
[0.  0.2 0.3 0.1]


[[0.  0.2 0.3 0.4]
 [0.5 0.  0.1 0.2]
 [0.1 0.5 0.  0.3]
 [0.2 0.  0.6 0. ]]
[0.1 0.2 0.1 0.2]
[0.  0.2 0.3 0.4]


[[0.  0.1 0.2 0.2]
 [0.5 0.  0.3 0.1]
 [0.  0.6 0.  0.2]
 [0.2 0.2 0.2 0.2]]
[0.5 0.1 0.2 0.2]
[0.  0.1 0.2 0.2]


[[0.  0.2 0.4 0.1]
 [0.5 0.  0.2 0.2]
 [0.2 0.  0.  0.2]
 [0.2 0.2 0.2 0.2]]
[0.3 0.1 0.6 0.2]
[0.5 0.  0.2 0.2]


[[0.  0.2 0.3 0.1]
 [0.5 0.  0.1 0.2]
 [0.1 0.5 0.  0.1]
 [0.2 0.2 0.2 0.2]]
[0.4 0.2 0.3 0.2]
[0.5 0.  0.1 0.2]


[[0.  0.2 0.3 0.4]
 [0.5 0.  0.1 0.2]
 [0.1 0.5 0.  0.3]
 [0.2 0.  0.6 0. ]]
[0.1 0.2 0.1 0.2]
[0.5 0.  0.1 0.2]


[[0.  0.1 0.2 0.2]
 [0.5 0.  0.3 0.1]
 [0.  0.6 0.  0.2]
 [0.2 0.2 0.2 0.2]]
[0.

In [170]:
results = first_n_steps(transitions)

  


In [171]:
pd.DataFrame(results).T

Unnamed: 0,cum_prob,expected_steps,prob,prob_ratio
1,0.0,0.0,0.0,
2,0.000317,0.000634,0.000317,inf
3,0.000409,0.000911,9.2e-05,0.291833
4,0.000503,0.001287,9.4e-05,1.01705
5,0.000598,0.001758,9.4e-05,1.002642
6,0.000692,0.002324,9.4e-05,0.999584
7,0.000786,0.002983,9.4e-05,0.999938
8,0.00088,0.003737,9.4e-05,0.99991
9,0.000974,0.004585,9.4e-05,0.999905
10,0.001069,0.005527,9.4e-05,0.999906


In [93]:
def rand_square_array(size=10, sparsify=1.0):
    sq = np.random.exponential(size=(size, size))
    if 0.0 < sparsify < 1.0:
        sq = sq * np.random.binomial(1, sparsify, size=(size, size))
    np.fill_diagonal(sq, 0)
    sq_normed = sq / sq.sum(axis=1)[:,None]
    return sq_normed

In [237]:
transitions = rand_square_array(size=10, sparsify=0.8)

In [168]:
transitions[:,-1].sum()

0.9453368080055926

In [169]:
transitions.max().max()

0.07436464434211604

In [69]:
np.divide(test, test.sum(axis=1)[:,None]).sum(axis=1)

array([1., 1., 1.])

In [203]:
transitions = np.array([
    [0.0, 0.2, 0.3, 0.4, 0.1],
    [0.5, 0.0, 0.1, 0.2, 0.2],
    [0.1, 0.5, 0.0, 0.3, 0.1],
    [0.2, 0.0, 0.6, 0.0, 0.2],
    [0.2, 0.2, 0.2, 0.2, 0.2]
])

([1, 2, 3], None)