# Robustness Checks

In [2]:
# Data handling
import pandas as pd
import numpy as np
import os

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl 

# Notebook-wide figure defaults: 200 dpi, top and right axis spines hidden
mpl.rcParams['figure.dpi']= 200
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False

from skmisc.loess import loess

# Turn on rpy2's pandas conversion so DataFrames can be handed to the %%R cells
from rpy2.robjects import pandas2ri
pandas2ri.activate()

# (Re)load the rpy2 IPython extension that provides the %%R cell magic
%reload_ext rpy2.ipython

## Load Data

In [3]:
# Project root = parent of the notebook's working directory.
# os.path.dirname respects the platform's path separator and returns the root
# itself (not an empty string) when the working directory sits directly under
# the filesystem root, so the joined path below stays absolute.
MAINDIR = os.path.dirname(os.getcwd())
# Location of the vendor-item-period file read in the next cell
file = os.path.join(MAINDIR, 'data/analysis', 'vendor_item_period.csv')

In [4]:
# Read the file without promoting any column to the index, then discard the
# leading column.
# NOTE(review): the leading column is presumably a row index written out by an
# earlier export step; confirm against the file.
df = pd.read_csv(file, index_col=False).pipe(
    lambda frame: frame.drop(columns=frame.columns[0]))

In [5]:
# Show which columns are available in the frame loaded above
df.columns

Index(['vendor', 'item', 'w', 'rating_m', 'reputation_m', 'price_usd_m',
       'delta_t_m', 'neg_rating_m', 'count_min', 'neg_count_min',
       'pos_count_min', 'rtime_dt_min', 'price_usd_s', 'delta_t_s',
       'cum_count', 'arf', 'international_shipment', 'item_count',
       'item_count_w', 'category_count', 'category_count_w'],
      dtype='object')

In [28]:
# make ARM variable
#
# Each row of `df` is compared against median cut-offs on three measures
# (delta_t_m, price_usd_m, delta_t_s). "Internal" cut-offs (*_low_s) are the
# vendor's own medians; "external" cut-offs (*_low) are medians over all rows.
# A (vendor, w) pair is flagged when at least one of its rows lies strictly
# below every cut-off required by the specification.

# internal cut-offs: per-vendor medians
t_cut_per_seller = df.groupby('vendor')['delta_t_m'].quantile(.5).to_dict()
s_cut_per_seller = df.groupby('vendor')['delta_t_s'].quantile(.5).to_dict()
p_cut_per_seller = df.groupby('vendor')['price_usd_m'].quantile(.5).to_dict()
# external cut-offs: whole-sample medians
t_cut = df['delta_t_m'].quantile(1/2)
p_cut = df['price_usd_m'].quantile(1/2)
s_cut = df['delta_t_s'].quantile(1/2)

# Row-level flags, computed once and shared by every specification.
arm_flags = df[['vendor', 'w']].assign(
    t_low_s = df['delta_t_m'] < df['vendor'].map(t_cut_per_seller),
    p_low_s = df['price_usd_m'] < df['vendor'].map(p_cut_per_seller),
    s_low_s = df['delta_t_s'] < df['vendor'].map(s_cut_per_seller),
    t_low   = df['delta_t_m'] < t_cut,
    p_low   = df['price_usd_m'] < p_cut,
    s_low   = df['delta_t_s'] < s_cut)

# All specifications require the three internal flags; they differ only in
# which external flags are required in addition.
internal_flags = ['t_low_s', 'p_low_s', 's_low_s']
arm_specs = {
    'arm_i':  [],                           # internal cut-offs only
    'arm_t':  ['t_low'],                    # + external delta_t_m cut-off
    'arm_p':  ['p_low'],                    # + external price cut-off
    'arm_tp': ['t_low', 'p_low'],           # + both of the above
    'arm_a':  ['t_low', 'p_low', 's_low'],  # + all three external cut-offs
}

arm_flags = arm_flags.assign(**{
    name: arm_flags[internal_flags + extra].all(axis = 1)
    for name, extra in arm_specs.items()})

# Collapse to one row per (vendor, w): flagged if any row in the cell is flagged.
# groupby sorts by (vendor, w), so rows within a vendor are in ascending w.
arm = arm_flags\
    .groupby(['vendor', 'w'])[list(arm_specs)].max()\
    .reset_index()

# Shift Dependent variables with one week
# groupby(...).shift keeps the original row index, so each lagged value lands
# on the row it belongs to without any positional re-alignment.
# NOTE(review): the shift is by one observed row per vendor, which equals one
# week only when the vendor has no gaps in `w` -- confirm.
for col in ['arm_tp', 'arm_p', 'arm_t', 'arm_a', 'arm_i']:
    arm['{0}_shift'.format(col)] = arm.groupby('vendor')[col].shift(1)

### Merge internally specified ARM with vendor-week data

In [29]:
# import person week file
df = pd.read_pickle(os.path.join(MAINDIR, 'data/analysis', 'vendor_week.pickle'))
df = df.merge(df.groupby('vendor')['me_min'].min().rename('me'), on = 'vendor')
df = df.rename(columns= {
    'international_shipment_count_w': 'int_shipment_count_w', 
    'international_shipment': 'int_shipment'})

In [30]:
# Attach the ARM indicators to the vendor-week panel. This is an inner join,
# so (vendor, w) pairs present on only one side are dropped.
df = pd.merge(df, arm, how = 'inner', on = ['vendor', 'w'])

In [32]:
from sklearn.preprocessing import PowerTransformer
# NOTE(review): called without arguments, PowerTransformer uses the
# Yeo-Johnson method and standardizes the result; the 'bc_' prefix used below
# suggests Box-Cox -- confirm the prefix is only a label.
pt = PowerTransformer()

# normalize variables
skewed_variables = [
    'neg_count_min', 'pos_count_min', 'neg_count_w', 
    'neg_count_w_shift', 'pos_count_w', 'pos_count_w_shift', 'sales_volume_w', 
    'item_count_w', 'int_shipment_count_w']

# Power-transformed copies ('bc_' prefix). fit_transform returns a bare array,
# so the source index is re-attached explicitly; without it the frame gets a
# fresh 0..n-1 index and the column-wise concat below would misalign rows
# whenever `df` is not range-indexed.
bc_vars = pd.DataFrame(
    pt.fit_transform(df[skewed_variables]),
    columns = ['bc_' + var for var in skewed_variables],
    index = df.index)

# Untransformed copies of the same variables ('c_' prefix)
c_vars = df[skewed_variables].add_prefix('c_')

# Quantile-based categories: terciles for item counts, median split for sales
dummy_vars = df.assign(
        items   = pd.qcut(df['item_count'], 3,  labels=["low", "medium", "high"]),
        items_w = pd.qcut(df['item_count_w'], 3,  labels=["low", "medium", "high"]),
        sales   = pd.qcut(df['cum_count'], 2,  labels=["low", "high"]),
        sales_w = pd.qcut(df['sales_volume_w'], 2,  labels=["low", "high"]))\
    .loc[:,['items', 'items_w', 'sales', 'sales_w']]

# Numeric covariates on their original scale. `columns=` is required here:
# a bare mapping passed to rename() targets index labels, so 'cum_count' was
# never actually renamed to 'sales_volume'.
n_vars = df[['cum_count', 'sales_volume_w', 'item_count', 
             'item_count_w', 'me', 'int_shipment_count_w',
             'empty_stock_last_week_count_w', 'empty_stock_last_week_maxw']]\
    .rename(columns = {'cum_count': 'sales_volume'})

# Indicator variables, cast to 0/1 integers
bool_vars = df[
    ['arf', 'arm_maxw', 'arm_maxw_shift', 'arm_i', 'arm_a', 'arm_t', 'arm_p', 'arm_tp',
     'int_shipment', 'has_price_drop', 'has_price_drop_shift']].astype('int')

# Grouping / level identifiers
lev_vars =  df[['vendor','w', 'me_min']]

# Column-wise concat aligns on the row index, which every piece shares with `df`
data = pd.concat([lev_vars, bool_vars, bc_vars, c_vars, dummy_vars, n_vars], axis = 1)

In [33]:
%%R

library('lme4')
library('margins')
library("performance")
library('tidyverse')

# Variance shares (ICC-style) for a fitted lme4 model with two random
# intercepts.
#
# Arguments:
#   m      fitted model; the first two elements of getME(m, "theta") are
#          squared and used as the seller- and week-level variances.
#          NOTE(review): assumes theta[1] belongs to the vendor term and
#          theta[2] to the week term -- confirm for the model passed in.
#   type   'poisson' or 'binomial'; selects the residual-level term `alpha`.
#   lambda value used in the poisson term log(1 + 1/lambda); defaults to the
#          previously hard-coded .139.
#
# Returns a list with the shares of seller (s), week (w), seller + week (t)
# and residual-level (a) variance in the total.
getICCs <- function(m, type = 'poisson', lambda = .139){
  # Fail early with a clear message on an unsupported type, instead of
  # reaching the ICC computation with `alpha` undefined.
  type <- match.arg(type, c('poisson', 'binomial'))

  theta <- getME(m, "theta")
  var_s <- as.numeric(theta[1]^2) # seller level variance
  var_w <- as.numeric(theta[2]^2) # week level variance

  alpha <- switch(type,
    poisson  = log(1 + 1/lambda),
    binomial = (pi^2) / 3
  )

  total <- var_s + var_w + alpha

  icc <- list(
    s = (var_s) / total,
    w = (var_w) / total,
    t = (var_s + var_w) / total,
    a = (alpha) / total
  )

  return(icc)
}

In [34]:
%%R -i data

# Standardized linear and squared terms for `me` and `w`
data['me_'] <- scale(data$me)[,1]
data['w_'] <- scale(data$w)[,1]
data['me_2'] <- scale(data$me^2)[,1]
data['w_2'] <- scale(data$w^2)[,1]

variables = c(
    'bc_pos_count_w', 'bc_item_count_w', 'bc_int_shipment_count_w',
    'bc_neg_count_w', 'c_neg_count_w', 'c_pos_count_w')

# Split each variable into a per-vendor mean (<var>.m) and the row's deviation
# from that mean (<var>.dev). ave() computes the group means in one pass per
# variable instead of re-scanning the whole frame once per vendor.
for (var in variables){
  vendor_mean <- ave(data[[var]], data$vendor, FUN = mean)

  data[[paste(var, "m", sep=".")]] <- vendor_mean
  data[[paste(var, "dev", sep=".")]] <- data[[var]] - vendor_mean
}

# Optimizer settings shared by the glmer fits: bobyqa, up to 2e5 evaluations
Optimizer <- glmerControl(optimizer = "bobyqa",
                          optCtrl = list(maxfun=2e5))

## Robustness of ARM with internal specification

### Internal and external cut-offs

In [12]:
%%R
# Binomial mixed model for `arm_a` (internal cut-offs plus all three external
# cut-offs) with crossed random intercepts for vendor and week.
model <- glmer(
  formula = arm_a ~ arf
    # negative feedback: vendor mean, within-vendor deviation, and both x arf
    + bc_neg_count_w.m + bc_neg_count_w.dev
    + bc_neg_count_w.m:arf + bc_neg_count_w.dev:arf
    # remaining covariates: vendor mean and within-vendor deviation
    + bc_pos_count_w.m + bc_pos_count_w.dev
    + bc_item_count_w.m + bc_item_count_w.dev
    + bc_int_shipment_count_w.m + bc_int_shipment_count_w.dev
    # standardized linear and squared terms
    + me_ + me_2
    + w_ + w_2
    # random intercepts
    + (1 | vendor) + (1 | w),
  data = data,
  family = binomial,
  control = Optimizer
)

summary(model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: 
arm_a ~ arf + bc_neg_count_w.m + bc_neg_count_w.dev + bc_neg_count_w.m:arf +  
    bc_neg_count_w.dev:arf + bc_pos_count_w.m + bc_pos_count_w.dev +  
    bc_item_count_w.m + bc_item_count_w.dev + bc_int_shipment_count_w.m +  
    bc_int_shipment_count_w.dev + me_ + me_2 + w_ + w_2 + (1 |  
    vendor) + (1 | w)
   Data: data
Control: Optimizer

     AIC      BIC   logLik deviance df.resid 
  7585.8   7730.4  -3774.9   7549.8    22759 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-3.540 -0.119 -0.026 -0.003 45.636 

Random effects:
 Groups Name        Variance Std.Dev.
 vendor (Intercept) 2.02745  1.4239  
 w      (Intercept) 0.03264  0.1807  
Number of obs: 22777, groups:  vendor, 902; w, 50

Fixed effects:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -6.26886    0.16752 -37.421  < 2e-16 ***
a

R[write to console]: 
Correlation matrix not shown by default, as p = 16 > 12.
Use print(object, correlation=TRUE)  or
    vcov(object)        if you need it




In [36]:
%%R
# Binomial mixed model for `arm_p` (internal cut-offs plus the external price
# cut-off) with crossed random intercepts for vendor and week.
# NOTE(review): the output recorded for this cell reports "Model failed to
# converge: degenerate Hessian" -- treat these estimates with caution.
model <- glmer(
  formula = arm_p ~ arf
    # negative feedback: vendor mean, within-vendor deviation, and both x arf
    + bc_neg_count_w.m + bc_neg_count_w.dev
    + bc_neg_count_w.m:arf + bc_neg_count_w.dev:arf
    # remaining covariates: vendor mean and within-vendor deviation
    + bc_pos_count_w.m + bc_pos_count_w.dev
    + bc_item_count_w.m + bc_item_count_w.dev
    + bc_int_shipment_count_w.m + bc_int_shipment_count_w.dev
    # standardized linear and squared terms
    + me_ + me_2
    + w_ + w_2
    # random intercepts
    + (1 | vendor) + (1 | w),
  data = data,
  family = binomial,
  control = Optimizer
)

summary(model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: 
arm_p ~ arf + bc_neg_count_w.m + bc_neg_count_w.dev + bc_neg_count_w.m:arf +  
    bc_neg_count_w.dev:arf + bc_pos_count_w.m + bc_pos_count_w.dev +  
    bc_item_count_w.m + bc_item_count_w.dev + bc_int_shipment_count_w.m +  
    bc_int_shipment_count_w.dev + me_ + me_2 + w_ + w_2 + (1 |  
    vendor) + (1 | w)
   Data: data
Control: Optimizer

     AIC      BIC   logLik deviance df.resid 
 12629.3  12773.9  -6296.6  12593.3    22759 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.5273 -0.3428 -0.2013 -0.1199 15.7258 

Random effects:
 Groups Name        Variance Std.Dev.
 vendor (Intercept) 1.25007  1.118   
 w      (Intercept) 0.03763  0.194   
Number of obs: 22777, groups:  vendor, 902; w, 50

Fixed effects:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -3.01975    0.06552 -46.088  < 2

R[write to console]: 
Correlation matrix not shown by default, as p = 16 > 12.
Use print(object, correlation=TRUE)  or
    vcov(object)        if you need it




convergence code: 0
unable to evaluate scaled gradient
Model failed to converge: degenerate  Hessian with 1 negative eigenvalues



In [35]:
%%R
# Binomial mixed model for `arm_tp` (internal cut-offs plus the external
# delta_t_m and price cut-offs) with crossed random intercepts for vendor and
# week.
model <- glmer(
  formula = arm_tp ~ arf
    # negative feedback: vendor mean, within-vendor deviation, and both x arf
    + bc_neg_count_w.m + bc_neg_count_w.dev
    + bc_neg_count_w.m:arf + bc_neg_count_w.dev:arf
    # remaining covariates: vendor mean and within-vendor deviation
    + bc_pos_count_w.m + bc_pos_count_w.dev
    + bc_item_count_w.m + bc_item_count_w.dev
    + bc_int_shipment_count_w.m + bc_int_shipment_count_w.dev
    # standardized linear and squared terms
    + me_ + me_2
    + w_ + w_2
    # random intercepts
    + (1 | vendor) + (1 | w),
  data = data,
  family = binomial,
  control = Optimizer
)

summary(model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: 
arm_tp ~ arf + bc_neg_count_w.m + bc_neg_count_w.dev + bc_neg_count_w.m:arf +  
    bc_neg_count_w.dev:arf + bc_pos_count_w.m + bc_pos_count_w.dev +  
    bc_item_count_w.m + bc_item_count_w.dev + bc_int_shipment_count_w.m +  
    bc_int_shipment_count_w.dev + me_ + me_2 + w_ + w_2 + (1 |  
    vendor) + (1 | w)
   Data: data
Control: Optimizer

     AIC      BIC   logLik deviance df.resid 
  9016.9   9161.5  -4490.4   8980.9    22759 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.8650 -0.2449 -0.1109 -0.0548 19.1832 

Random effects:
 Groups Name        Variance Std.Dev.
 vendor (Intercept) 2.29734  1.516   
 w      (Intercept) 0.05476  0.234   
Number of obs: 22777, groups:  vendor, 902; w, 50

Fixed effects:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -4.23575    0.11393 -37.178  < 

R[write to console]: 
Correlation matrix not shown by default, as p = 16 > 12.
Use print(object, correlation=TRUE)  or
    vcov(object)        if you need it




### Optimized operationalization of ARM
- excluding the cutoff for the overall selling price
    - this assumes that a seller's definition of "cheap" is internal, i.e. relative to that seller's own prices
- excluding the cutoff for the overall standard deviation of delta time
    - this assumes that the staticity of reputation inflation is internally defined

In [24]:
%%R
# Binomial mixed model for `arm_t` (internal cut-offs plus the external
# delta_t_m cut-off only) with crossed random intercepts for vendor and week.
# NOTE(review): the output recorded for this cell echoes the outcome as
# `arm_o`, so it comes from an earlier version of the cell -- re-run to refresh.
model <- glmer(
  formula = arm_t ~ arf
    # negative feedback: vendor mean, within-vendor deviation, and both x arf
    + bc_neg_count_w.m + bc_neg_count_w.dev
    + bc_neg_count_w.m:arf + bc_neg_count_w.dev:arf
    # remaining covariates: vendor mean and within-vendor deviation
    + bc_pos_count_w.m + bc_pos_count_w.dev
    + bc_item_count_w.m + bc_item_count_w.dev
    + bc_int_shipment_count_w.m + bc_int_shipment_count_w.dev
    # standardized linear and squared terms
    + me_ + me_2
    + w_ + w_2
    # random intercepts
    + (1 | vendor) + (1 | w),
  data = data,
  family = binomial,
  control = Optimizer
)

summary(model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: 
arm_o ~ arf + bc_neg_count_w.m + bc_neg_count_w.dev + bc_neg_count_w.m:arf +  
    bc_neg_count_w.dev:arf + bc_pos_count_w.m + bc_pos_count_w.dev +  
    bc_item_count_w.m + bc_item_count_w.dev + bc_int_shipment_count_w.m +  
    bc_int_shipment_count_w.dev + me_ + me_2 + w_ + w_2 + (1 |  
    vendor) + (1 | w)
   Data: data
Control: Optimizer

     AIC      BIC   logLik deviance df.resid 
  9469.7   9614.3  -4716.9   9433.7    22759 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.8077 -0.2573 -0.1352 -0.0773 23.2620 

Random effects:
 Groups Name        Variance Std.Dev.
 vendor (Intercept) 1.70934  1.3074  
 w      (Intercept) 0.05802  0.2409  
Number of obs: 22777, groups:  vendor, 902; w, 50

Fixed effects:
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -3.85436    0.09639 -39.988  < 2

R[write to console]: 
Correlation matrix not shown by default, as p = 16 > 12.
Use print(object, correlation=TRUE)  or
    vcov(object)        if you need it




In [14]:
%%R
# Binomial mixed model for `arm_i` (internal, per-vendor cut-offs only) with
# crossed random intercepts for vendor and week.
model <- glmer(
  formula = arm_i ~ arf
    # negative feedback: vendor mean, within-vendor deviation, and both x arf
    + bc_neg_count_w.m + bc_neg_count_w.dev
    + bc_neg_count_w.m:arf + bc_neg_count_w.dev:arf
    # remaining covariates: vendor mean and within-vendor deviation
    + bc_pos_count_w.m + bc_pos_count_w.dev
    + bc_item_count_w.m + bc_item_count_w.dev
    + bc_int_shipment_count_w.m + bc_int_shipment_count_w.dev
    # standardized linear and squared terms
    + me_ + me_2
    + w_ + w_2
    # random intercepts
    + (1 | vendor) + (1 | w),
  data = data,
  family = binomial,
  control = Optimizer
)

summary(model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: 
arm_i ~ arf + bc_neg_count_w.m + bc_neg_count_w.dev + bc_neg_count_w.m:arf +  
    bc_neg_count_w.dev:arf + bc_pos_count_w.m + bc_pos_count_w.dev +  
    bc_item_count_w.m + bc_item_count_w.dev + bc_int_shipment_count_w.m +  
    bc_int_shipment_count_w.dev + me_ + me_2 + w_ + w_2 + (1 |  
    vendor) + (1 | w)
   Data: data
Control: Optimizer

     AIC      BIC   logLik deviance df.resid 
 20489.6  20634.2 -10226.8  20453.6    22759 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-35.267  -0.508  -0.095   0.535  39.719 

Random effects:
 Groups Name        Variance Std.Dev.
 vendor (Intercept) 1.20699  1.0986  
 w      (Intercept) 0.02409  0.1552  
Number of obs: 22777, groups:  vendor, 902; w, 50

Fixed effects:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                 -1.165494   0.057155 -20.392  <

R[write to console]: 
Correlation matrix not shown by default, as p = 16 > 12.
Use print(object, correlation=TRUE)  or
    vcov(object)        if you need it




In [39]:
# Share of rows flagged `arm_a` (the column is 0/1, so the mean is the share).
# A Series has no `.m` attribute; the scalar recorded below is what `.mean()`
# returns.
data['arm_a'].mean()

0.06273872766387145