In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Directory holding the analysis artefacts.
# NOTE(review): this is an absolute path on an external volume; it must be
# adjusted before the notebook can run on another machine.
PATH = "/Volumes/Extreme SSD/data/analysis"
file = os.path.join(PATH, 'vendor_week.pickle')

# NOTE(review): unpickling can execute arbitrary code -- only load pickles
# from a trusted source.
# Read the pickled frame and drop two columns in a single chained expression.
df = (
    pd.read_pickle(file)
    .drop(columns=['cum_count', 'rtime_dt_min'])
)

## Vendor

### Vendor characteristics
* `vendor` : an anonymized numeric identifier `[1:1040]`, created by factorizing the unique vendor nicknames.
* `m_maxw` : number of months after market opening at which the seller entered the market.
* `category_count` : number of categories in which the vendor has offered products in total.
* `item_count` : number of unique items that the vendor has offered in total.
* `arf` : seller was identified to have used ARF at market entry.
* `arf_bgm` : seller was identified to have used ARF at market entry by cluster analysis.


In [3]:
# Columns describing a vendor, and the summary statistics to report for them.
VENDOR_CHAR = ['vendor', 'm_maxw', 'category_count', 'item_count', 'arf', 'arf_bgm']
DESC = ['count', 'mean', 'std', 'min', 'max']

# Collapse the frame to one row per vendor by taking the per-vendor maximum of
# every listed column. Built once and reused for both reports below.
vendor_char = (
    df[VENDOR_CHAR]
    .groupby('vendor').max()
    .reset_index()
)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
vendor_char.info()
print()

# Cast the two ARF columns to int so that their mean reads as a proportion,
# then show the selected statistics formatted to three decimals.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map; kept here for compatibility with older versions.
print('DESCRIPTIVE STATISTICS:\n',
      vendor_char
          .astype({'arf' : int, 'arf_bgm' : int})
          .describe()
          .T[DESC]
          .applymap('{:.3f}'.format))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   vendor          1040 non-null   int64
 1   m_maxw          1040 non-null   int64
 2   category_count  1040 non-null   int64
 3   item_count      1040 non-null   int64
 4   arf             1040 non-null   bool 
 5   arf_bgm         1040 non-null   bool 
dtypes: bool(2), int64(4)
memory usage: 34.7 KB
None 

DESCRIPTIVE STATISTICS:
                    count     mean      std    min       max
vendor          1040.000  520.500  300.366  1.000  1040.000
m_maxw          1040.000    3.533    3.198  1.000    12.000
category_count  1040.000    3.203    1.819  1.000    10.000
item_count      1040.000   26.572   45.095  1.000   731.000
arf             1040.000    0.046    0.210  0.000     1.000
arf_bgm         1040.000    0.203    0.402  0.000     1.000


### Time varying variables
* `w` : denotes the weeks since the 11th sale in the market (post market entry phase) per vendor per week `[1:51]`. 
* `rating_m` : the average rating score calculated as a rolling mean per vendor per week.
* `reputation_m` : the average of the market provided reputation score per vendor per week.
* `price_usd_m` : the average price of all products sold per vendor per week.
* `delta_t_m` : the average time gaps between feedbacks per vendor per week.
* `neg_rating_m` : the proportion of negative ratings per vendor per week.
* `count_min` : the number of sales per vendor at the beginning of the week. 
* `neg_count_min` : the number of negative feedbacks per vendor at the beginning of the week.
* `pos_count_min` : the number of positive feedbacks per vendor at the beginning of the week. 
* `price_usd_s` : the standard deviation in selling prices of all products sold per vendor per week.
* `delta_t_s` : the standard deviation of the time gaps between feedbacks per vendor per week.
* `item_count_w` : the number of unique items on offer per vendor per week.
* `category_count_w`: the number of categories in which the vendor offers products per week.
* `price_diff_mw` : average proportional change in selling price per item per vendor per week.
* `arm_maxw` : vendor was identified to have used ARM in that week.
* `arm_bgm_maxw` : vendor was identified to have used ARM in that week by cluster analysis.
* `empty_stock_last_week_maxw` : vendor was identified to have had an empty stock last week.
* `has_price_drop` : vendor was identified to have lowered the selling price of an item by more than 10 percent in that week.

In [4]:
# Columns summarised in this section (one observation per row of `df`).
TIME_VARYING = ['w', 'rating_m', 'reputation_m', 'price_usd_m', 'delta_t_m',
                'neg_rating_m', 'count_min', 'neg_count_min', 'pos_count_min',
                'price_usd_s', 'delta_t_s', 'item_count_w', 'category_count_w',
                'price_diff_mw', 'arm_maxw', 'arm_bgm_maxw', 'international_shipment',
                'empty_stock_last_week_maxw', 'has_price_drop']

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df[TIME_VARYING].info()
print()

# Before summarising, values above 10e5 (i.e. 1,000,000) in the three price
# columns are replaced by NaN -- presumably to keep extreme outliers out of the
# statistics. For `price_diff_mw` the mask additionally names NaN and +inf
# explicitly; -inf is not covered by this mask.
# NOTE(review): `arm_bgm_maxw` is not cast to int below, unlike `arm_maxw` and
# the other listed columns -- confirm whether that omission is intentional.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map; kept here for compatibility with older versions.
# `DESC` is the list of statistics defined in the vendor-characteristics cell.
print('DESCRIPTIVE STATISTICS:\n',
      df[TIME_VARYING]
          .assign(price_usd_s = df['price_usd_s'].mask(df['price_usd_s'] > 10e5),
                  price_usd_m = df['price_usd_m'].mask(df['price_usd_m'] > 10e5),
                  price_diff_mw = df['price_diff_mw'].mask((df['price_diff_mw'].isin([np.nan, np.inf]))
                                                           | (df['price_diff_mw'] > 10e5)))
          .astype({'has_price_drop': int,
                   'empty_stock_last_week_maxw': int,
                   'arm_maxw': int,
                   'international_shipment': int})
          .describe()
          .T[DESC]
          .applymap('{:.3f}'.format))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24551 entries, 0 to 24550
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   w                           24551 non-null  int64  
 1   rating_m                    24551 non-null  float64
 2   reputation_m                24551 non-null  float64
 3   price_usd_m                 24551 non-null  float64
 4   delta_t_m                   24551 non-null  float64
 5   neg_rating_m                24551 non-null  float64
 6   count_min                   24551 non-null  int64  
 7   neg_count_min               24551 non-null  float64
 8   pos_count_min               24551 non-null  float64
 9   price_usd_s                 24551 non-null  float64
 10  delta_t_s                   24551 non-null  float64
 11  item_count_w                24551 non-null  int64  
 12  category_count_w            24551 non-null  int64  
 13  price_diff_mw               245