## Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

## Load the database

In [2]:
# Load base_pubs.csv (path is relative to the current working directory) into df.
df = pd.read_csv('base_pubs.csv')

In [3]:
df.shape

(208012, 7)

In [4]:
df.columns

Index(['CTLG_PROD_ID', 'SIT_SITE_ID', 'ITE_ITEM_ID', 'ITE_BASE_CURRENT_PRICE',
       'ITE_STATUS', 'CAT_CATEG_ID_L7', 'CAT_CATEG_ID_L1'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,CTLG_PROD_ID,SIT_SITE_ID,ITE_ITEM_ID,ITE_BASE_CURRENT_PRICE,ITE_STATUS,CAT_CATEG_ID_L7,CAT_CATEG_ID_L1
0,15914455.0,MLB,1730834725,209.8,active,196208,1000
1,,MLB,1932231953,80.5,active,1672,1648
2,16255367.0,MLB,1940727918,77.31,active,6777,1648
3,14727339.0,MLB,1756695472,13.38,active,1714,1648
4,6084026.0,MLB,1900682550,28.39,active,186456,1144


In [6]:
df.dtypes

CTLG_PROD_ID              float64
SIT_SITE_ID                object
ITE_ITEM_ID                 int64
ITE_BASE_CURRENT_PRICE    float64
ITE_STATUS                 object
CAT_CATEG_ID_L7             int64
CAT_CATEG_ID_L1             int64
dtype: object

## Drop rows with a missing product ID

In [7]:
# How many rows have no CTLG_PROD_ID?
countNaN = df['CTLG_PROD_ID'].isnull().sum()
countNaN

3277

In [8]:
# Remove, in place, every row whose CTLG_PROD_ID is missing.
df.dropna(axis=0, subset=['CTLG_PROD_ID'], inplace=True)

In [9]:
# Re-count missing product ids to confirm the drop left none behind.
countNaN = df['CTLG_PROD_ID'].isnull().sum()
countNaN

0

## Change data type

In [10]:
# Cast the catalog product id from float64 to a fixed-width 64-bit integer.
# This only works because rows with a missing id were already dropped (NaN
# cannot be cast to an integer dtype). 'int64' is spelled out because plain
# 'int' resolves to the platform default integer, which is 32-bit on some
# platforms / NumPy versions; the orders table stores this key as int64.
df['CTLG_PROD_ID'] = df['CTLG_PROD_ID'].astype('int64')

In [11]:
df['CTLG_PROD_ID'].dtypes

dtype('int64')

## Describe the prices of each product

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the item
# prices, computed separately for each catalog product id.
gb = df.groupby(by=['CTLG_PROD_ID'])
df_d = gb['ITE_BASE_CURRENT_PRICE'].describe()
df_d

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CTLG_PROD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
633,1.0,66.890000,,66.89,66.89,66.89,66.890,66.89
5020,1.0,43.910000,,43.91,43.91,43.91,43.910,43.91
11438,1.0,931.850000,,931.85,931.85,931.85,931.850,931.85
101022,1.0,432.520000,,432.52,432.52,432.52,432.520,432.52
802465,27.0,153.065556,18.054667,131.70,141.94,148.65,156.525,201.61
...,...,...,...,...,...,...,...,...
18209455,13.0,598.552308,195.217031,362.71,362.71,643.31,710.210,1008.65
18209881,1.0,248.630000,,248.63,248.63,248.63,248.630,248.63
18213136,1.0,187.130000,,187.13,187.13,187.13,187.130,187.13
18217813,3.0,192.546667,38.656023,163.02,170.67,178.32,207.310,236.30


## Drop rows with NaN std

In [13]:
# How many products have an undefined price standard deviation?
countNaN = df_d['std'].isnull().sum()
countNaN

2930

In [14]:
# Remove, in place, the products whose std is NaN (in the table above these
# are the products with a single price observation, count == 1).
df_d.dropna(axis=0, subset=['std'], inplace=True)

In [15]:
# Re-count NaN std values to confirm the drop left none behind.
countNaN = df_d['std'].isnull().sum()
countNaN

0

## Reset the index

In [16]:
# Move CTLG_PROD_ID from the index back into a regular column so it can be
# used as the merge key later on. Guarded so that re-running this cell is a
# no-op: an unguarded reset_index() on an already-reset frame would insert a
# spurious 'index' column.
if df_d.index.name == 'CTLG_PROD_ID':
    df_d = df_d.reset_index()

In [17]:
df_d.head()

Unnamed: 0,CTLG_PROD_ID,count,mean,std,min,25%,50%,75%,max
0,802465,27.0,153.065556,18.054667,131.7,141.94,148.65,156.525,201.61
1,802499,11.0,63.684545,7.481744,49.25,63.455,67.19,67.19,70.14
2,802500,9.0,75.371111,7.870711,60.67,70.6,77.27,81.44,83.46
3,802523,33.0,44.114545,7.16492,31.19,38.64,45.46,47.61,60.94
4,802544,47.0,106.392979,20.900336,74.61,97.76,99.4,109.84,181.78


In [18]:
df_d.shape

(10663, 9)

## Export the product IDs

In [19]:
# Disabled export: would write the CTLG_PROD_ID column of df_d to
# products_id.csv (without the index). Uncomment both lines to regenerate it.
#ID=df_d['CTLG_PROD_ID']
#ID.to_csv(r'products_id.csv', index = False)

## Load the orders database

In [20]:
# Load orders_prod_id.csv (path is relative to the current working directory) into df_or.
df_or = pd.read_csv('orders_prod_id.csv')

In [21]:
df_or.shape

(806506, 10)

In [22]:
df_or.columns

Index(['ORD_ORDER_ID', 'ITE_ITEM_ID', 'CTLG_PROD_ID', 'flag_bad_user',
       'is_buybox', 'has_bpp', 'has_claim_seller', 'is_refund', 'GMV_USD',
       'bpp_cashout_final'],
      dtype='object')

In [23]:
df_or.dtypes

ORD_ORDER_ID           int64
ITE_ITEM_ID            int64
CTLG_PROD_ID           int64
flag_bad_user          int64
is_buybox              int64
has_bpp                int64
has_claim_seller       int64
is_refund            float64
GMV_USD              float64
bpp_cashout_final    float64
dtype: object

In [24]:
# Preview five random orders; random_state is fixed so the same rows are
# shown on every Restart & Run All.
df_or.sample(5, random_state=0)

Unnamed: 0,ORD_ORDER_ID,ITE_ITEM_ID,CTLG_PROD_ID,flag_bad_user,is_buybox,has_bpp,has_claim_seller,is_refund,GMV_USD,bpp_cashout_final
160257,4548366648,1600451640,9488665,0,0,0,0,0.0,12.62,0.0
146461,4572480276,1858159245,6239418,0,1,0,0,0.0,149.33,0.0
470973,4560605834,1808465592,16951108,0,1,0,0,0.0,199.13,0.0
448206,4552306314,1614436800,11136916,0,1,0,0,0.0,23.92,0.0
185164,4595620238,1883400955,16211423,0,1,0,0,0.0,25.58,0.0


## Join the DFs by Product ID

In [25]:
# Attach each product's price statistics (df_d) to every order of that
# product (df_or). Inner join: orders whose product has no statistics row
# are dropped.
# validate='one_to_many' makes pandas raise MergeError if df_d ever contains
# a repeated CTLG_PROD_ID, which would otherwise silently duplicate orders.
df_f = pd.merge(
    left=df_d,
    right=df_or,
    how='inner',
    on='CTLG_PROD_ID',
    validate='one_to_many',
)

In [26]:
df_f.shape

(806506, 18)

In [27]:
df_f.columns

Index(['CTLG_PROD_ID', 'count', 'mean', 'std', 'min', '25%', '50%', '75%',
       'max', 'ORD_ORDER_ID', 'ITE_ITEM_ID', 'flag_bad_user', 'is_buybox',
       'has_bpp', 'has_claim_seller', 'is_refund', 'GMV_USD',
       'bpp_cashout_final'],
      dtype='object')

In [28]:
# Preview five random merged rows; random_state is fixed so the same rows
# are shown on every Restart & Run All.
df_f.sample(5, random_state=0)

Unnamed: 0,CTLG_PROD_ID,count,mean,std,min,25%,50%,75%,max,ORD_ORDER_ID,ITE_ITEM_ID,flag_bad_user,is_buybox,has_bpp,has_claim_seller,is_refund,GMV_USD,bpp_cashout_final
550218,15912271,4.0,558.985,111.685674,419.98,494.8,574.76,638.945,666.44,4590393500,1689813033,0,1,0,0,0.0,546.7,0.0
603801,15984003,77.0,71.754026,8.213587,58.96,70.77,71.03,74.65,114.67,4581008947,1883031524,0,1,0,0,0.0,60.76,0.0
385609,15257472,29.0,21.582759,9.554278,11.87,14.89,18.07,26.15,50.78,4538383569,1691805336,0,0,0,0,0.0,14.79,0.0
449867,15471717,27.0,74.837037,144.466065,40.17,43.555,46.08,49.54,797.2,4603669116,1884979644,0,1,0,0,0.0,45.52,0.0
463902,15570789,2.0,32.52,0.0,32.52,32.52,32.52,32.52,32.52,4583440078,1627562634,0,1,0,0,0.0,28.38,0.0


## Create low-price flag columns for orders whose GMV is at or below the product's 25th and 50th percentile price

In [29]:
# Flag orders whose GMV_USD is at or below the product's 25th-percentile
# price (1 = flagged, 0 = otherwise). A vectorised comparison replaces the
# row-wise apply: same result, without one Python call per row.
# Comparisons involving NaN evaluate to False, so such rows get 0, exactly
# as the original lambda did.
# NOTE(review): assumes GMV_USD and the price quantiles are in the same
# currency and both per unit — TODO confirm.
df_f['Low25_Price'] = (df_f['25%'] >= df_f['GMV_USD']).astype('int64')

In [30]:
# Flag orders whose GMV_USD is at or below the product's median (50%) price
# (1 = flagged, 0 = otherwise). A vectorised comparison replaces the
# row-wise apply: same result, without one Python call per row.
# Comparisons involving NaN evaluate to False, so such rows get 0, exactly
# as the original lambda did.
# NOTE(review): assumes GMV_USD and the price quantiles are in the same
# currency and both per unit — TODO confirm.
df_f['Low50_Price'] = (df_f['50%'] >= df_f['GMV_USD']).astype('int64')

In [31]:
# Preview five random rows including the new flag columns; random_state is
# fixed so the same rows are shown on every Restart & Run All.
df_f.sample(5, random_state=0)

Unnamed: 0,CTLG_PROD_ID,count,mean,std,min,25%,50%,75%,max,ORD_ORDER_ID,ITE_ITEM_ID,flag_bad_user,is_buybox,has_bpp,has_claim_seller,is_refund,GMV_USD,bpp_cashout_final,Low25_Price,Low50_Price
800324,17806731,44.0,22.441818,21.380912,2.28,3.775,18.615,35.66,60.62,4550306669,1872082483,0,0,0,0,0.0,51.99,0.0,0,0
47721,6121297,72.0,85.080556,118.072867,37.99,54.975,68.03,81.07,1054.76,4589856034,1562811951,0,1,0,0,0.0,45.64,0.0,1,1
168550,8019979,11.0,87.200909,142.211125,28.44,38.295,47.34,53.815,515.24,4541858899,1853108457,0,1,0,0,0.0,30.88,0.0,1,1
749488,16995137,5.0,4.094,1.198282,3.17,3.38,3.78,3.98,6.16,4580810397,1871437843,0,1,0,0,0.0,3.78,0.0,0,1
324545,14682774,34.0,326.411765,282.600523,133.58,169.4825,176.37,373.59,1562.78,4561812016,1830040927,0,0,0,0,0.0,457.8,0.0,0,0


In [None]:
# Share of orders flagged under each low-price threshold (the mean of a 0/1
# column is the fraction of rows flagged). Calling plot(kind='bar') on the
# whole frame would try to draw one bar per row for every numeric column,
# which is neither readable nor practical for a frame of this size.
# NOTE(review): the intended chart is not stated in the notebook — confirm
# this is the comparison that was wanted.
ax = df_f[['Low25_Price', 'Low50_Price']].mean().plot(kind='bar', figsize=(10, 6))
ax.set(title='Share of orders at or below product price quantiles',
       xlabel='Flag', ylabel='Share of orders');

In [None]:
# NOTE(review): disabled snippet; df_ratios is not defined in the visible cells — presumably a template kept for reference, confirm before removing.
#df_ratios['extremos']= df_ratios.apply(lambda x: 'caso1' if x['ratio_recla_1m']> 0.3 and x['ratio_cashout_1m']> 0.3 and x['ratio_ref_1m']> 0.3 and x['ratio_claims_1m']> 0.3 else 'no',axis=1)


In [None]:
# NOTE(review): disabled snippet; the gmv_90d / gmv_30d columns it reads are not among df's columns shown earlier — presumably a template kept for reference, confirm before removing.
#df['adicionales']= df.apply(lambda x: 'caso1' if x['gmv_90d']<= 0 else('caso2' if  x['gmv_30d']<=0  else 'no'),axis=1)
