## Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

## Load the base dataset

In [2]:
# Read base_pubs.csv (path relative to the working directory) into `df`.
df = pd.read_csv(r'base_pubs.csv')

In [3]:
# Sanity check: (rows, columns) of the frame right after loading.
df.shape

(208012, 7)

In [4]:
# List the column labels available in `df`.
df.columns

Index(['CTLG_PROD_ID', 'SIT_SITE_ID', 'ITE_ITEM_ID', 'ITE_BASE_CURRENT_PRICE',
       'ITE_STATUS', 'CAT_CATEG_ID_L7', 'CAT_CATEG_ID_L1'],
      dtype='object')

In [5]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,CTLG_PROD_ID,SIT_SITE_ID,ITE_ITEM_ID,ITE_BASE_CURRENT_PRICE,ITE_STATUS,CAT_CATEG_ID_L7,CAT_CATEG_ID_L1
0,15914455.0,MLB,1730834725,209.8,active,196208,1000
1,,MLB,1932231953,80.5,active,1672,1648
2,16255367.0,MLB,1940727918,77.31,active,6777,1648
3,14727339.0,MLB,1756695472,13.38,active,1714,1648
4,6084026.0,MLB,1900682550,28.39,active,186456,1144


In [6]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

CTLG_PROD_ID              float64
SIT_SITE_ID                object
ITE_ITEM_ID                 int64
ITE_BASE_CURRENT_PRICE    float64
ITE_STATUS                 object
CAT_CATEG_ID_L7             int64
CAT_CATEG_ID_L1             int64
dtype: object

## Drop rows with a missing CTLG_PROD_ID

In [7]:
# Number of rows whose CTLG_PROD_ID is missing (before dropping them).
countNaN = pd.isna(df['CTLG_PROD_ID']).sum()
countNaN

3277

In [8]:
# Remove every row that has no CTLG_PROD_ID.
# NOTE(review): inplace=True mutates `df` itself, so the dropped rows can only
# be recovered by re-reading the CSV.
df.dropna(subset = ['CTLG_PROD_ID'], inplace=True)

In [9]:
# Re-count missing CTLG_PROD_ID values to confirm the drop left none behind.
countNaN = pd.isna(df['CTLG_PROD_ID']).sum()
countNaN

0

## Cast CTLG_PROD_ID to integer

In [10]:
# CTLG_PROD_ID is float64 after loading; with the missing values already
# dropped it can be stored as an integer id.
# Use an explicit 64-bit width: the bare 'int' alias is platform-dependent
# (32-bit on Windows with NumPy < 2), whereas the orders table this id is later
# joined against stores CTLG_PROD_ID as int64.
df['CTLG_PROD_ID'] = df['CTLG_PROD_ID'].astype('int64')

In [11]:
# Confirm the new dtype of the id column.
df['CTLG_PROD_ID'].dtype

dtype('int64')

## Descriptive statistics of prices per product

In [12]:
# Group the listings by catalog product id, then summarise the price column of
# each group (count, mean, std, min, quartiles, max). The result is indexed by
# CTLG_PROD_ID.
gb = df.groupby(by=['CTLG_PROD_ID'])
df_d = gb['ITE_BASE_CURRENT_PRICE'].describe()
df_d

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CTLG_PROD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
633,1.0,66.890000,,66.89,66.89,66.89,66.890,66.89
5020,1.0,43.910000,,43.91,43.91,43.91,43.910,43.91
11438,1.0,931.850000,,931.85,931.85,931.85,931.850,931.85
101022,1.0,432.520000,,432.52,432.52,432.52,432.520,432.52
802465,27.0,153.065556,18.054667,131.70,141.94,148.65,156.525,201.61
...,...,...,...,...,...,...,...,...
18209455,13.0,598.552308,195.217031,362.71,362.71,643.31,710.210,1008.65
18209881,1.0,248.630000,,248.63,248.63,248.63,248.630,248.63
18213136,1.0,187.130000,,187.13,187.13,187.13,187.130,187.13
18217813,3.0,192.546667,38.656023,163.02,170.67,178.32,207.310,236.30


## Drop rows with NaN std

In [13]:
# Number of products whose price std is NaN in the per-product summary.
countNaN = df_d['std'].isnull().sum()
countNaN

2930

In [14]:
# Discard the products whose price std is NaN.
# NOTE(review): inplace=True mutates `df_d`; re-run the describe cell to get
# the full summary back.
df_d.dropna(subset = ['std'], inplace=True)

In [15]:
# Verify that no NaN std values remain after the drop.
countNaN = df_d['std'].isnull().sum()
countNaN

0

## Reset the index

In [16]:
# Move CTLG_PROD_ID out of the index into a regular column so it can be used
# as a join key and exported.
# NOTE(review): running this cell twice adds a spurious 'index' column.
df_d = df_d.reset_index(drop=False)

In [17]:
# Preview the summary now that CTLG_PROD_ID is a regular column.
df_d.head()

Unnamed: 0,CTLG_PROD_ID,count,mean,std,min,25%,50%,75%,max
0,802465,27.0,153.065556,18.054667,131.7,141.94,148.65,156.525,201.61
1,802499,11.0,63.684545,7.481744,49.25,63.455,67.19,67.19,70.14
2,802500,9.0,75.371111,7.870711,60.67,70.6,77.27,81.44,83.46
3,802523,33.0,44.114545,7.16492,31.19,38.64,45.46,47.61,60.94
4,802544,47.0,106.392979,20.900336,74.61,97.76,99.4,109.84,181.78


In [18]:
# (products kept, columns) after dropping NaN-std rows and resetting the index.
df_d.shape

(10663, 9)

## Export the product IDs

In [22]:
# Write the ids of the remaining products to products_id.csv, without the
# index column.
ID = df_d.loc[:, 'CTLG_PROD_ID']
ID.to_csv(r'products_id.csv', index=False)

## Load the orders dataset

In [23]:
# Read orders_prod_id.csv (path relative to the working directory) into `df_or`.
df_or = pd.read_csv(r'orders_prod_id.csv')

In [24]:
# Sanity check: (rows, columns) of the orders frame.
df_or.shape

(806506, 10)

In [25]:
# List the column labels available in `df_or`.
df_or.columns

Index(['ORD_ORDER_ID', 'ITE_ITEM_ID', 'CTLG_PROD_ID', 'flag_bad_user',
       'is_buybox', 'has_bpp', 'has_claim_seller', 'is_refund', 'GMV_USD',
       'bpp_cashout_final'],
      dtype='object')

In [26]:
# Inspect the inferred dtypes, including the CTLG_PROD_ID join key.
df_or.dtypes

ORD_ORDER_ID           int64
ITE_ITEM_ID            int64
CTLG_PROD_ID           int64
flag_bad_user          int64
is_buybox              int64
has_bpp                int64
has_claim_seller       int64
is_refund            float64
GMV_USD              float64
bpp_cashout_final    float64
dtype: object

In [29]:
# Show five random order rows. The seed is fixed so the preview is the same on
# every Restart & Run All instead of changing with each execution.
df_or.sample(5, random_state=42)

Unnamed: 0,ORD_ORDER_ID,ITE_ITEM_ID,CTLG_PROD_ID,flag_bad_user,is_buybox,has_bpp,has_claim_seller,is_refund,GMV_USD,bpp_cashout_final
259899,4578323778,1635255241,15963017,0,0,0,0,1.0,35.99,0.0
29293,4552848421,1869239888,6063250,0,0,0,0,0.0,294.92,0.0
698995,4547776400,1848478193,16930325,0,0,0,1,1.0,116.94,0.0
725275,4555402999,1636056668,15949972,0,0,0,0,0.0,202.23,0.0
89245,4546036489,1841865975,17459951,1,0,0,0,0.0,271.91,0.0


## Join the DataFrames by product ID

In [30]:
# Attach the per-product price statistics (df_d) to every order (df_or).
# Both frames name the key CTLG_PROD_ID, so a single `on=` is enough.
# df_d comes from a groupby on that key and should hold one row per product;
# validate='one_to_many' makes pandas raise MergeError if that ever stops being
# true, instead of silently duplicating order rows.
df_f = pd.merge(
    left=df_d,
    right=df_or,
    how='inner',
    on='CTLG_PROD_ID',
    validate='one_to_many',
)

In [31]:
# (rows, columns) of the joined frame; compare the row count with df_or.shape
# to see how many orders survived the inner join.
df_f.shape

(806506, 18)

In [33]:
# List the columns of the joined frame (price statistics followed by order fields).
df_f.columns

Index(['CTLG_PROD_ID', 'count', 'mean', 'std', 'min', '25%', '50%', '75%',
       'max', 'ORD_ORDER_ID', 'ITE_ITEM_ID', 'flag_bad_user', 'is_buybox',
       'has_bpp', 'has_claim_seller', 'is_refund', 'GMV_USD',
       'bpp_cashout_final'],
      dtype='object')

In [35]:
# Show five random joined rows. The seed is fixed so the preview is the same on
# every Restart & Run All instead of changing with each execution.
df_f.sample(5, random_state=42)

Unnamed: 0,CTLG_PROD_ID,count,mean,std,min,25%,50%,75%,max,ORD_ORDER_ID,ITE_ITEM_ID,flag_bad_user,is_buybox,has_bpp,has_claim_seller,is_refund,GMV_USD,bpp_cashout_final
615221,16002640,2.0,229.08,0.0,229.08,229.08,229.08,229.08,229.08,4562398216,1679415510,0,1,0,0,0.0,215.45,0.0
393381,15268423,2.0,251.97,0.0,251.97,251.97,251.97,251.97,251.97,4598276352,1867380639,0,0,0,0,0.0,157.81,0.0
600307,15984002,481.0,73.059168,20.926095,55.07,65.01,69.72,75.27,262.33,4556858951,1732806578,0,0,0,0,0.0,63.19,0.0
284465,13841393,34.0,67.935882,11.109042,51.49,60.3375,68.03,72.9325,100.89,4605198644,1871889698,0,1,1,1,1.0,57.28,0.0
14754,6064933,5.0,24.192,1.658454,22.21,22.98,24.38,24.97,26.42,4595749597,1817149601,0,1,0,0,0.0,12.98,0.0
