# <font color='#F37126'> Análise descritiva dos dados </font>

## 0.0. Imports

In [1]:
import os

import pandas as pd
import numpy  as np

from sqlalchemy import create_engine
from IPython.display import Image

## 0.1. Coleta dos dados

In [2]:
# Location of the SQLite database that holds the scraped data.
# The project directory can be overridden with the HM_PROJECT_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original directory is used and behaviour is unchanged.
path = os.environ.get( 'HM_PROJECT_PATH', '/home/judson/Documents/repos/hm_project/' )
database_name = 'database_hm.sqlite'

# os.path.join only inserts a separator when `path` does not already end with
# one, so an override given without a trailing slash still yields a valid URL.
conn = create_engine( 'sqlite:///' + os.path.join( path, database_name ), echo=False )

In [3]:
# Select every column of every row stored in the `vitrine` table.
query = (
    '\n'
    '    SELECT * FROM vitrine\n'
)

In [4]:
# Execute the query on the database connection and load the result set
# into a DataFrame that is kept as the untouched raw copy.
df_raw = pd.read_sql( sql=query, con=conn )

In [9]:
# Preview the last five rows of the raw data.
df_raw.tail( 5 )

Unnamed: 0,product_id,style_id,color_id,product_name,color_name,fit,price,size_number,size_model,cotton,polyester,spandex,elasterell,scrapy_datetime
46,938875007,938875,7,slim_tapered_jeans,black,slim_fit,39.99,188.0,31/32,1.0,0.0,0.01,0.0,2021-12-28 20:27:03
47,938875008,938875,8,slim_tapered_jeans,denim_blue,slim_fit,29.99,187.0,31/32,1.0,0.0,0.01,0.0,2021-12-28 20:27:03
48,974597006,974597,6,slim_tapered_jeans,dark_gray,slim_fit,29.99,,,1.0,0.0,0.02,0.0,2021-12-28 20:27:03
49,974597001,974597,1,slim_tapered_jeans,denim_blue,slim_fit,29.99,186.0,31/32,1.0,0.0,0.01,0.0,2021-12-28 20:27:03
50,1028865001,1028865,1,relaxed_jeans_with_embroidery_detail,light_denim_blue/smiley®,relaxed_fit,49.99,186.0,,1.0,0.65,0.0,0.0,2021-12-28 20:27:03


# 1.0. Passo 01 - Descrição dos dados

In [6]:
# Work on an independent (deep) copy so that df_raw stays untouched.
df01 = df_raw.copy( deep=True )

## 1.1. Dimensão dos dados

In [8]:
# Report the size of the dataset: shape is (number of rows, number of columns).
n_rows, n_cols = df01.shape
print( f'Number of Rows: {n_rows}' )
print( f'Number of Cols: {n_cols}' )

Number of Rows: 51
Number of Cols: 14


## 1.2. Tipo dos dados 

In [54]:
# Inspect the dtype pandas assigned to each column as loaded from the database.
df01.dtypes

product_id          object
style_id            object
color_id            object
product_name        object
color_name          object
fit                 object
price              float64
size_number         object
size_model          object
cotton             float64
polyester          float64
spandex            float64
elasterell         float64
scrapy_datetime     object
dtype: object

In [55]:
# scrapy_datetime is loaded as a plain object column; parse it into
# datetime64 values and store the result back in the same column.
df01['scrapy_datetime'] = df01['scrapy_datetime'].pipe( pd.to_datetime )

In [56]:
# Re-check the dtypes after the scrapy_datetime conversion.
df01.dtypes

product_id                 object
style_id                   object
color_id                   object
product_name               object
color_name                 object
fit                        object
price                     float64
size_number                object
size_model                 object
cotton                    float64
polyester                 float64
spandex                   float64
elasterell                float64
scrapy_datetime    datetime64[ns]
dtype: object

## 1.3. Identificação de dados faltantes

In [57]:
# Absolute number of missing values in each column.
df01.isna().sum( axis=0 )

product_id          0
style_id            0
color_id            0
product_name        0
color_name          0
fit                 0
price               0
size_number        18
size_model         24
cotton              0
polyester           0
spandex             0
elasterell          0
scrapy_datetime     0
dtype: int64

In [58]:
# Fraction of rows with a missing value in each column: the mean of the
# boolean mask equals the count of missing values divided by the row count.
df01.isna().mean()

product_id         0.000000
style_id           0.000000
color_id           0.000000
product_name       0.000000
color_name         0.000000
fit                0.000000
price              0.000000
size_number        0.352941
size_model         0.470588
cotton             0.000000
polyester          0.000000
spandex            0.000000
elasterell         0.000000
scrapy_datetime    0.000000
dtype: float64

## 1.4. Tratamento de dados faltantes

In [59]:
# size_number and size_model are the only columns with missing values, so
# both are removed; the trailing dropna() then discards any row that would
# still contain a missing value.
# NOTE(review): df_01 (with underscore) is a different frame from df01.
df_01 = df01.drop( ['size_number', 'size_model'], axis=1 ).dropna( axis=0, how='any' )
df_01.shape

(51, 12)

## 1.5. Descrição dos dados

In [60]:
# Split the columns by dtype: int64/float64 columns are the numerical
# attributes; everything else except the datetime column is categorical.
numeric_dtypes = ['int64', 'float64']
num_attributes = df_01.select_dtypes( include=numeric_dtypes )
cat_attributes = df_01.select_dtypes( exclude=numeric_dtypes + ['datetime64[ns]'] )

### 1.5.1. Dados numéricos

In [61]:
# One summary statistic per output column, each applied column by column
# to the numerical attributes. The keys are the final column names.
stat_funcs = {
    # dispersion - extremes and range
    'min':      np.min,
    'max':      np.max,
    'range':    lambda col: col.max() - col.min(),
    # central tendency - mean and median
    'media':    np.mean,
    'mediana':  np.median,
    # dispersion - std, skewness, kurtosis
    # NOTE(review): np.std defaults to ddof=0 (population std), so this value
    # may differ from DataFrame.describe() - confirm this is intended.
    'std':      np.std,
    'skew':     lambda col: col.skew(),
    'kurtosis': lambda col: col.kurtosis(),
}

# Each apply() returns a Series indexed by attribute name; assembling them
# in a dict gives one row per attribute and one column per statistic.
m1 = pd.DataFrame( { stat: num_attributes.apply( func ) for stat, func in stat_funcs.items() } )
m1 = m1.rename_axis( 'attributes' ).reset_index()
m1

Unnamed: 0,attributes,min,max,range,media,mediana,std,skew,kurtosis
0,price,14.99,49.99,35.0,30.127255,29.99,9.777966,0.438422,-0.848709
1,cotton,0.77,1.0,0.23,0.957843,0.99,0.070747,-1.890872,2.177997
2,polyester,0.0,1.0,1.0,0.272745,0.0,0.3143,0.529472,-1.446507
3,spandex,0.0,0.02,0.02,0.013333,0.01,0.006157,-0.365544,-0.607877
4,elasterell,0.0,0.08,0.08,0.009412,0.0,0.025775,2.446002,4.143991
