# Dask

In [17]:
import pandas as pd
import time as tm
from dask import dataframe as dd

In [15]:
# Baseline measurement: load the whole CSV eagerly with plain pandas.
# perf_counter() is the clock meant for measuring elapsed intervals (monotonic,
# high resolution); time() follows the system wall clock, which can be adjusted
# while the cell is running and distort the reported duration.
inicio = tm.perf_counter()
vendas = pd.read_csv("2019-Oct.csv")
fim    = tm.perf_counter()

print("Duração:", round(fim - inicio, 2), "segundos, sem usar o dask")

Duração: 3.65 segundos, sem usar o dask


In [6]:
# Summarise columns, non-null counts and dtypes; "deep" asks pandas to inspect
# the contents of object columns instead of estimating their memory footprint.
vendas.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424487 entries, 0 to 424486
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     424487 non-null  object 
 1   event_type     424487 non-null  object 
 2   product_id     424487 non-null  int64  
 3   category_id    424487 non-null  int64  
 4   category_code  289544 non-null  object 
 5   brand          363069 non-null  object 
 6   price          424487 non-null  float64
 7   user_id        424487 non-null  int64  
 8   user_session   424487 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 157.4 MB


In [7]:
# Break the frame's memory footprint down by dtype family to see which kind of
# column is responsible for most of it.
for tipos in ('float', 'int64', 'object'):
    selecao_tipos = vendas.select_dtypes(include=[tipos])

    # Per-column byte counts (deep=True inspects object values), then the total.
    uso_por_coluna = selecao_tipos.memory_usage(deep=True)
    total_usado_b = uso_por_coluna.sum()

    # Bytes -> mebibytes, rounded to two decimals for display.
    total_usado_mb = round(total_usado_b / 1024 ** 2, 2)
    print(f"Total de memória utilizada por {tipos}: {total_usado_mb} MB")

Total de memória utilizada por float: 3.24 MB
Total de memória utilizada por int64: 9.72 MB
Total de memória utilizada por object: 144.48 MB


## Mudando tipos de dados para reduzir uso de memória

In [8]:
# Re-encode every object (string) column as a pandas categorical, in place on
# `vendas`, and report the frame's new total size.
# NOTE(review): the recorded result (98.85 MB, down from 157.4 MB) is a modest
# gain; event_time holds timestamp strings and user_session looks like a
# per-session UUID, so those two columns are probably poor candidates for
# 'category' - confirm before relying on this conversion.
lista_tipo_object = vendas.select_dtypes(include='object').columns

vendas[lista_tipo_object] = (
    vendas[lista_tipo_object]
    .astype('category')
)

# Total bytes across all columns, converted to mebibytes.
tamanho_mb = vendas.memory_usage(deep=True).sum() / 1024 ** 2
print("Tamanho:", round(tamanho_mb, 2), 'MB')

Tamanho: 98.85 MB


## Trabalhando com DASK

In [9]:
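# One-time setup (uncomment if dask is not installed). %pip targets the running
# kernel's environment; the version is pinned to the one in the recorded output.
# %pip install "dask[dataframe]==2021.6.0"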

Collecting dask[dataframe]
  Using cached dask-2021.6.0-py3-none-any.whl (965 kB)
Collecting cloudpickle>=1.1.1
  Using cached cloudpickle-1.6.0-py3-none-any.whl (23 kB)
Collecting partd>=0.3.10
  Using cached partd-1.2.0-py3-none-any.whl (19 kB)
Collecting pyyaml
  Using cached PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
Collecting fsspec>=0.6.0
  Using cached fsspec-2021.6.0-py3-none-any.whl (114 kB)
Collecting locket
  Using cached locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
Installing collected packages: locket, pyyaml, partd, fsspec, cloudpickle, dask
Successfully installed cloudpickle-1.6.0 dask-2021.6.0 fsspec-2021.6.0 locket-0.2.1 partd-1.2.0 pyyaml-5.4.1


In [24]:
# Time the same load through Dask. dd.read_csv only builds a lazy collection;
# compute() executes it and returns a pandas DataFrame (see the info() output).
# perf_counter() is used instead of time() because it is the clock intended for
# measuring elapsed intervals and is unaffected by system clock adjustments.
# NOTE(review): the recorded repr of dd_vendas shows npartitions=1, i.e. the
# 100 MB blocksize produced a single partition for this file, so num_workers=8
# has nothing to parallelise. A smaller blocksize would be needed to compare a
# parallel read - verify the resulting index/dtypes before changing it.
inicio = tm.perf_counter()
dd_vendas = dd.read_csv("2019-Oct.csv", blocksize=100 * 1024 * 1024)  # 100 MB blocks
df_vendas = dd_vendas.compute(num_workers=8)
fim    = tm.perf_counter()

print("Duração:", round(fim - inicio, 2), "segundos, usando DASK")
df_vendas.info(memory_usage='deep')

Duração: 4.66 segundos, usando DASK
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424487 entries, 0 to 424486
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     424487 non-null  object 
 1   event_type     424487 non-null  object 
 2   product_id     424487 non-null  int64  
 3   category_id    424487 non-null  int64  
 4   category_code  289544 non-null  object 
 5   brand          363069 non-null  object 
 6   price          424487 non-null  float64
 7   user_id        424487 non-null  int64  
 8   user_session   424487 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 157.4 MB


In [25]:
# Display the lazy Dask collection itself: the recorded output lists only the
# column names, dtypes and partition count - no row values.
dd_vendas

Unnamed: 0_level_0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,object,object,int64,int64,object,object,float64,int64,object
,...,...,...,...,...,...,...,...,...


In [27]:
# Peek at the first three rows of the computed pandas frame.
df_vendas.head(3)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:18 UTC,view,1801995,2053013554415534427,electronics.video.tv,haier,193.03,537192226,e3151795-c355-4efa-acf6-e1fe1bebeee5
1,2019-10-01 00:00:36 UTC,view,3600575,2053013563810775923,appliances.kitchen.washer,hotpoint-ariston,275.37,554754045,bd0302ef-c5ca-4b6a-b916-95cc2840c72c
2,2019-10-01 00:01:58 UTC,view,4100274,2053013561218695907,,microsoft,275.4,519885473,b70cb218-db90-4011-b582-0bd237109df1


In [31]:
# Peek at the last three rows of the computed pandas frame.
df_vendas.tail(3)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
424484,2019-10-31 23:58:50 UTC,view,1306421,2053013558920217191,computers.notebook,hp,514.56,530857208,c69f9c63-7098-426c-97d9-8cb94a0a1083
424485,2019-10-31 23:59:01 UTC,view,2702481,2053013563911439225,appliances.kitchen.refrigerators,midea,143.89,559215873,26ca9bd8-21bd-47b9-b227-b6a083bde2a7
424486,2019-10-31 23:59:40 UTC,view,1004566,2053013555631882655,electronics.smartphone,huawei,164.84,566265908,52c2c76c-b79e-4794-86ff-badc76d35f5a


In [34]:
# (rows, columns) of the computed frame.
df_vendas.shape

(424487, 9)