![](https://opengraph.githubassets.com/88c66bdae373dffd1f339f2ed0436ef24278f269e20351f1977dfac2f17ff713/rapidsai/dask-cudf)

Dask-cuDF는 별도 프로젝트로 진행되다 현재는 cuDF 프로젝트에 통합되었다.

In [1]:
import os
import numpy as np
import cupy as cp
import pandas as pd
import cudf
import dask
import dask.array as da
import dask.dataframe as dd
import dask_cudf
import time

from dask.diagnostics import ProgressBar

# Seed CuPy's random number generator.
cp.random.seed(220919)

# Print the versions of pandas, Dask, cuDF and Dask-cuDF in use.
print(pd.__version__)
print(dask.__version__)
print(cudf.__version__)
print(dask_cudf.__version__)

1.4.2
2022.05.2
22.06.00
22.06.00


In [2]:
%%time
# Slow: pandas actually reads the whole 2.9 GB file into memory.
pdf = pd.read_csv("../loan-default-data/MidSizedData.csv")

CPU times: user 25.8 s, sys: 3.15 s, total: 29 s
Wall time: 29 s


In [4]:
%%time
# Reading the same file on the GPU with cuDF is very fast.
cdf = cudf.read_csv("../loan-default-data/MidSizedData.csv")

CPU times: user 695 ms, sys: 301 ms, total: 996 ms
Wall time: 1.01 s


In [5]:
%%time
# Dask-cuDF returns almost immediately because this is a lazy operation that
# only builds the task graph; no data is read yet.
# NOTE(review): the DataFrame repr printed later in this notebook shows
# npartitions=12 for this file, so `npartitions=64` does not seem to take
# effect and the partitions are not 25 MB each -- confirm against the
# dask_cudf.read_csv documentation for this version.
dcdf = dask_cudf.read_csv("../loan-default-data/MidSizedData.csv", npartitions=64)  # intended: 25MB chunks

CPU times: user 14.4 ms, sys: 7.1 ms, total: 21.5 ms
Wall time: 20.7 ms


In [8]:
%time pdf.sort_values('cust_id').head()
%time cdf.sort_values('cust_id').head()
%time dcdf.sort_values('cust_id').head()

CPU times: user 3.69 s, sys: 440 ms, total: 4.13 s
Wall time: 4.12 s
CPU times: user 43.2 ms, sys: 27.6 ms, total: 70.8 ms
Wall time: 72.4 ms
CPU times: user 1.96 s, sys: 2.04 s, total: 4 s
Wall time: 4.16 s


Unnamed: 0,cust_id,year,state,date_issued,date_final,emp_duration,own_type,income_type,app_type,loan_purpose,...,annual_pay,loan_amount,interest_rate,loan_duration,dti,total_pymnt,total_rec_prncp,recoveries,installment,is_default
7853,54734,2009,Haryana,01/08/2009,1102011,0.5,RENT,Low,INDIVIDUAL,debt_consolidation,...,85000,25000,11.89,36 months,19.48,29324.32,25000.0,0.0,829.1,0
895232,54734,2009,Haryana,01/08/2009,1102011,0.5,RENT,Low,INDIVIDUAL,debt_consolidation,...,85000,25000,11.89,36 months,19.48,29324.32,25000.0,0.0,829.1,0
54869,54734,2009,Haryana,01/08/2009,1102011,0.5,RENT,Low,INDIVIDUAL,debt_consolidation,...,85000,25000,11.89,36 months,19.48,29324.32,25000.0,0.0,829.1,0
942248,54734,2009,Haryana,01/08/2009,1102011,0.5,RENT,Low,INDIVIDUAL,debt_consolidation,...,85000,25000,11.89,36 months,19.48,29324.32,25000.0,0.0,829.1,0
101105,54734,2009,Haryana,01/08/2009,1102011,0.5,RENT,Low,INDIVIDUAL,debt_consolidation,...,85000,25000,11.89,36 months,19.48,29324.32,25000.0,0.0,829.1,0


In [11]:
%time pdf.state.value_counts()
%time cdf.state.value_counts()
%time dcdf.state.value_counts().compute()

CPU times: user 1.09 s, sys: 2.1 ms, total: 1.1 s
Wall time: 1.09 s
CPU times: user 14.3 ms, sys: 5.12 ms, total: 19.4 ms
Wall time: 18.2 ms
CPU times: user 885 ms, sys: 741 ms, total: 1.63 s
Wall time: 1.65 s


Chhattisgarh         708642
Sikkim               706860
Haryana              703978
Punjab               700766
Assam                699424
Goa                  699292
Madhya Pradesh       698544
Uttar Pradesh        698500
Maharashtra          698104
Himachal Pradesh     698060
Arunachal Pradesh    697466
Nagaland             697422
Andhra Pradesh       697136
Kerala               697114
Tripura              696718
Rajasthan            696718
Karnataka            695618
Manipur              695266
West Bengal          695112
Odisha               694914
Telangana            694892
Bihar                694694
Jharkhand            694100
Mizoram              693792
Uttarakhand          693792
Gujarat              693792
Tamil Nadu           693374
Meghalaya            688248
Name: state, dtype: int64

In [12]:
# Try `persist()` to pin the Dask DataFrame in memory and check whether that
# makes the queries faster.
# NOTE(review): the original note claimed that Dask-cuDF applies `persist()`
# in place even when the return value is discarded. Dask documents `persist()`
# as returning a new collection, so `dcdf` itself presumably stays lazy here;
# the usual form is `dcdf = dcdf.persist()` -- confirm.
dcdf.persist()

Unnamed: 0_level_0,cust_id,year,state,date_issued,date_final,emp_duration,own_type,income_type,app_type,loan_purpose,interest_payments,grade,annual_pay,loan_amount,interest_rate,loan_duration,dti,total_pymnt,total_rec_prncp,recoveries,installment,is_default
npartitions=12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,int64,int64,object,object,int64,float64,object,object,object,object,object,object,int64,int64,float64,object,float64,float64,float64,float64,float64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [14]:
%time pdf.state.value_counts()
%time cdf.state.value_counts()
%time dcdf.state.value_counts().compute()

CPU times: user 1.02 s, sys: 4.75 ms, total: 1.02 s
Wall time: 1.01 s
CPU times: user 21.4 ms, sys: 6.73 ms, total: 28.1 ms
Wall time: 27.2 ms
CPU times: user 923 ms, sys: 760 ms, total: 1.68 s
Wall time: 1.73 s


Chhattisgarh         708642
Sikkim               706860
Haryana              703978
Punjab               700766
Assam                699424
Goa                  699292
Madhya Pradesh       698544
Uttar Pradesh        698500
Maharashtra          698104
Himachal Pradesh     698060
Arunachal Pradesh    697466
Nagaland             697422
Andhra Pradesh       697136
Kerala               697114
Tripura              696718
Rajasthan            696718
Karnataka            695618
Manipur              695266
West Bengal          695112
Odisha               694914
Telangana            694892
Bihar                694694
Jharkhand            694100
Mizoram              693792
Uttarakhand          693792
Gujarat              693792
Tamil Nadu           693374
Meghalaya            688248
Name: state, dtype: int64

`persist()`로 메모리에 고정해도 속도가 빨라지지 않는다.  
`LocalCUDACluster`를 구동해본다. GPU 개수만큼 워커 4개가 자동으로 실행된다.

In [2]:
import time

from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Start a local Dask-CUDA cluster and connect a distributed client to it.
# NOTE(review): ip='0.0.0.0' presumably makes the scheduler listen on all
# network interfaces -- confirm this exposure is intended.
cluster = LocalCUDACluster(ip='0.0.0.0')
client = Client(cluster)

2022-11-01 09:22:24,058 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-11-01 09:22:24,107 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-11-01 09:22:24,163 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-11-01 09:22:24,165 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [20]:
%time pdf.state.value_counts()
%time cdf.state.value_counts()
%time dcdf.state.value_counts().compute()

CPU times: user 1.01 s, sys: 5.63 ms, total: 1.02 s
Wall time: 1 s
CPU times: user 33.8 ms, sys: 6.99 ms, total: 40.8 ms
Wall time: 30.9 ms
CPU times: user 53.4 ms, sys: 14.3 ms, total: 67.7 ms
Wall time: 763 ms


Chhattisgarh         708642
Sikkim               706860
Haryana              703978
Punjab               700766
Assam                699424
Goa                  699292
Madhya Pradesh       698544
Uttar Pradesh        698500
Maharashtra          698104
Himachal Pradesh     698060
Arunachal Pradesh    697466
Nagaland             697422
Andhra Pradesh       697136
Kerala               697114
Tripura              696718
Rajasthan            696718
Karnataka            695618
Manipur              695266
West Bengal          695112
Odisha               694914
Telangana            694892
Bihar                694694
Jharkhand            694100
Mizoram              693792
Uttarakhand          693792
Gujarat              693792
Tamil Nadu           693374
Meghalaya            688248
Name: state, dtype: int64

LocalCUDACluster 구동 후 2배 이상 속도가 빨라졌으나 그래도 기대에 미치지 못한다. 대시보드에서 프로파일링 해보면 read-csv 태스크가 대부분을 차지한다. 

![](https://user-images.githubusercontent.com/1250095/198995247-bfc9571e-627e-43d2-aecd-d71292597b95.png)

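그렇다면 `persist()`로 메모리에 고정시켜 read-csv가 필요 없도록 하면 속도를 더 높일 수 있을 것 같다. 여기서는 `persist()`의 반환값을 변수에 받지 않고 호출해도 이후 연산이 빨라진다. 다만 `persist()`는 in-place 연산이 아니라 새 컬렉션을 반환하며, 분산 스케줄러가 같은 키의 결과를 워커 메모리에 유지하고 있어 기존 `dcdf`가 이를 재사용하는 것으로 보인다. 일반적으로는 `dcdf = dcdf.persist()`처럼 반환값을 받아 쓰는 것이 안전하다. 먼저 실행하기 전 메모리 현황은 다음과 같다.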

![](https://user-images.githubusercontent.com/1250095/198998857-f96b8cf1-6bb5-4860-a076-b555a2f4d8a5.png)

이제 `persist()`를 다음과 같이 실행한다.

In [None]:
dcdf.persist()

In [23]:
%time pdf.state.value_counts()
%time cdf.state.value_counts()
%time dcdf.state.value_counts().compute()

CPU times: user 1.12 s, sys: 14.9 ms, total: 1.13 s
Wall time: 1.12 s
CPU times: user 34.6 ms, sys: 3.52 ms, total: 38.1 ms
Wall time: 33.3 ms
CPU times: user 22.9 ms, sys: 953 µs, total: 23.8 ms
Wall time: 77.4 ms


Chhattisgarh         708642
Sikkim               706860
Haryana              703978
Punjab               700766
Assam                699424
Goa                  699292
Madhya Pradesh       698544
Uttar Pradesh        698500
Maharashtra          698104
Himachal Pradesh     698060
Arunachal Pradesh    697466
Nagaland             697422
Andhra Pradesh       697136
Kerala               697114
Tripura              696718
Rajasthan            696718
Karnataka            695618
Manipur              695266
West Bengal          695112
Odisha               694914
Telangana            694892
Bihar                694694
Jharkhand            694100
Mizoram              693792
Uttarakhand          693792
Gujarat              693792
Tamil Nadu           693374
Meghalaya            688248
Name: state, dtype: int64

Dask-cuDF의 속도가 약 10배 빨라졌다(763 ms → 77.4 ms). 대신 GPU 메모리를 계속 점유하는 것을 확인할 수 있다.
![](https://user-images.githubusercontent.com/1250095/198998819-74ff1ac1-0dfb-43f1-88ba-7a5c2511543e.png)

In [24]:
%time pdf.state.nunique()
%time cdf.state.nunique()
%time dcdf.state.nunique().compute()

CPU times: user 996 ms, sys: 12.9 ms, total: 1.01 s
Wall time: 976 ms
CPU times: user 9.11 ms, sys: 2.92 ms, total: 12 ms
Wall time: 11.7 ms
CPU times: user 27.8 ms, sys: 5.55 ms, total: 33.4 ms
Wall time: 128 ms


28

In [25]:
%time pdf.groupby('state').total_pymnt.mean()
%time cdf.groupby('state').total_pymnt.mean()
%time dcdf.groupby('state').total_pymnt.mean().compute()

CPU times: user 1.23 s, sys: 40 ms, total: 1.27 s
Wall time: 1.23 s
CPU times: user 28.1 ms, sys: 3.21 ms, total: 31.3 ms
Wall time: 30.2 ms
CPU times: user 29.3 ms, sys: 4.73 ms, total: 34 ms
Wall time: 96.9 ms


state
Mizoram              7563.220400
Odisha               7617.385990
Karnataka            7499.055526
Uttarakhand          7595.781252
Tamil Nadu           7547.376405
Jharkhand            7528.867289
Meghalaya            7626.587164
Himachal Pradesh     7532.032232
Andhra Pradesh       7539.887278
Telangana            7539.608749
Punjab               7551.748058
Goa                  7567.639128
Maharashtra          7595.657335
Tripura              7560.031309
Chhattisgarh         7555.550607
Kerala               7605.894841
Assam                7483.791068
Arunachal Pradesh    7571.114449
Nagaland             7685.616803
Haryana              7508.253666
Uttar Pradesh        7545.473138
Sikkim               7587.741911
Gujarat              7530.040670
Madhya Pradesh       7542.526326
Bihar                7542.281996
West Bengal          7554.332179
Rajasthan            7541.678963
Manipur              7528.928218
Name: total_pymnt, dtype: float64

In [26]:
def add(x):
    """Return ``x`` increased by five."""
    increment = 5
    return x + increment
%time pdf['total_pymnt'].apply(add)
%time cdf['total_pymnt'].apply(add)
%time dcdf['total_pymnt'].compute().apply(add)

CPU times: user 3.58 s, sys: 386 ms, total: 3.97 s
Wall time: 3.92 s
CPU times: user 387 ms, sys: 22 ms, total: 409 ms
Wall time: 394 ms
CPU times: user 114 ms, sys: 113 ms, total: 227 ms
Wall time: 355 ms


0         13655.38
1          1668.04
2          1236.38
3          5205.44
4          5570.65
            ...   
504483        5.00
504484        5.00
504485      252.39
504486        5.00
504487        5.00
Name: total_pymnt, Length: 19522338, dtype: float64

In [27]:
%time pdf['state'].unique()[0].lower()
%time cdf['state'].unique()[0].lower()
%time dcdf['state'].unique().compute()[0].lower()

CPU times: user 990 ms, sys: 21.3 ms, total: 1.01 s
Wall time: 975 ms
CPU times: user 168 ms, sys: 6.94 ms, total: 175 ms
Wall time: 170 ms
CPU times: user 28.2 ms, sys: 4.19 ms, total: 32.4 ms
Wall time: 106 ms


'andhra pradesh'

# Super-Sized Data 테스트

Dask와 달리 Dask-cuDF는 항상 `persist()`한 결과가 월등히 뛰어나다.

In [6]:
# SuperSizedData.csv is 129 GB.
# NOTE(review): the repr printed after `persist()` reports npartitions=514,
# i.e. roughly 250 MB per partition, so `blocksize=25e6` does not seem to be
# honored by this dask_cudf version -- confirm the chunk-size parameter name.
dcdf = dask_cudf.read_csv("../loan-default-data/SuperSizedData.csv", blocksize=25e6)  # intended: 25MB chunks

In [9]:
%time dcdf.state.value_counts().compute()
%time dcdf.state.nunique().compute()
%time dcdf.query('cust_id > 83000').head()
%time dcdf.groupby('state').total_pymnt.mean().compute()
%time dcdf['state'].unique().compute()[0].lower()

CPU times: user 3.59 s, sys: 607 ms, total: 4.19 s
Wall time: 29.2 s
CPU times: user 3.63 s, sys: 555 ms, total: 4.18 s
Wall time: 31.8 s
CPU times: user 540 ms, sys: 71.7 ms, total: 612 ms
Wall time: 1.2 s
CPU times: user 4.12 s, sys: 531 ms, total: 4.65 s
Wall time: 30.7 s
CPU times: user 3.81 s, sys: 483 ms, total: 4.3 s
Wall time: 31.5 s


'andhra pradesh'

In [11]:
dcdf.persist()

Unnamed: 0_level_0,cust_id,year,state,date_issued,date_final,emp_duration,own_type,income_type,app_type,loan_purpose,interest_payments,grade,annual_pay,loan_amount,interest_rate,loan_duration,dti,total_pymnt,total_rec_prncp,recoveries,installment,is_default
npartitions=514,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,int64,int64,object,object,int64,float64,object,object,object,object,object,object,int64,int64,float64,object,float64,float64,float64,float64,float64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [12]:
%time dcdf.state.value_counts().compute()
%time dcdf.state.nunique().compute()
%time dcdf.query('cust_id > 83000').head()
%time dcdf.groupby('state').total_pymnt.mean().compute()
%time dcdf['state'].unique().compute()[0].lower()

CPU times: user 420 ms, sys: 48.2 ms, total: 468 ms
Wall time: 1.73 s
CPU times: user 877 ms, sys: 77.5 ms, total: 955 ms
Wall time: 3.45 s
CPU times: user 39.7 ms, sys: 0 ns, total: 39.7 ms
Wall time: 73.3 ms
CPU times: user 1.1 s, sys: 108 ms, total: 1.21 s
Wall time: 2.74 s
CPU times: user 698 ms, sys: 89.7 ms, total: 787 ms
Wall time: 2.77 s


'andhra pradesh'

# Hyper-Sized Data 테스트

전체 메모리에 다 올라가지 않을 만큼 큰 데이터는 `persist()` 이후에도 Super-Sized Data만큼의 속도 개선 효과는 없다. 그래도 OOM 에러가 나지 않고(자동으로 일부 비율만큼만 올리는 것으로 보임) `persist()` 전보다 약 3배 빠른 속도를 보인다(2분 59초 → 54.9초).

In [3]:
# HyperSizedData.csv is 296 GB.
# NOTE(review): the repr printed after `persist()` reports npartitions=1181,
# i.e. roughly 250 MB per partition, so `blocksize=25e6` does not seem to be
# honored by this dask_cudf version -- confirm the chunk-size parameter name.
dcdf = dask_cudf.read_csv("../loan-default-data/HyperSizedData.csv", blocksize=25e6)  # intended: 25MB chunks

In [4]:
%time dcdf.state.value_counts().compute()

CPU times: user 17.1 s, sys: 3.09 s, total: 20.2 s
Wall time: 2min 59s


Chhattisgarh         74117511
Sikkim               73931130
Haryana              73629699
Punjab               73293753
Assam                73153392
Goa                  73139586
Madhya Pradesh       73061352
Uttar Pradesh        73056750
Maharashtra          73015332
Himachal Pradesh     73010730
Arunachal Pradesh    72948603
Nagaland             72944001
Andhra Pradesh       72914088
Kerala               72911787
Rajasthan            72870369
Tripura              72870369
Karnataka            72755319
Manipur              72718503
West Bengal          72702396
Odisha               72681687
Telangana            72679386
Bihar                72658677
Jharkhand            72596550
Mizoram              72564336
Uttarakhand          72564336
Gujarat              72564336
Tamil Nadu           72520617
Meghalaya            71984484
Name: state, dtype: int64

In [5]:
dcdf.persist()

Unnamed: 0_level_0,cust_id,year,state,date_issued,date_final,emp_duration,own_type,income_type,app_type,loan_purpose,interest_payments,grade,annual_pay,loan_amount,interest_rate,loan_duration,dti,total_pymnt,total_rec_prncp,recoveries,installment,is_default
npartitions=1181,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,int64,int64,object,object,int64,float64,object,object,object,object,object,object,int64,int64,float64,object,float64,float64,float64,float64,float64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [7]:
%time dcdf.state.value_counts().compute()
%time dcdf.state.nunique().compute()
%time dcdf.query('cust_id > 83000').head()
%time dcdf.groupby('state').total_pymnt.mean().compute()
%time dcdf['state'].unique().compute()[0].lower()

CPU times: user 12.3 s, sys: 492 ms, total: 12.8 s
Wall time: 54.9 s
CPU times: user 12.8 s, sys: 557 ms, total: 13.3 s
Wall time: 57.2 s
CPU times: user 574 ms, sys: 66.9 ms, total: 640 ms
Wall time: 1.41 s
CPU times: user 13 s, sys: 617 ms, total: 13.6 s
Wall time: 55.4 s
CPU times: user 11.4 s, sys: 551 ms, total: 12 s
Wall time: 52.4 s


'andhra pradesh'

# Conclusion

Dask는 LocalCluster를 구성하면 대부분의 Operation에서 Pandas보다 더 빠른 속도를 보인다. 그러나 Dask-cuDF는 LocalCUDACluster를 구동해도 Single GPU만 사용하는 cuDF에 비해 항상 빠르지는 않다. 하나의 GPU에도 코어가 충분히 많다 보니 분산에 한계가 있어 현재 파일 크기 정도로는 더 이상 속도를 높이지 못하는 듯하다. 대신 Dask-cuDF는 단일 GPU 메모리의 최대 한계인 80 GB를 극복할 수 있고, 매우 큰 파일도 `persist()`로 메모리에 고정할 수 있으며, CPU에 비해서는 월등히 빠른 속도를 보인다.