In [15]:
import pandas as pd
import numpy as np
import string 
import random 
import time
import dask.dataframe

In [10]:
# display the current Unix timestamp (seconds since the epoch) as the cell output
time.time()

1585883810.618082

# small data

## 00_Generating a CSV file with random data ~75 MB

In [4]:
# number of rows for the small CSV file (1M)
N = 1000000

# seeded generator so that re-running the notebook regenerates the same data
rng = np.random.default_rng(42)

# pandas dataframe (df) with 7 integer columns named 'A' to 'G' and N rows,
# filled with random integers in the half-open range [9, 999)
df = pd.DataFrame(
    rng.integers(9, 999, size=(N, 7)),
    columns=list('ABCDEFG'))

# one column 'H' of float type drawn from the uniform distribution on [0, 1)
df['H'] = rng.random(N)

# two additional columns 'I' and 'J' with random 10-character alphanumeric strings.
# pd.util.testing.rands_array was deprecated in pandas 1.0 and removed in pandas 2.0,
# so the strings are built with numpy directly: draw N * STRING_LENGTH single
# characters and reinterpret every run of STRING_LENGTH characters as one string.
STRING_LENGTH = 10
alphabet = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1))
for column in ('I', 'J'):
    df[column] = rng.choice(alphabet, size=N * STRING_LENGTH).view((np.str_, STRING_LENGTH))

# expect: 1M rows x 10 columns
assert df.shape == (N, 10), df.shape
df.shape

(1000000, 10)

In [21]:
# write the generated dataframe to disk as CSV (~75MB), leaving out the row index
df.to_csv(path_or_buf="test_data_75MB.csv", index=False)

## 01_pandas_read

In [14]:
# time a plain pandas read of the whole CSV file.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is the wall clock and can jump if the system clock changes.
start_time = time.perf_counter()

df = pd.read_csv("test_data_75MB.csv")

print("%s seconds" % (time.perf_counter() - start_time))

1.310065746307373 seconds


## 02_pandas_with_chunksize_option

In [13]:
# time a chunked pandas read followed by re-assembling the chunks.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is the wall clock and can jump if the system clock changes.
start_time = time.perf_counter()

# with chunksize set, read_csv returns an iterator that yields dataframes
# of up to 5000 rows each instead of one big dataframe
df_chunk = pd.read_csv("test_data_75MB.csv", chunksize=5000)

# consume the iterator into a list of chunks, then concatenate them once
chunk_list = list(df_chunk)
df_concat = pd.concat(chunk_list)

print("%s seconds" % (time.perf_counter() - start_time))

2.338909864425659 seconds


## 03_dask_dataframe

In [22]:
# time reading the CSV through dask.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is the wall clock and can jump if the system clock changes.
start_time = time.perf_counter()

# .compute() is included in the timed region, so the measurement covers
# evaluating the dask dataframe, not just declaring it
data = dask.dataframe.read_csv("test_data_75MB.csv").compute()

print("%s seconds" % (time.perf_counter() - start_time))

1.3885600566864014 seconds


Unlike pandas, dask does not load the data fully into memory up front: `read_csv` only sets up a lazy dataframe that is ready to be processed. Note that the cell above calls `.compute()`, which does evaluate the whole dataframe, so the measured time is comparable to the pandas runs. Certain operations can also be performed again without loading the whole dataset into memory. Another advantage is that most functions used with pandas can also be used with dask. The differences arise from the parallel nature of dask.

# medium data

## 00_Generating a CSV file with random data ~750 MB

In [23]:
# number of rows for the medium CSV file (10M)
N = 10000000

# seeded generator so that re-running the notebook regenerates the same data
rng = np.random.default_rng(42)

# pandas dataframe (df) with 7 integer columns named 'A' to 'G' and N rows,
# filled with random integers in the half-open range [9, 999)
df = pd.DataFrame(
    rng.integers(9, 999, size=(N, 7)),
    columns=list('ABCDEFG'))

# one column 'H' of float type drawn from the uniform distribution on [0, 1)
df['H'] = rng.random(N)

# two additional columns 'I' and 'J' with random 10-character alphanumeric strings.
# pd.util.testing.rands_array was deprecated in pandas 1.0 and removed in pandas 2.0,
# so the strings are built with numpy directly: draw N * STRING_LENGTH single
# characters and reinterpret every run of STRING_LENGTH characters as one string.
STRING_LENGTH = 10
alphabet = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1))
for column in ('I', 'J'):
    df[column] = rng.choice(alphabet, size=N * STRING_LENGTH).view((np.str_, STRING_LENGTH))

# expect: 10M rows x 10 columns
assert df.shape == (N, 10), df.shape
df.shape

(10000000, 10)

In [None]:
# write the 10M-row dataframe to disk as CSV (~750MB), leaving out the row index
df.to_csv(path_or_buf="test_data_750MB.csv", index=False)