In [1]:
import pandas as pd
import time
import multiprocessing as mp
from multiprocessing import Pool
import numpy as np

In [2]:
# Location of the lap-times CSV on disk, kept in a single named constant
LAP_TIMES_CSV = '/Users/lindseyclark/Documents/formula_1_project/formula-1-race-data-19502017/lapTimes.csv'
# Read the whole file into an in-memory pandas DataFrame
dataframe = pd.read_csv(LAP_TIMES_CSV)

In [None]:
# Show pandas' summary statistics for the lap-times frame loaded above
dataframe.describe()

In [None]:
# Peek at the first 10 rows to see the columns and the raw values
dataframe.head(10)

In [3]:
# Transformation used as the workload for every timing in this notebook
def create_split_cols(dataframe):
    """Split every value of the ``time`` column on "." into its pieces.

    Each value is converted with ``str`` and split on the "." character; the
    pieces of one value are wrapped in a ``pandas.Series`` and handed back to
    ``Series.apply``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame exposing a ``time`` column.

    Returns
    -------
    Whatever ``Series.apply`` builds from the per-value Series (for the
    non-empty frames used here: one row per input value, one column per piece).
    """
    def _pieces(value):
        # one Series per cell, which apply() collects into the result
        return pd.Series(str(value).split("."))

    time_column = dataframe.time
    return time_column.apply(_pieces)

# Without Multiprocess

In [None]:
# Baseline: run the split inside the notebook's own process and time it.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted while the cell runs.
start_time = time.perf_counter()
dataframe2 = create_split_cols(dataframe)
end_time = time.perf_counter()
time_elapsed = end_time - start_time
print('The runtime without multiprocess is %s seconds' %(time_elapsed))

In [None]:
# Display the split columns produced by the single-process run above
dataframe2

# With Multiprocess

In [None]:
# Build (but do not start yet) a child process that will call create_split_cols(dataframe)
split_process = mp.Process(target=create_split_cols, args=(dataframe,))

In [None]:
# Time a single child process performing the whole split.
# The child's return value is never sent back to this notebook (no queue or
# pipe is used), so this measures process start-up plus the split itself.
# NOTE(review): under the "spawn" start method the child has to re-import
# create_split_cols; a function defined only in a notebook cell may not be
# importable there - the exit-code check below makes such a failure visible.
start_time = time.perf_counter()  # monotonic clock intended for interval timing
split_process.start()
split_process.join()  # blocks until the child has exited, so no terminate() is needed afterwards
end_time = time.perf_counter()
time_elapsed = end_time - start_time
print('The runtime with multiprocess is %s seconds' %(time_elapsed))
# A non-zero exit code means the child did not finish the work normally.
if split_process.exitcode != 0:
    print('Warning: the child process exited with code %s, so the timing above is not meaningful' %(split_process.exitcode))

# With Multiprocess and Pool

In [None]:
#http://www.racketracer.com/2016/07/06/pandas-in-parallel/
#The Process class runs a single task in its own child process,
#while the Pool class distributes a collection of tasks across a fixed set of worker processes.
#https://medium.com/@urban_institute/using-multiprocessing-to-make-python-code-faster-23ea5ef996ba

In [None]:
num_partitions = 10 #number of partitions to split dataframe
num_cores = 4 #number of cores on your machine

def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in a worker pool and concatenate the results.

    ``df`` is cut into ``num_partitions`` consecutive row chunks, the chunks
    are mapped over a ``multiprocessing.Pool`` of ``num_cores`` workers, and
    the per-chunk results are joined with ``pandas.concat`` in chunk order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Called once per chunk with that chunk as its only argument; must
        return something ``pandas.concat`` accepts. It is sent to the worker
        processes by ``Pool.map``, so it has to be picklable.

    Returns
    -------
    The concatenation of ``func(chunk)`` over all chunks.

    Raises
    ------
    Whatever ``func`` raises in a worker is re-raised here by ``Pool.map``;
    the pool is still closed and joined in that case.
    """
    # Chunk sizes follow np.array_split's rule (the first len % n chunks get
    # one extra row). The frame is sliced with iloc rather than passed to
    # np.array_split directly, because that path goes through
    # DataFrame.swapaxes, which pandas has deprecated.
    base, extra = divmod(len(df), num_partitions)
    sizes = [base + 1] * extra + [base] * (num_partitions - extra)
    df_split = []
    start = 0
    for size in sizes:
        df_split.append(df.iloc[start:start + size])
        start += size

    pool = Pool(num_cores)
    try:
        return pd.concat(pool.map(func, df_split))
    finally:
        # Always release the worker processes, even when func raised.
        pool.close()
        pool.join()

In [None]:
# Time the pool-based version: the frame is chunked, the chunks are processed
# by the worker pool, and the pieces are concatenated back together.
start_time = time.perf_counter()  # monotonic clock intended for interval timing
# Keep the result so it can be inspected or compared with the baseline output.
dataframe2_pool = parallelize_dataframe(dataframe, create_split_cols)
end_time = time.perf_counter()
time_elapsed = end_time - start_time
print('The runtime with multiprocess and pool is %s seconds' %(time_elapsed))

# Dask

In [10]:
# Open the same CSV through dask instead of pandas and time the call with the %time magic.
# NOTE(review): dask presumably only builds a lazy task graph here, so this timing
# likely excludes actually parsing the file - confirm by timing a .compute() call.
import dask.dataframe as dd
%time dataframe_dask = \
dd.read_csv("/Users/lindseyclark/Documents/formula_1_project/formula-1-race-data-19502017/lapTimes.csv")

CPU times: user 15.4 ms, sys: 17.9 ms, total: 33.3 ms
Wall time: 33 ms


In [None]:
# Dask entry points plus the machine's CPU count.
from dask import dataframe as dd  # same module that the earlier dask cell already bound to `dd`
from dask.multiprocessing import get  # passed as get= to compute() in a later cell
from multiprocessing import cpu_count
nCores = cpu_count()  # not referenced again in the visible cells

In [None]:
# Wrap the in-memory pandas frame in a dask DataFrame made of 20 partitions.
dataframe_dask = dd.from_pandas(dataframe, npartitions=20)

# Describe the output schema to dask: run the function on a few rows and keep
# none of them, which leaves an empty frame with the right columns and dtypes.
split_meta = create_split_cols(dataframe.head()).iloc[:0]

# map_partitions calls the given function with each partition as its first
# argument, so create_split_cols is passed directly. The previous zero-argument
# lambda could not accept the partition and only returned the function object.
result = dataframe_dask.map_partitions(create_split_cols, meta=split_meta).compute(scheduler='processes')

In [None]:
def test_f(dataframe, time):
    return dataframe.assign(result=pd.Series(str(x).split("."))


# Run test_f on every partition of the dask frame built earlier. map_partitions
# supplies the partition itself as the first argument, so only the remaining
# argument (the column name) is passed here.
ddf_out = dataframe_dask.map_partitions(test_f, 'time')

# Here is good place to do something with BIG ddf_out dataframe before calling .compute()

# Select the multiprocessing scheduler through the scheduler= keyword, as the
# earlier map_partitions cell does, rather than the old get= keyword.
result = ddf_out.compute(scheduler='processes')

In [None]:
# Rebuild the 20-partition dask frame from the pandas frame (same call as in the earlier dask cell)
dataframe_dask = dd.from_pandas(dataframe, npartitions=20)
