# Imports and data

In [7]:
from swifter import swiftapply
import pandas as pd
import numpy as np

These data (~71 million rows) were taken from https://www.kaggle.com/benhamner/sf-bay-area-bike-share/data 

In [2]:
# Load the bike-share status table from a Feather file into a DataFrame.
# NOTE(review): the path is a hard-coded absolute location; point it at wherever the file lives.
data = pd.read_feather('/home/ec2-user/data/status')

In [4]:
# Print the (rows, columns) shape, then display the first rows as the cell output.
print(data.shape)
data.head()

(71984434, 4)


Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,2,25,2013/08/29 12:06:01
1,2,2,25,2013/08/29 12:07:01
2,2,2,25,2013/08/29 12:08:01
3,2,2,25,2013/08/29 12:09:01
4,2,2,25,2013/08/29 12:10:01


# Run any function in the fastest available manner

## When possible, the vectorized form of the function is used, for 100x the speed of pandas

In [5]:
def bikes_proportion(x, max_x):
    """Return ``x`` expressed as a fraction of ``max_x``.

    The numerator is multiplied by ``1.0`` before the division so that
    integer inputs are promoted to floats. Only arithmetic operators are
    used, so the function accepts a scalar as well as any array-like
    object that supports ``*`` and ``/``.

    Parameters
    ----------
    x : value (or array of values) to scale.
    max_x : divisor that ``x`` is measured against.

    Returns
    -------
    ``x * 1.0 / max_x``.
    """
    as_float = x * 1.0
    return as_float / max_x

In [8]:
# Scale every bikes_available value by the column maximum (supplied as max_x)
# and store the result in a new 'bike_prop' column; %time reports how long the call takes.
%time data['bike_prop'] = swiftapply(data['bikes_available'], bikes_proportion, max_x=np.max(data['bikes_available']))

CPU times: user 868 ms, sys: 908 ms, total: 1.78 s
Wall time: 1.77 s


In [11]:
# Display the 'bike_prop' column as the cell output.
data['bike_prop']

0           0.074074
1           0.074074
2           0.074074
3           0.074074
4           0.074074
5           0.074074
6           0.074074
7           0.074074
8           0.074074
9           0.074074
10          0.074074
11          0.074074
12          0.074074
13          0.074074
14          0.074074
15          0.074074
16          0.074074
17          0.074074
18          0.074074
19          0.074074
20          0.074074
21          0.074074
22          0.074074
23          0.074074
24          0.074074
25          0.074074
26          0.074074
27          0.074074
28          0.074074
29          0.074074
              ...   
71984404    0.296296
71984405    0.296296
71984406    0.296296
71984407    0.296296
71984408    0.296296
71984409    0.296296
71984410    0.296296
71984411    0.296296
71984412    0.296296
71984413    0.296296
71984414    0.296296
71984415    0.296296
71984416    0.296296
71984417    0.296296
71984418    0.296296
71984419    0.296296
71984420    0

## When a vectorized form is not available, dask parallel processing is used, for 10x the speed of pandas

In [12]:
def gt_5_bikes(x):
    """Report whether ``x`` is strictly greater than 5.

    Always returns the literal ``True`` or ``False``. The comparison
    result is evaluated for truthiness, so ``x`` must be a value whose
    comparison with 5 has an unambiguous truth value (e.g. a scalar);
    a multi-element array raises ``ValueError`` at that point.
    """
    return True if x > 5 else False

In [13]:
# Run the branch-based gt_5_bikes check over bikes_available via swiftapply,
# store the result in 'gt_5_bikes', and time the call.
%time data['gt_5_bikes'] = swiftapply(data['bikes_available'], gt_5_bikes)

CPU times: user 5.15 s, sys: 4.78 s, total: 9.94 s
Wall time: 13.8 s


### But when possible, you should still write code in a vectorized format

In [16]:
def gt_5_bikes_vectorized(x):
    """Element-wise "more than 5" check built on ``np.where``.

    Forms the condition ``x > 5`` and lets ``np.where`` pick ``True``
    where it holds and ``False`` elsewhere, so an array-like ``x`` is
    processed in a single call rather than one element at a time.
    """
    more_than_five = x > 5
    return np.where(more_than_five, True, False)

In [17]:
# Same check using the np.where-based function; the result goes into
# 'gt_5_bikes_vec' and the call is timed for comparison.
%time data['gt_5_bikes_vec'] = swiftapply(data['bikes_available'], gt_5_bikes_vectorized)

CPU times: user 144 ms, sys: 88 ms, total: 232 ms
Wall time: 231 ms


In [18]:
# Preview the frame now that bike_prop, gt_5_bikes and gt_5_bikes_vec have been added.
data.head()

Unnamed: 0,station_id,bikes_available,docks_available,time,bike_prop,gt_5_bikes,gt_5_bikes_vec
0,2,2,25,2013/08/29 12:06:01,0.074074,False,False
1,2,2,25,2013/08/29 12:07:01,0.074074,False,False
2,2,2,25,2013/08/29 12:08:01,0.074074,False,False
3,2,2,25,2013/08/29 12:09:01,0.074074,False,False
4,2,2,25,2013/08/29 12:10:01,0.074074,False,False


## When you can't write code in a vectorized format, swiftapply still makes parallel processing easy 

In [19]:
# Convert the 'time' column with pd.to_datetime (through swiftapply) and keep the result as 'date'.
data['date'] = swiftapply(data['time'], pd.to_datetime)

In [23]:
def convert_to_human(datetime):
    """Format a timestamp as e.g. ``"Thursday, the 29th day of August, 2013"``.

    Parameters
    ----------
    datetime : a timestamp-like object exposing ``.day``, ``.year`` and
        ``.strftime`` (e.g. ``pandas.Timestamp`` or ``datetime.datetime``).

    Returns
    -------
    str
        ``"<weekday>, the <day><ordinal suffix> day of <month>, <year>"``.
    """
    # `Timestamp.weekday_name` no longer exists in current pandas; prefer
    # `day_name()` and fall back so older pandas and plain datetimes still work.
    if hasattr(datetime, 'day_name'):
        weekday = datetime.day_name()
    elif hasattr(datetime, 'weekday_name'):
        weekday = datetime.weekday_name
    else:
        weekday = datetime.strftime("%A")

    # Use the correct English ordinal suffix instead of always appending "th"
    # (which produced "1th", "22th", ...). 11-13 are the irregular "th" cases.
    day = datetime.day
    if 11 <= day <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')

    return weekday + ', the ' + str(day) + suffix + ' day of ' + datetime.strftime("%B") + ', ' + str(datetime.year)

In [25]:
# Build the human-readable string for every value in 'date' via swiftapply and time the run.
%time data['humanreadable_date'] = swiftapply(data['date'], convert_to_human)

CPU times: user 5min 21s, sys: 34.2 s, total: 5min 55s
Wall time: 9min 24s


In [26]:
# Preview the frame including the new 'date' and 'humanreadable_date' columns.
data.head()

Unnamed: 0,station_id,bikes_available,docks_available,time,bike_prop,gt_5_bikes,gt_5_bikes_vec,date,humanreadable_date
0,2,2,25,2013/08/29 12:06:01,0.074074,False,False,2013-08-29 12:06:01,"Thursday, the 29th day of August, 2013"
1,2,2,25,2013/08/29 12:07:01,0.074074,False,False,2013-08-29 12:07:01,"Thursday, the 29th day of August, 2013"
2,2,2,25,2013/08/29 12:08:01,0.074074,False,False,2013-08-29 12:08:01,"Thursday, the 29th day of August, 2013"
3,2,2,25,2013/08/29 12:09:01,0.074074,False,False,2013-08-29 12:09:01,"Thursday, the 29th day of August, 2013"
4,2,2,25,2013/08/29 12:10:01,0.074074,False,False,2013-08-29 12:10:01,"Thursday, the 29th day of August, 2013"
