# Imports and data

In [1]:
import pandas as pd
import numpy as np
import modin.pandas as md
import swifter

These data (~72 million rows) were taken from https://www.kaggle.com/benhamner/sf-bay-area-bike-share/data

In [2]:
# NOTE(review): `trips` is not referenced again in the visible part of this notebook — confirm it is still needed.
trips = pd.read_csv('trip.csv')

In [2]:
# Load the station status readings; `data` is the frame the examples below operate on.
data = pd.read_csv('status.csv')

In [3]:
# Report the frame's dimensions, then preview its first rows.
print(data.shape)
data.head()

(71984434, 4)


Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,2,25,2013/08/29 12:06:01
1,2,2,25,2013/08/29 12:07:01
2,2,2,25,2013/08/29 12:08:01
3,2,2,25,2013/08/29 12:09:01
4,2,2,25,2013/08/29 12:10:01


# Apply any function in the fastest available manner

## When possible, the vectorized form of the function is used, for a ~100x speedup over pandas

In [7]:
def bikes_proportion(x, max_x):
    """Return ``x`` expressed as a fraction of ``max_x``.

    Only arithmetic operators are used, so ``x`` may be a single number or
    an array-like object that supports ``*`` and ``/``.
    """
    # Multiply by 1.0 first so the division is carried out on floats.
    as_float = x * 1.0
    return as_float / max_x

In [8]:
# The column-wide maximum is computed once here and forwarded to bikes_proportion as the `max_x` keyword.
%time data['bike_prop'] = data['bikes_available'].swifter.apply(bikes_proportion, max_x=np.max(data['bikes_available']))

CPU times: user 952 ms, sys: 952 ms, total: 1.9 s
Wall time: 1.9 s


## When a vectorized form is not available, dask parallel processing is used, for a ~10x speedup over pandas

In [9]:
def gt_5_bikes(x):
    """Tell whether a single bike count ``x`` is strictly greater than 5.

    Deliberately scalar-only: the conditional needs one truth value, so a
    multi-element array or Series raises instead of being processed
    element-wise.
    """
    return True if x > 5 else False

In [10]:
# gt_5_bikes branches on a single value; the recorded output for this cell shows a "Dask Apply" progress bar.
%time data['gt_5_bikes'] = data['bikes_available'].swifter.apply(gt_5_bikes)

HBox(children=(IntProgress(value=0, description='Dask Apply', max=48), HTML(value='')))


CPU times: user 2.55 s, sys: 2.54 s, total: 5.1 s
Wall time: 8.62 s


### But when possible, you should still write code in a vectorized format

In [11]:
def gt_5_bikes_vectorized(x):
    """Element-wise version of the "more than 5 bikes" check.

    Returns the result of ``np.where`` over the comparison ``x > 5``:
    ``True`` where it holds and ``False`` elsewhere.
    """
    more_than_five = x > 5
    return np.where(more_than_five, True, False)

In [12]:
# Same check through the np.where-based function; the recorded output for this cell shows no Dask progress bar.
%time data['gt_5_bikes_vec'] = data['bikes_available'].swifter.apply(gt_5_bikes_vectorized)

CPU times: user 138 ms, sys: 29.7 ms, total: 168 ms
Wall time: 167 ms


In [13]:
# Preview the frame with the bike_prop, gt_5_bikes and gt_5_bikes_vec columns added above.
data.head()

Unnamed: 0,station_id,bikes_available,docks_available,time,bike_prop,gt_5_bikes,gt_5_bikes_vec
0,2,2,25,2013/08/29 12:06:01,0.074074,False,False
1,2,2,25,2013/08/29 12:07:01,0.074074,False,False
2,2,2,25,2013/08/29 12:08:01,0.074074,False,False
3,2,2,25,2013/08/29 12:09:01,0.074074,False,False
4,2,2,25,2013/08/29 12:10:01,0.074074,False,False


## When you can't write code in a vectorized format, swifter still makes parallel processing easy 

In [5]:
# Parse the `time` column with pd.to_datetime and store the result in a new `date` column.
# NOTE(review): the recorded execution count for this cell (5) is out of sequence with its neighbours.
%time data['date'] = data['time'].swifter.apply(pd.to_datetime)

CPU times: user 10.5 s, sys: 751 ms, total: 11.3 s
Wall time: 11.3 s


In [17]:
def convert_to_human(datetime):
    """Format a timestamp as e.g. 'Thursday, the 29th day of August, 2013'.

    Parameters
    ----------
    datetime : timestamp-like
        A single value providing ``day_name()``, ``day``, ``year`` and
        ``strftime`` (``pd.Timestamp`` does).

    Returns
    -------
    str
        Weekday name, day of month with its English ordinal suffix, month
        name and year.
    """
    day = datetime.day
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if 11 <= day % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return datetime.day_name() + ', the ' + str(day) + suffix + ' day of ' + datetime.strftime("%B") + ', ' + str(datetime.year)

In [18]:
# Build a human-readable date string per row; the recorded run shows a "Dask Apply" progress bar and ~30 min of wall time.
%time data['readable_date'] = data['date'].swifter.apply(convert_to_human)

HBox(children=(IntProgress(value=0, description='Dask Apply', max=48), HTML(value='')))


CPU times: user 4min 18s, sys: 2min 2s, total: 6min 21s
Wall time: 30min 25s


In [19]:
# Preview the frame with the `date` and `readable_date` columns added above.
data.head()

Unnamed: 0,station_id,bikes_available,docks_available,time,bike_prop,gt_5_bikes,gt_5_bikes_vec,date,readable_date
0,2,2,25,2013/08/29 12:06:01,0.074074,False,False,2013-08-29 12:06:01,"Thursday, the 29th day of August, 2013"
1,2,2,25,2013/08/29 12:07:01,0.074074,False,False,2013-08-29 12:07:01,"Thursday, the 29th day of August, 2013"
2,2,2,25,2013/08/29 12:08:01,0.074074,False,False,2013-08-29 12:08:01,"Thursday, the 29th day of August, 2013"
3,2,2,25,2013/08/29 12:09:01,0.074074,False,False,2013-08-29 12:09:01,"Thursday, the 29th day of August, 2013"
4,2,2,25,2013/08/29 12:10:01,0.074074,False,False,2013-08-29 12:10:01,"Thursday, the 29th day of August, 2013"


# Multiple columns apply example

In [20]:
def bikes_per_dock_availability_ratio(bikes_avail, docks_avail):
    """Return the number of available bikes per available dock.

    Computed as ``bikes_avail / docks_avail``; a zero ``docks_avail`` gets
    no special handling.
    """
    ratio = bikes_avail / docks_avail
    return ratio

In [26]:
%time data["bikes_available_per_dock_available"] = data[['bikes_available', 'docks_available']].swifter.apply(lambda row: bikes_per_dock_availability_ratio(row["bikes_available"], row["docks_available"]))

CPU times: user 2.97 s, sys: 6.86 s, total: 9.83 s
Wall time: 11.7 s


In [27]:
# Preview the frame with the ratio column added above.
# NOTE(review): the recorded output already lists `rolling_sum_bikes_available`, which is only created in a later cell — the saved output looks stale; re-run top to bottom.
data.head()

Unnamed: 0,station_id,bikes_available,docks_available,time,bike_prop,gt_5_bikes,gt_5_bikes_vec,date,readable_date,bikes_available_per_dock_available,rolling_sum_bikes_available
0,2,2,25,2013/08/29 12:06:01,0.074074,False,False,2013-08-29 12:06:01,"Thursday, the 29th day of August, 2013",0.08,
1,2,2,25,2013/08/29 12:07:01,0.074074,False,False,2013-08-29 12:07:01,"Thursday, the 29th day of August, 2013",0.08,
2,2,2,25,2013/08/29 12:08:01,0.074074,False,False,2013-08-29 12:08:01,"Thursday, the 29th day of August, 2013",0.08,
3,2,2,25,2013/08/29 12:09:01,0.074074,False,False,2013-08-29 12:09:01,"Thursday, the 29th day of August, 2013",0.08,
4,2,2,25,2013/08/29 12:10:01,0.074074,False,False,2013-08-29 12:10:01,"Thursday, the 29th day of August, 2013",0.08,


# Applymap example

In [None]:
# Apply `float` to every element of the two columns and write the result back in place of the originals.
data[["bikes_available", "docks_available"]] = data[["bikes_available", "docks_available"]].swifter.applymap(float)



HBox(children=(FloatProgress(value=0.0, description='Dask Applymap', max=24.0, style=ProgressStyle(description…




# Rolling objects apply example

In [None]:
# Preview the frame before the rolling column is added.
data.head()

In [28]:
# Apply the builtin `sum` to each rolling window of 10 consecutive `bikes_available` values.
%time data["rolling_sum_bikes_available"] = data['bikes_available'].swifter.rolling(10).apply(sum)

  self._samp_pd.apply(func, *args, **kwds)
  return getattr(rolling, name)(*args, **kwargs)


HBox(children=(IntProgress(value=0, description='Dask Apply', max=47), HTML(value='')))


CPU times: user 5.01 s, sys: 11.5 s, total: 16.5 s
Wall time: 34.8 s


In [29]:
# Show rows 10-19 (all columns) to inspect the rolling sum column.
data.iloc[10:20,:]

Unnamed: 0,station_id,bikes_available,docks_available,time,bike_prop,gt_5_bikes,gt_5_bikes_vec,date,readable_date,bikes_available_per_dock_available,rolling_sum_bikes_available
10,2,2,25,2013/08/29 12:18:01,0.074074,False,False,2013-08-29 12:18:01,"Thursday, the 29th day of August, 2013",0.08,20.0
11,2,2,25,2013/08/29 12:19:01,0.074074,False,False,2013-08-29 12:19:01,"Thursday, the 29th day of August, 2013",0.08,20.0
12,2,2,25,2013/08/29 12:20:01,0.074074,False,False,2013-08-29 12:20:01,"Thursday, the 29th day of August, 2013",0.08,20.0
13,2,2,25,2013/08/29 12:21:01,0.074074,False,False,2013-08-29 12:21:01,"Thursday, the 29th day of August, 2013",0.08,20.0
14,2,2,25,2013/08/29 12:22:01,0.074074,False,False,2013-08-29 12:22:01,"Thursday, the 29th day of August, 2013",0.08,20.0
15,2,2,25,2013/08/29 12:23:01,0.074074,False,False,2013-08-29 12:23:01,"Thursday, the 29th day of August, 2013",0.08,20.0
16,2,2,25,2013/08/29 12:25:01,0.074074,False,False,2013-08-29 12:25:01,"Thursday, the 29th day of August, 2013",0.08,20.0
17,2,2,25,2013/08/29 12:26:01,0.074074,False,False,2013-08-29 12:26:01,"Thursday, the 29th day of August, 2013",0.08,20.0
18,2,2,25,2013/08/29 12:27:04,0.074074,False,False,2013-08-29 12:27:04,"Thursday, the 29th day of August, 2013",0.08,20.0
19,2,2,25,2013/08/29 12:29:01,0.074074,False,False,2013-08-29 12:29:01,"Thursday, the 29th day of August, 2013",0.08,20.0


# Resampler apply example

In [None]:
# Make the parsed `date` column the index ahead of the resample call in the next cell.
# NOTE(review): in-place and not re-runnable — a second run presumably fails because `date` is no longer a column.
data.set_index("date", inplace=True)

In [9]:
# Resample `bikes_available` into 1-day bins and take the mean of each bin.
# NOTE(review): the resampled result has one entry per day while `data` keeps one row per reading; assigning it as a column
# presumably aligns on the index and leaves most rows NaN — confirm this is intended.
%time data["daily_avg_bikes_available"] = data["bikes_available"].swifter.resample("1d").apply(np.mean)



HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=95.0, style=ProgressStyle(description_wi…




# Modin apply example

In [4]:
modin_data = md.DataFrame(data)



CPU times: user 101 ms, sys: 68.4 ms, total: 170 ms
Wall time: 1.61 s


In [6]:
# Same swifter accessor, this time on a Modin column: add 1 to every `bikes_available` value.
%time modin_data["bikes_available_plus1"] = modin_data["bikes_available"].swifter.apply(lambda x: x+1)

CPU times: user 79.9 ms, sys: 30.7 ms, total: 111 ms
Wall time: 1.53 s


In [7]:
# Preview the Modin frame with the new `bikes_available_plus1` column.
modin_data.head()

Unnamed: 0,station_id,bikes_available,docks_available,time,bikes_available_plus1
0,2,2,25,2013/08/29 12:06:01,3
1,2,2,25,2013/08/29 12:07:01,3
2,2,2,25,2013/08/29 12:08:01,3
3,2,2,25,2013/08/29 12:09:01,3
4,2,2,25,2013/08/29 12:10:01,3
