In [1]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

In [2]:
import sys
import os
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir('..')

from tqdm.notebook import trange, tqdm
    
import numpy as np
from numba import jit
import pandas as pd
pd.set_option('display.max_rows', 100)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from bundle_notifications.bundle_notifications import load_data,create_message
from bundle_notifications.optimal_delay import delay, local_search



# 1. Load data

We first load the first 15,000 rows of the dataset and then take a small sample of it (one user on one day) to run some speed tests.

In [3]:
url = 'https://static-eu-komoot.s3.amazonaws.com/backend/challenge/notifications.csv'
df = load_data(url, nrows=15000)

# Timestamps as integer nanoseconds, for the numpy-based helpers used below.
# "int64" is spelled out on purpose: plain "int" resolves to a 32-bit integer on
# some platforms (e.g. Windows with NumPy < 2), which cannot hold nanosecond values.
df['timestamp_ns'] = df.timestamp.astype("int64")
# Day of the year (1-366); used below to select a single day of notifications.
df['dayofyear'] = df.timestamp.dt.dayofyear

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   timestamp     15000 non-null  datetime64[ns]
 1   user_id       15000 non-null  object        
 2   friend_id     15000 non-null  object        
 3   friend_name   15000 non-null  object        
 4   timestamp_ns  15000 non-null  int64         
 5   dayofyear     15000 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 703.2+ KB


In [4]:
# Sample: all notifications of a single user on a single day (day 213 of the year)
df_g = df[(df.dayofyear == 213) & (df.user_id == 'CFFEC5978B0A4A05FA6DCEFB2C82CC')].copy()
df_g.head()

Unnamed: 0,timestamp,user_id,friend_id,friend_name,timestamp_ns,dayofyear
3,2017-08-01 01:20:47,CFFEC5978B0A4A05FA6DCEFB2C82CC,2BB0471CAA78ED0FCEE143E175F034,Mona,1501550447000000000,213
25,2017-08-01 02:28:27,CFFEC5978B0A4A05FA6DCEFB2C82CC,2BB0471CAA78ED0FCEE143E175F034,Mona,1501554507000000000,213
29,2017-08-01 03:00:42,CFFEC5978B0A4A05FA6DCEFB2C82CC,74C09338D7CA031859AE26A1586692,Toomas,1501556442000000000,213
39,2017-08-01 03:51:05,CFFEC5978B0A4A05FA6DCEFB2C82CC,F039A0F7A3245F7B2D7BD0942F3680,Sean,1501559465000000000,213
99,2017-08-01 05:03:44,CFFEC5978B0A4A05FA6DCEFB2C82CC,DF6A386FE701217C2A12292DB8D142,Buse,1501563824000000000,213


## Speed tests: __add_notif_counter()__


In [5]:
def notif_counter_pandas(df_g,x):
    """Pandas reference implementation of the notification counter.

    Rows at the positions listed in ``x`` are flagged in a boolean column;
    every row is then numbered with 1 + the number of flagged rows strictly
    before it (shifted cumulative sum).

    Parameters
    ----------
    df_g : pd.DataFrame
        Modified in place: the columns ``notification_bool`` and
        ``notification_counter`` are added (or overwritten).
    x : array-like of int
        Positional (0-based) row indices to flag.

    Returns
    -------
    pd.Series
        The ``notification_counter`` column. Its dtype is float because the
        shift introduces a NaN before the first value is filled in.
    """
    df_g['notification_bool'] = False
    # One positional write on the frame itself. The chained form
    # df_g[col].iloc[x] = ... assigns to a temporary under pandas copy-on-write
    # (and raises SettingWithCopyWarning before that).
    df_g.iloc[x, df_g.columns.get_loc('notification_bool')] = True

    # Now do a cumsum counter, built as a standalone Series and attached once
    counter = df_g['notification_bool'].cumsum().shift() + 1
    if len(counter) > 0:
        # The shift leaves the first row undefined; it always belongs to bundle 1
        counter.iloc[0] = 1
    df_g['notification_counter'] = counter
    return df_g.notification_counter

@jit
def add_notif_counter_j(x,notification_counter):
    """Fill ``notification_counter`` in place; jit-compiled twin of add_notif_counter.

    Works for any number of positions in ``x`` (the earlier version was
    hard-coded to exactly three breakpoints).

    Parameters
    ----------
    x : 1-D integer array
        Increasing positional indices of the rows at which a notification
        is sent.
    notification_counter : 1-D integer array
        Pre-allocated output buffer, normally of length ``x[-1] + 1``.
        It is overwritten.

    Returns
    -------
    The same ``notification_counter`` array: rows up to and including
    ``x[0]`` hold 1, rows up to and including ``x[1]`` hold 2, and so on.
    Any rows after ``x[-1]`` hold ``len(x) + 1``.
    """
    start = 0
    for k in range(len(x)):
        stop = x[k] + 1
        notification_counter[start:stop] = k + 1
        start = stop
    # Only non-empty when the buffer is longer than x[-1] + 1
    notification_counter[start:] = len(x) + 1

    return notification_counter


def add_notif_counter(x):
    """Number every row with the notification bundle it belongs to.

    Equivalent function to:

    df_g['notification_bool'] = False
    df_g.notification_bool.iloc[x] = True

    # Now do a cumsum counter
    df_g['notification_counter'] = df_g.notification_bool.cumsum().shift()+1
    df_g.notification_counter.iloc[0] = 1

    Works for any number of positions in ``x`` (the earlier version was
    hard-coded to exactly three breakpoints and ignored everything after
    ``x[2]``).

    Parameters
    ----------
    x : sequence of int
        Increasing positional indices of the rows at which a notification
        is sent. The last entry is taken as the index of the last row.

    Returns
    -------
    np.ndarray of int, length ``x[-1] + 1``
        Rows up to and including ``x[0]`` hold 1, rows up to and including
        ``x[1]`` hold 2, and so on. An empty ``x`` gives an empty array.
    """
    if len(x) == 0:
        return np.zeros(0,dtype='int')

    notification_counter = np.zeros(x[-1]+1,dtype='int')
    start = 0
    for bundle_number, position in enumerate(x, start=1):
        # Slice end is exclusive, so +1 keeps the sending row in its own bundle
        notification_counter[start:position+1] = bundle_number
        start = position+1

    return notification_counter

In [7]:
print(f"Shape of test dataset: {df_g.shape}")
x = local_search(df_g.timestamp_ns.to_numpy())

print('With @jit:')
%time n1 = add_notif_counter_j(x,np.zeros(x[-1]+1,dtype='int')) 

print('Without @jit:')
%time n2 = add_notif_counter(x)

print('With pandas:')
%time n3 = notif_counter_pandas(df_g,x)

np.allclose(n1,n2.astype("int") ) 
np.allclose(n2,n3.astype("int") ) 

Shape of test dataset: (58, 8)
With @jit:
CPU times: user 24 µs, sys: 1 µs, total: 25 µs
Wall time: 26.9 µs
Without @jit:
CPU times: user 25 µs, sys: 1e+03 ns, total: 26 µs
Wall time: 28.1 µs
With pandas:
CPU times: user 2.2 ms, sys: 449 µs, total: 2.65 ms
Wall time: 2.35 ms


True

## Speed test: groupby

In [8]:
# Fresh copy of the single-user, single-day sample (same filter as the earlier cell)
df_g = df[(df.dayofyear == 213) & (df.user_id == 'CFFEC5978B0A4A05FA6DCEFB2C82CC')].copy()

# When to send notifications, as computed by local_search from the raw timestamps
x = local_search(df_g['timestamp_ns'].to_numpy())

# Store the per-row notification counter as a new column
df_g['notification_counter'] = add_notif_counter(x)


In [9]:
# Baseline: within each notification_counter group, build a running count of
# distinct friend_id values (1 for a first occurrence, 0 for a repeat, then cumsum).
print('Speed with pandas:')
%time tours_pd =  df_g.groupby('notification_counter')['friend_id'].apply(lambda x: (1- x.duplicated()).cumsum())#.droplevel(0)

from bundle_notifications.bundle_notifications import count_tours_per_notif

# Inputs
# Plain numpy arrays extracted from the sample frame, one per column the helper takes.
notification_counter = df_g.notification_counter.to_numpy('int')
friend_id = df_g.friend_id.to_numpy()
friend_name = df_g.friend_name.to_numpy()
timestamp = df_g.timestamp.to_numpy()

# The helper returns four values; only the first is compared with the pandas result.
print('Speed with custom numpy:')
%time tours_np,_,_,_ = count_tours_per_notif(notification_counter, friend_id,friend_name,timestamp)

# Sanity check that both approaches produce the same counts (displayed as the cell output).
np.allclose(tours_pd.astype("int"),tours_np ) 

Speed with pandas:
CPU times: user 3.84 ms, sys: 533 µs, total: 4.37 ms
Wall time: 3.91 ms
Speed with custom numpy:
CPU times: user 78 µs, sys: 14 µs, total: 92 µs
Wall time: 87 µs


True

A custom numpy function is about 45x faster (3.91 ms vs. 87 µs wall time in the run above)! The comparison is not even like-for-like, because _count_tours_per_notif_ also returns some extra information.

## Speed analysis for *_bundle()_*

In [10]:
from bundle_notifications.bundle_notifications import bundle_func, bundle

# Use a subset of the dataset
users = df.user_id.unique()[:50]
df_subset = df.loc[ (df.user_id.isin(users)),].copy()
print(df_subset.shape)

%lprun -T notebooks/profile_bundle_func -f bundle_func bundle(df_subset)
print(open('notebooks/profile_bundle_func', 'r').read())

(2513, 6)

*** Profile printout saved to text file 'notebooks/profile_bundle_func'. 
Timer unit: 1e-06 s

Total time: 0.6141 s
File: /Users/jsg/Documents/GitHub/bundle_notifications_ds/bundle_notifications/bundle_notifications.py
Function: bundle_func at line 229

Line #      Hits         Time  Per Hit   % Time  Line Contents
   229                                           def bundle_func(df_g):
   230                                               """Bundles notifications for a user_id
   231                                           
   232                                               This function is meant to be used after a pandas groupping (or manual filtering) of user_ids. 
   233                                           
   234                                               Parameters
   235                                               ----------
   236                                               df_g : pd.DataFrame
   237                                                   Da

1. I am surprised that a simple filter of rows (`df_g = df_g.iloc[x,]`) takes as much time as a much more complex custom function `df_g['tours'], name_first,timestamp_first_tour, message = count_tours_per_notif(...)`
2. Assigning a value to a new column (`df_g['message'] = message`) also takes quite some time.

