In [2]:
import pandas as pd
import datetime
import numpy as np
from pandarallel import pandarallel

pd.set_option('display.max_columns',None)

Your first task is to create a stacked chart of **unique online devices per
day**, segregated by fleet size. The fleet size is an attribute of each user
and is defined as the number of online devices that this user had at a
particular day. You can split the dataset in the following fleet sizes:

* 1-2 devices
* 3-9 devices
* 10-99 devices
* 100-999 devices

A device should be counted as online for a particular day if it was online for
any amount of time during that day. For example, a device that appear online
for only a second should still be counted for that day.

The specific rules we have selected to deal with problematic sections
of a device's timeline can be summarised in the following table:

| current event | current server | next event | next server | rule        |
|---------------|----------------|------------|-------------|-------------|
| online        | X              | online     | X           | Assume device was online from current event's timestamp until next event's timestamp
| online        | X              | online     | Y           | Assume device was online from current event's timestamp until X's destruction time or next event's timestamp, whichever is smaller
| online        | X              | offline    | X           | Normal case
| online        | X              | offline    | Y           | Assume device was online from current event's timestamp until X's destruction time. Ignore next event.
| offline       | X              | online     | X           | Normal case
| offline       | X              | online     | Y           | Normal case
| offline       | X              | offline    | X           | Ignore next event
| offline       | X              | offline    | Y           | Ignore next event

In [3]:
def prev_server(df):
    """
    calculate previous server connection's data: previous server id, previous connected status and previous timestamp
    """
    df['prev_server_id']=(df.sort_values(['timestamp'], ascending=True)
                               .groupby(['device_id'])['server_id'].shift(-1))
    df['prev_connected']=(df.sort_values(['timestamp'], ascending=True)
                             .groupby(['device_id'])['connected'].shift(-1)).astype(bool)
    df['prev_timestamp']=(df.sort_values(['timestamp'], ascending=True)
                                             .groupby(['device_id'])['timestamp'].shift(-1))
    df['ignored']=False

def mark_ignored_rows(row):
    """
    function to mark rows as ignored, cases:
    1. X offline -> X offline
    2. X offline -> Y offline
    """
    return row['connected']==False and row['prev_connected']==False

    
def process_dataframe(df):
    df['ignored'] = df.apply(lambda row: prepare_final(row), axis=1)

def parallelize_dataframe(df, func, n_cores=4):
    pool = Pool(n_cores)
    df_split = np.array_split(df, n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [4]:
names=['id','created_at','destroyed_at']
servers = pd.read_csv('data/servers.csv',names=names)

In [5]:
names = ['timestamp','device_id', 'user_id','server_id','connected']
events = pd.read_csv('data/connectivity_events.csv',names=names)#,nrows=1000)

In [8]:
events_merged=events.merge(servers,left_on='server_id',right_on='id')
events_merged.shape[0]

9308207

In [9]:
%%timeit
prev_server(events_merged)

29.1 s ± 1.89 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
events_merged.head()

Unnamed: 0,timestamp,device_id,user_id,server_id,connected,id,created_at,destroyed_at,prev_server_id,prev_connected,prev_timestamp,ignored
0,2015-05-31 14:30:00,100,4,2,True,2,2015-05-31 14:30:00.000000,2017-08-03 22:42:57.929,2.0,False,2015-06-02 10:30:00,False
1,2015-05-31 14:30:00,101,1,2,True,2,2015-05-31 14:30:00.000000,2017-08-03 22:42:57.929,2.0,False,2015-05-31 18:30:00,False
2,2015-05-31 14:30:00,102,1,2,True,2,2015-05-31 14:30:00.000000,2017-08-03 22:42:57.929,2.0,False,2015-05-31 19:30:00,False
3,2015-05-31 14:30:00,103,26,2,True,2,2015-05-31 14:30:00.000000,2017-08-03 22:42:57.929,2.0,False,2015-06-01 02:30:00,False
4,2015-05-31 14:30:00,104,1,2,True,2,2015-05-31 14:30:00.000000,2017-08-03 22:42:57.929,2.0,False,2015-06-06 02:30:00,False


In [25]:
%%timeit
pandarallel.initialize()
events_merged.parallel_apply(generate_previous_connection)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


TypeError: sort_values() got an unexpected keyword argument 'by'

In [15]:
events_merged[events_merged['prev_timestamp'].notna()].shape[0]

9251905

In [35]:
test = events_merged.sample(100000)
test.head()

Unnamed: 0,timestamp,device_id,user_id,server_id,connected,id,created_at,destroyed_at
3684102,2017-11-02 19:38:59.192,9529,168,16,False,16,2017-10-17 00:50:09.487556,2017-11-03 20:15:19.644
4729335,2017-11-26 22:13:32.274,40702,2359,20,False,20,2017-11-22 16:15:57.285482,2017-12-08 23:35:12.164
2401275,2017-10-04 15:22:18.894,9529,168,13,False,13,2017-09-26 16:10:36.230545,2017-10-10 20:31:43.062
7586288,2018-01-15 10:55:43.185,20685,1325,29,True,29,2018-01-12 17:01:54.553283,2018-02-12 15:06:52.974
9205922,2018-02-26 17:17:01.703,32193,3455,33,False,33,2018-02-26 15:52:13.678326,2018-03-01 18:05:54.051


In [69]:
test.dtypes

timestamp          object
device_id           int64
user_id             int64
server_id           int64
connected            bool
id                  int64
created_at         object
destroyed_at       object
prev_server_id    float64
prev_connected       bool
prev_timestamp     object
ignored              bool
dtype: object

In [97]:
test['ignored'].value_counts()

False    10000
Name: ignored, dtype: int64

In [103]:
# test['ignored'] = test.apply (lambda row: prepare_final(row), axis=1)
process_dataframe(test)

In [104]:
test['ignored'].value_counts()

False    9621
True      379
Name: ignored, dtype: int64

In [109]:
test[(test['connected']==False) & (test['prev_connected']==False)].shape[0]

4092

In [113]:
%%timeit
process_dataframe(events_merged)

2min 7s ± 3.16 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [134]:
%%timeit
pandarallel.initialize()
events_merged['ignored'] = events_merged.parallel_apply(lambda row: prepare_final(row), axis=1)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process

In [135]:
print(events_merged[(events_merged['connected']==False) & (events_merged['prev_connected']==False)].shape[0])
print(events_merged[events_merged['ignored']==True].shape[0])

379931
379931


In [None]:
print("end:{}".format(datetime.datetime.now()))