In [19]:
import pandas as pd
import numpy as np

from datetime import timedelta

from functools import reduce

In [20]:
# Load the raw working-day log and preview its first rows.
raw_work_path = '../data/raw/WorkingDay.csv'
work = pd.read_csv(raw_work_path)
work.head()

Unnamed: 0,startTime,activeTime,Вых/Будни,monitorTime,id
0,"2021-11-30 00:00:00,000",2,Будни,2,ОРГ1-01553
1,"2021-11-30 00:00:00,000",2,Будни,2,ОРГ1-02112
2,"2021-11-30 00:00:00,000",2,Будни,2,ОРГ1-02112
3,"2021-11-30 00:00:00,000",2,Будни,2,ОРГ1-01846
4,"2021-11-30 00:00:00,000",2,Будни,2,ОРГ1-01846


In [21]:
# Keep only the calendar day of each record: the first 10 characters of `startTime`
# are parsed as '%Y-%m-%d'. `.str[:10]` slices the whole column in one vectorized
# call instead of running a Python lambda per row, and a missing `startTime`
# becomes NaT instead of raising TypeError inside the lambda.
work['date'] = pd.to_datetime(work['startTime'].str[:10], format='%Y-%m-%d')

# Latest day present in the data; later cells use it as the reference point for
# "days since last work" and for the trailing-window cutoffs.
max_date = work['date'].max()

print('start_date_in_data: ', work['date'].min())
print('end_date_in_data: ', max_date)

start_date_in_data:  2021-06-20 00:00:00
end_date_in_data:  2021-12-31 00:00:00


In [22]:
# 0/1 indicators for the day type stored in 'Вых/Будни': one flag for the weekday
# label and one for the weekend label. Any other value (including a missing one)
# gets 0 in both flags.
day_type = work['Вых/Будни']
work['is_workday_work'] = (day_type == 'Будни').astype(int)
work['is_weekend_work'] = (day_type == 'Выходные дни').astype(int)

In [23]:
# Collapse the records to one row per (id, date): the time columns are summed
# and the day-type flags are averaged across that day's records.
daily_keys = ['id', 'date']
daily_spec = {
    'activeTime': 'sum',
    'monitorTime': 'sum',
    'is_workday_work': 'mean',
    'is_weekend_work': 'mean',
}
work = work.groupby(daily_keys, as_index=False).agg(daily_spec)

In [24]:
# Ratio of active time to monitored time for each (id, date) row.
# A zero `monitorTime` is turned into NaN before dividing, so the ratio is NaN
# (undefined) instead of +/-inf. An inf would survive the later `fillna(0)` and
# turn the per-id mean/max/std of this column into inf/NaN; NaN is skipped by
# those aggregations and excluded from `count`.
work['active_monitor_ratio'] = work['activeTime'] / work['monitorTime'].replace(0, np.nan)

In [25]:
# Display the per-(id, date) frame, including the new active_monitor_ratio column.
work

Unnamed: 0,id,date,activeTime,monitorTime,is_workday_work,is_weekend_work,active_monitor_ratio
0,ОРГ1-00004,2021-06-21,22859,51487,1.0,0.0,0.443976
1,ОРГ1-00004,2021-06-22,15461,84485,1.0,0.0,0.183003
2,ОРГ1-00004,2021-06-23,23349,90135,1.0,0.0,0.259045
3,ОРГ1-00004,2021-06-24,13019,85376,1.0,0.0,0.152490
4,ОРГ1-00004,2021-06-25,14982,82894,1.0,0.0,0.180737
...,...,...,...,...,...,...,...
186179,ОРГ2-08387,2021-12-24,25632,31646,1.0,0.0,0.809960
186180,ОРГ2-08387,2021-12-25,23265,33549,1.0,0.0,0.693463
186181,ОРГ2-08387,2021-12-26,7650,37742,0.0,1.0,0.202692
186182,ОРГ2-08387,2021-12-27,24212,34053,1.0,0.0,0.711009


In [26]:
def work_period_by_work(dates):
    """Return the number of whole days between the earliest and latest entry of `dates`.

    `dates` is a datetime Series (one group's `date` column when used in
    `groupby(...).agg`). A single-date input gives 0.
    """
    first_day = dates.min()
    last_day = dates.max()
    span = last_day - first_day
    return span.days

In [27]:
def days_since_last_work(dates, max_date=max_date):
    """Return the number of whole days from the latest entry of `dates` to `max_date`.

    The default for `max_date` is the value the module-level `max_date` has when
    this cell is executed; re-run this cell if that value changes.
    """
    most_recent = dates.max()
    gap = max_date - most_recent
    return gap.days

In [28]:
def aggregation(work):
    """Aggregate the per-(id, date) frame into one feature row per `id`.

    For each id this computes:
      * date: `work_period_by_work` and `days_since_last_work`;
      * activeTime, monitorTime, active_monitor_ratio: mean, median, min, max,
        count and std;
      * is_weekend_work, is_workday_work: sum and mean.

    The two-level column index produced by `agg` is flattened to
    '<column>_<statistic>' names; the key column stays 'id'.
    """
    summary_stats = ['mean', 'median', 'min', 'max', 'count', 'std']
    flag_stats = ['sum', 'mean']
    spec = {
        'date': [work_period_by_work, days_since_last_work],
        'activeTime': summary_stats,
        'monitorTime': summary_stats,
        'active_monitor_ratio': summary_stats,
        'is_weekend_work': flag_stats,
        'is_workday_work': flag_stats,
    }
    per_id = work.groupby('id', as_index=False).agg(spec)

    # Join the two column levels with '_'; the key column is ('id', ''), so the
    # trailing underscore left by the join is stripped.
    per_id.columns = ['_'.join(levels).strip('_') for levels in per_id.columns]
    return per_id

In [29]:
# Trailing windows: rows dated strictly after max_date minus 150 / 100 / 50 days.
cutoff_150 = max_date - timedelta(days=150)
cutoff_100 = max_date - timedelta(days=100)
cutoff_50 = max_date - timedelta(days=50)

work_last_150 = work[work['date'] > cutoff_150]
work_last_100 = work[work['date'] > cutoff_100]
work_last_50 = work[work['date'] > cutoff_50]

In [30]:
def _aggregate_with_suffix(frame, suffix):
    """Run `aggregation` on `frame` and append `suffix` to every column except 'id'."""
    features = aggregation(frame)
    features.columns = [
        name if name == 'id' else name + suffix
        for name in features.columns
    ]
    return features


# Same per-id feature set over the full history and each trailing window;
# the suffix tells the windows apart after merging.
grouped_work = _aggregate_with_suffix(work, '_total')
grouped_work_last_50 = _aggregate_with_suffix(work_last_50, '_last_50')
grouped_work_last_100 = _aggregate_with_suffix(work_last_100, '_last_100')
grouped_work_last_150 = _aggregate_with_suffix(work_last_150, '_last_150')

In [31]:
# Combine the full-history features with each window's features on 'id'.
# Left joins keep every id present in the full-history frame.
work_dfs_for_merge = [grouped_work, grouped_work_last_50,
                      grouped_work_last_100, grouped_work_last_150]

grouped_data = work_dfs_for_merge[0]
for window_features in work_dfs_for_merge[1:]:
    grouped_data = grouped_data.merge(window_features, on=['id'], how='left')

# Missing values (e.g. ids with no rows in a window, std of a single observation)
# become 0.
# NOTE(review): this also zero-fills date_days_since_last_work_last_N for ids
# absent from that window, which reads as "worked on the last day" - confirm
# this is intended.
grouped_data = grouped_data.fillna(0)

In [32]:
# Constant marker column: every id in this table has a value of 1.
grouped_data = grouped_data.assign(exists_in_work=1)

In [33]:
# Write the per-id feature table to disk without the row index.
prepared_work_path = '../data/prepared/grouped_work.csv'
grouped_data.to_csv(prepared_work_path, index=False)