In [1]:
import pandas as pd
import numpy as np
import tqdm

In [2]:
tr_party = pd.read_csv("../../train/train_party.csv")
te_party = pd.read_csv("../../test/test_party.csv")
tr_party.head()

Unnamed: 0,party_start_week,party_start_day,party_start_time,party_end_week,party_end_day,party_end_time,hashed
0,1,1,09:14:58.558,1,1,09:41:30.200,11fc85879e5ac9d5c83bfba10d73c4c84c154b9f4d9e1d...
1,3,3,11:05:05.176,3,3,13:07:42.515,7176c1516207692857535c30a4650b8e8e586af1fed0fd...
2,3,6,02:18:43.172,3,6,02:28:58.177,8092e194a750aae539862ed4405f67a6dd5b492e7e57e3...
3,4,1,09:22:01.936,4,1,09:47:40.192,4ec597c569b92bd0e1bae4e2a06e13b9657fb81795e194...
4,4,5,06:29:21.182,4,5,06:50:55.004,a4b6aea6cb58e43911e7cb7d6c0497197db7c4ed16e1c9...


# party 데이터 전처리
* 파티의 시작과 끝 시간으로부터 유지된 시간을 계산한다.
* hashed를 분리하여 각 acc_id의 해당 주 단위 party 정보를 계산한다.

In [3]:
# 시간은 분 단위로 설정한다.
tr_party['party_time'] = (
    ((tr_party['party_end_week'] - tr_party['party_start_week'])*7 + tr_party['party_end_day'] - tr_party['party_start_day'])*1440
    + (tr_party['party_end_time'].str.split(':', expand=True).astype('float')*[60, 1, 1/60]).sum(axis=1)
    - (tr_party['party_start_time'].str.split(':', expand=True).astype('float')*[60, 1, 1/60]).sum(axis=1)
)

te_party['party_time'] = (
    ((te_party['party_end_week'] - te_party['party_start_week'])*7 + te_party['party_end_day'] - te_party['party_start_day'])*1440
    + (te_party['party_end_time'].str.split(':', expand=True).astype('float')*[60, 1, 1/60]).sum(axis=1)
    - (te_party['party_start_time'].str.split(':', expand=True).astype('float')*[60, 1, 1/60]).sum(axis=1)
)

In [4]:
def untie_hash(df):
    dic = {'acc_id': [], 'wk': [], 'party_time': [], 'party_mem_count': []}
    gen_hash = (row for row in df['hashed'])
    gen_time = (row for row in df['party_time'])
    gen_wk = (row for row in df['party_start_week'])
    for _ in tqdm.trange(len(df)):
        ids = next(gen_hash).split(',')
        n = len(ids)
        dic['acc_id'].extend(ids)
        dic['wk'].extend([next(gen_wk)] * n)
        dic['party_time'].extend([next(gen_time)] * n)
        dic['party_mem_count'].extend([n] * n)
    dataframe = pd.DataFrame(dic).groupby(['acc_id', 'wk']).agg({
                                                                'party_time': ['sum', 'count'],
                                                                'party_mem_count': 'sum'
                                                            }).reset_index(drop=False)
    dataframe.columns = ['acc_id', 'wk', 'party_time', 'party_count', 'party_mem_count']
    return dataframe.sort_values(by=['acc_id', 'wk']).reset_index(drop=True)

In [5]:
tr_party_pre = untie_hash(tr_party)
te_party_pre = untie_hash(te_party)
tr_party_pre.head(15)

100%|████████████████████████████████████████████████████████████████████| 6962341/6962341 [00:32<00:00, 212429.35it/s]
100%|████████████████████████████████████████████████████████████████████| 4121512/4121512 [00:20<00:00, 200988.61it/s]


Unnamed: 0,acc_id,wk,party_time,party_count,party_mem_count
0,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,1,4104.470133,48,288
1,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,2,12131.481117,180,1122
2,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,3,6120.785067,139,846
3,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,4,3060.952683,158,948
4,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,5,11915.25735,151,917
5,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,6,6330.478183,159,954
6,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,7,5849.367767,125,756
7,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,8,7273.76275,89,539
8,0000264b01392acfde44f9d8494f112a701dc5d3e5fda6...,7,953.650267,335,1563
9,0000264b01392acfde44f9d8494f112a701dc5d3e5fda6...,8,1560.719133,442,2062


In [6]:
tr_party_pre['mean_party_time'] = tr_party_pre['party_time'] / tr_party_pre['party_count']
tr_party_pre['mean_party_mem'] = tr_party_pre['party_mem_count'] / tr_party_pre['party_count']
te_party_pre['mean_party_time'] = te_party_pre['party_time'] / te_party_pre['party_count']
te_party_pre['mean_party_mem'] = te_party_pre['party_mem_count'] / te_party_pre['party_count']
tr_party_pre.head()

Unnamed: 0,acc_id,wk,party_time,party_count,party_mem_count,mean_party_time,mean_party_mem
0,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,1,4104.470133,48,288,85.509794,6.0
1,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,2,12131.481117,180,1122,67.397117,6.233333
2,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,3,6120.785067,139,846,44.034425,6.086331
3,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,4,3060.952683,158,948,19.373118,6.0
4,000020b4fa2af1e40c813436e5054bce70b703d0039dcd...,5,11915.25735,151,917,78.908989,6.072848


# party 데이터 처리 완료

In [7]:
tr_party_pre.to_csv("train_party_pre.csv", index=False)
te_party_pre.to_csv("test_party_pre.csv", index=False)