# Party 관련 feature engineering

In [1]:
from tqdm import tqdm
from tqdm import trange
from itertools import chain
from scipy import stats
from datetime import timedelta 
import warnings
warnings.filterwarnings('ignore')
import pickle
import datetime as dt
import networkx as nx
import pickle

-----

# Train

In [2]:
# label = pd.read_csv("../Data/train_label.csv")
label = pd.read_csv("~/documents/chaser_data/train_label.csv")
# label = pd.read_csv("../data/train_label.csv")

In [3]:
%%time
# party = pd.read_csv("../data/new_train_party.csv", memory_map=True)
party = pd.read_csv("~/documents/chaser_data/train_party.csv", memory_map=True)

CPU times: user 28.1 s, sys: 3.28 s, total: 31.4 s
Wall time: 34.9 s


In [4]:
party.tail()

Unnamed: 0,party_start_week,party_start_day,party_start_time,party_end_week,party_end_day,party_end_time,hashed
6962336,8,6,08:09:30.086,8,6,08:29:51.324,c87c2fad141edf323f3787335b54be22945a02fe052448...
6962337,5,7,11:25:25.719,5,7,11:47:41.557,aafb40d212fe18ff4eafb82fdcf3b53f2161cb3ce59de4...
6962338,7,5,16:29:59.882,7,5,16:30:27.386,86022904c5cf72a54978479c94041f4256d6c3c2a1f71c...
6962339,7,6,23:43:52.265,7,6,23:47:50.285,02181a0c962f34f019bc9d5b582fb0ec79b1441f96aa4d...
6962340,6,5,23:07:31.761,6,5,23:11:01.968,967393e81d99ce8e577ee130b7ce8e4fd45e3e9cecb560...


In [5]:
party.rename(columns = {"hashed":"party_members_acc_id"}, inplace=True)
print(party.shape)
party.tail()

(6962341, 7)


Unnamed: 0,party_start_week,party_start_day,party_start_time,party_end_week,party_end_day,party_end_time,party_members_acc_id
6962336,8,6,08:09:30.086,8,6,08:29:51.324,c87c2fad141edf323f3787335b54be22945a02fe052448...
6962337,5,7,11:25:25.719,5,7,11:47:41.557,aafb40d212fe18ff4eafb82fdcf3b53f2161cb3ce59de4...
6962338,7,5,16:29:59.882,7,5,16:30:27.386,86022904c5cf72a54978479c94041f4256d6c3c2a1f71c...
6962339,7,6,23:43:52.265,7,6,23:47:50.285,02181a0c962f34f019bc9d5b582fb0ec79b1441f96aa4d...
6962340,6,5,23:07:31.761,6,5,23:11:01.968,967393e81d99ce8e577ee130b7ce8e4fd45e3e9cecb560...


## 1. party df에 column 추가

### 1.1 party 지속시간 계산
party 지속시간 구하기: make_duration(df)
- "duration_time" column: 초단위 시간
- "duration_days" column: 일단위 시간

In [6]:
def make_duration(df):
    """
    지속시간 생성 함수
    duration_time = 초단위
    party_duration_days = 일단위 
    - party_start_time, party_end_time 은 마이크로 세컨드 단위로 기록되어 있어서 
    'HH:MM:SS.FFF' 에서 'HH:MM:SS'로 슬라이스해야함.
    """
    df['duration_time'] = pd.to_datetime(df.party_end_time.apply(lambda x: x[:-4]), infer_datetime_format=True, format = '%H:%M:%S') - pd.to_datetime(df.party_start_time.apply(lambda x: x[:-4]), infer_datetime_format=True, format = '%H:%M:%S')
    df.duration_time[df.duration_time<timedelta(days=0)] = df.duration_time[df.duration_time<timedelta(days=0)] + timedelta(days=1)
    df['duration_days']=(df.party_end_week - df.party_start_week)*7 + (df.party_end_day - df.party_start_day) +1

In [7]:
%%time
make_duration(party)
print(party.shape)
party.tail()

(6962341, 9)
CPU times: user 34.7 s, sys: 754 ms, total: 35.4 s
Wall time: 35.5 s


### 1.2 party당 참여 인원 수
- make_party_member_count(df)

In [8]:
def make_party_member_count(df):
    """
    각 파티에 참여한 인원수
    """
    df['party_member_count'] = [len(party_list.split(',')) for party_list in tqdm(df['party_members_acc_id'])]

In [9]:
make_party_member_count(party)
print(party.shape)
party.tail()

100%|██████████| 6962341/6962341 [00:06<00:00, 1149161.00it/s]


(6962341, 10)


Unnamed: 0,party_start_week,party_start_day,party_start_time,party_end_week,party_end_day,party_end_time,party_members_acc_id,duration_time,duration_days,party_member_count
6962336,8,6,08:09:30.086,8,6,08:29:51.324,c87c2fad141edf323f3787335b54be22945a02fe052448...,00:20:21,1,6
6962337,5,7,11:25:25.719,5,7,11:47:41.557,aafb40d212fe18ff4eafb82fdcf3b53f2161cb3ce59de4...,00:22:16,1,6
6962338,7,5,16:29:59.882,7,5,16:30:27.386,86022904c5cf72a54978479c94041f4256d6c3c2a1f71c...,00:00:28,1,3
6962339,7,6,23:43:52.265,7,6,23:47:50.285,02181a0c962f34f019bc9d5b582fb0ec79b1441f96aa4d...,00:03:58,1,5
6962340,6,5,23:07:31.761,6,5,23:11:01.968,967393e81d99ce8e577ee130b7ce8e4fd45e3e9cecb560...,00:03:30,1,3


## 2. party 관련 기본 변수

### 2.1 total party member count 변수 생성
- 유저가 참여한 모든 파티의 파티 인원의 총합을 구함.

In [10]:
%%time
party_member_lists = [party['party_members_acc_id'][i].split(',') for i in trange(len(party['party_members_acc_id']))]
party_member_1D_lists = list(chain.from_iterable(party_member_lists))
member_id_value_count = pd.Series(party_member_1D_lists).value_counts()

100%|██████████| 6962341/6962341 [02:45<00:00, 41956.49it/s]


CPU times: user 3min, sys: 4.1 s, total: 3min 4s
Wall time: 3min 4s


In [11]:
%%time 
increased_party_TMC = [[party['party_member_count'][i]]*party['party_member_count'][i] for i in trange(len(party))]
flat_increased_party_TMC = list(chain.from_iterable(increased_party_TMC))

100%|██████████| 6962341/6962341 [04:45<00:00, 24409.13it/s]


CPU times: user 4min 43s, sys: 2.88 s, total: 4min 46s
Wall time: 4min 46s


In [12]:
%%time
all_id_and_party_TMC_df = pd.concat([pd.Series(party_member_1D_lists), pd.Series(flat_increased_party_TMC)],axis=1)
all_id_and_party_TMC_df.columns = ['acc_id','party_TMC']

member_party_TMC = all_id_and_party_TMC_df.groupby('acc_id')['party_TMC'].sum()
party_TMC_df = pd.DataFrame(member_party_TMC).reset_index()
party_TMC_df.columns = ['acc_id','party_total_member_count']

label = pd.merge(label, party_TMC_df, how='left', on='acc_id')
label['party_total_member_count'].fillna(0, inplace=True)

CPU times: user 24 s, sys: 3.44 s, total: 27.4 s
Wall time: 27.7 s


In [13]:
label.tail()

Unnamed: 0,acc_id,label,party_total_member_count
99995,da6d33b03968d8e35821f6eb88ad22e12e37aa8867084e...,retained,1625.0
99996,676c944f4b6ae63818b3cad824a61233690f16a2275d5d...,retained,0.0
99997,695e1f28e234fc4cc53085e332fa7a76d7895ca4cc745b...,retained,3032.0
99998,0c87fabaad5542e533f958a1d6fd739993b94e95e00989...,retained,0.0
99999,47ff575cb94019df5695c5d81ec285b0d801607b2a8697...,retained,2299.0


### 2.2 Party total retained minute 변수 생성
- 파티 시작 시간과 파티 종료 시간으로 유저가 참여한 모든 파티의 지속시간의 총합을 구한다.

In [14]:
%%time 

party['party_start_time'] = [i.split('.')[0] for i in list(party['party_start_time'])]
party['party_end_time'] = [i.split('.')[0] for i in list(party['party_end_time'])]
party['party_start_time'] = pd.to_datetime(party['party_start_time'], format='%H:%M:%S')
party['party_end_time'] = pd.to_datetime(party['party_end_time'], format='%H:%M:%S')

party['retained_week'] = party['party_end_week']-party['party_start_week']
party['retained_day'] = party['party_end_day']-party['party_start_day']
party['retained_time'] = (party['party_end_time'] - party['party_start_time'])
party['retained_time'][party['retained_time'] < dt.timedelta(days=0)] = \
party['retained_time'][party['retained_time'] < dt.timedelta(days=0)]*(-1)

CPU times: user 43.7 s, sys: 11.5 s, total: 55.3 s
Wall time: 56 s


In [15]:
%%time

retained_second=[t.total_seconds() for t in tqdm(party['retained_time'])]
party['total_retained_day'] = party['retained_week']*7+party['retained_day']
party['total_retained_second'] = party['total_retained_day']*(24*60*60)+retained_second

100%|██████████| 6962341/6962341 [00:50<00:00, 138209.90it/s]


CPU times: user 48.6 s, sys: 1.8 s, total: 50.4 s
Wall time: 50.9 s


In [16]:
%%time
increased_party_TRS = [[party['total_retained_second'][i]]*party['party_member_count'][i] for i in trange(len(party))]

100%|██████████| 6962341/6962341 [05:17<00:00, 21943.19it/s]

CPU times: user 5min 1s, sys: 12.3 s, total: 5min 14s
Wall time: 5min 17s





In [18]:
%%time 

flat_increased_party_TRS = list(chain.from_iterable(increased_party_TRS))
all_id_and_party_TRS_df = pd.concat([pd.Series(party_member_1D_lists), 
                                     pd.Series(flat_increased_party_TRS)],axis=1)
all_id_and_party_TRS_df.columns = ['acc_id','party_TRS']

member_party_TRS_frist = all_id_and_party_TRS_df.groupby('acc_id')['party_TRS'].sum()
party_TRS_frist_df = pd.DataFrame(member_party_TRS_frist).reset_index()
party_TRS_frist_df.columns = ['acc_id','party_total_retained_second']
label = pd.merge(label, party_TRS_frist_df, how='left', on='acc_id')
label['party_total_retained_second'].fillna(0, inplace=True)
label['party_total_retained_minute']=round(label['party_total_retained_second']/60,1)
label.drop(columns='party_total_retained_second', inplace=True)

CPU times: user 13.4 s, sys: 1.18 s, total: 14.6 s
Wall time: 14.8 s


In [25]:
# 메모리 부족해서 위에서 만든 리스트, 데이터프레임 리셋하는 코드
# %reset_selective -f increase_party_TMC
# %reset_selective -f flat_increased_party_TMC
# %reset_selective -f acc_id_and_party_TMC_df
# %reset_selective -f member_party_TMC_df
# %reset_selective -f party_TMC_df
# %reset_selective -f member_id_value_count
# %reset_selective -f party_member_1D_lists
# %reset_selective -f party_member_lists
# %reset_selective -f member_party_TMC 
# %reset_selective -f increased_party_TMC 

### 2.3 party members acc id 전체를 리스트로 만들기

In [20]:
def get_party_ids(df):
    party_id = df["party_members_acc_id"].tolist()
    party_id = [x.split(',') for x in party_id]
    party_id = [item for sublist in party_id for item in sublist]
    return party_id

In [21]:
%%time
party_id_ls = get_party_ids(party)

print(len(party_id_ls))
print(len(list(set(party_id_ls))))

34284282
268148
CPU times: user 28.1 s, sys: 23.2 s, total: 51.4 s
Wall time: 54.4 s


### 2.4 party start week/day & end week/day의 first, mode, last 변수
- first/mode/last_party_start_week : 유저가 참여한 파티가 최초,최후에 생성된 주차, 가장 자주 파티가 생성된 주차
- first/mode/last_party_end_week : 유저가 참여한 파티가 최초,최후에 종료된 주차, 가장 자주 파티가 종료된 주차

In [25]:
def make_all_ID_and_column_df(df, column):
    print('start making all ID & {} df'.format(column))
    increased_column = [[df[column][i]] * df['party_member_count'][i] for i in trange(len(df))]
    increased_column_ls = list(chain.from_iterable(increased_column))
    
    all_ID_and_column_df = pd.concat([pd.Series(get_party_ids(df)), 
                                      pd.Series(increased_column_ls)],axis=1)
    all_ID_and_column_df.columns = ['acc_id',column]
    print('end of making all ID & {} df'.format(column))
    return all_ID_and_column_df


def make_first_mode_last_df_and_merge_with_label(df, column, label):
    all_ID_and_column_df = make_all_ID_and_column_df(df, column)
    
    print('start making {} first & mode & last df'.format(column))
    print('working first_df...')
    first_df = all_ID_and_column_df.groupby('acc_id')[column].min()
    first_df = pd.DataFrame(first_df).reset_index()
    first_df.columns = ['acc_id','first_'+column]
    label = pd.merge(label, first_df, how='left', on='acc_id')
    label['first_'+column].fillna(0, inplace=True)
    
    print('working mode_df...')
    mode_df = all_ID_and_column_df.groupby('acc_id')[column].agg(lambda x: stats.mode(x)[0][0])
    mode_df = pd.DataFrame(mode_df).reset_index()
    mode_df.columns = ['acc_id','mode_'+column]
    label = pd.merge(label, mode_df, how='left', on='acc_id')
    label['mode_'+column].fillna(0, inplace=True)
    
    print('working last_df...')
    last_df = all_ID_and_column_df.groupby('acc_id')[column].max()
    last_df = pd.DataFrame(last_df).reset_index()
    last_df.columns = ['acc_id','last_'+column]
    label = pd.merge(label, last_df, how='left', on='acc_id')
    label['last_'+column].fillna(0, inplace=True)
    
    print('end of making {} first & mode & last df'.format(column))
    
    return label

In [27]:
%time label = make_first_mode_last_df_and_merge_with_label(party, 'party_start_week', label)

start making all ID & party_start_week df


100%|██████████| 6962341/6962341 [04:44<00:00, 24477.78it/s]


end of making all ID & party_start_week df
start making party_start_week first & mode & last df
working first_df...
working mode_df...
working last_df...
end of making party_start_week first & mode & last df
Wall time: 7min


In [28]:
%time label = make_first_mode_last_df_and_merge_with_label(party, 'party_start_day', label)

start making all ID & party_start_day df


100%|██████████| 6962341/6962341 [04:55<00:00, 23594.90it/s]


end of making all ID & party_start_day df
start making party_start_day first & mode & last df
working first_df...
working mode_df...
working last_df...
end of making party_start_day first & mode & last df
Wall time: 7min 2s


In [29]:
%time label = make_first_mode_last_df_and_merge_with_label(party, 'party_end_week', label)

start making all ID & party_end_week df


100%|██████████| 6962341/6962341 [04:50<00:00, 23931.16it/s]


end of making all ID & party_end_week df
start making party_end_week first & mode & last df
working first_df...
working mode_df...
working last_df...
end of making party_end_week first & mode & last df
Wall time: 6min 55s


In [30]:
%time label = make_first_mode_last_df_and_merge_with_label(party, 'party_end_day', label)

start making all ID & party_end_day df


100%|██████████| 6962341/6962341 [04:53<00:00, 23757.94it/s]


end of making all ID & party_end_day df
start making party_end_day first & mode & last df
working first_df...
working mode_df...
working last_df...
end of making party_end_day first & mode & last df
Wall time: 6min 59s


In [31]:
label.tail()

Unnamed: 0,acc_id,label,party_total_member_count,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,mode_party_end_week,last_party_end_week,first_party_end_day,mode_party_end_day,last_party_end_day
99995,da6d33b03968d8e35821f6eb88ad22e12e37aa8867084e...,retained,1625.0,1.0,7.0,8.0,1.0,5.0,7.0,1.0,7.0,8.0,1.0,5.0,7.0
99996,676c944f4b6ae63818b3cad824a61233690f16a2275d5d...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,695e1f28e234fc4cc53085e332fa7a76d7895ca4cc745b...,retained,3032.0,1.0,3.0,8.0,1.0,1.0,7.0,1.0,3.0,8.0,1.0,1.0,7.0
99998,0c87fabaad5542e533f958a1d6fd739993b94e95e00989...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99999,47ff575cb94019df5695c5d81ec285b0d801607b2a8697...,retained,2299.0,1.0,8.0,8.0,1.0,5.0,7.0,1.0,8.0,9.0,1.0,5.0,7.0


## 3. 10분 이상 party만 남도록 data filtering
- 10분 미만으로 지속된 파티는 사실상 제 기능을 하지 못한 파티라고 가정하고 전체 데이터에서 10분 이상 지속된 파티만을 필터링하여 feature engineering을 진행함
- 별도의 언급이 없다면 10분 이상의 파티를 이하 '파티'라고 명명

In [9]:
def time_filter(df):  
    """
    지속시간이 10 분 이상인 필터링 함수
    """
    ten = timedelta(minutes = 10)
    return df[(df['duration_days'] >= 3) | (df['duration_time'] >= ten)]

In [34]:
%%time
filtered_party = time_filter(party)
print(len(filtered_party))

3355480
Wall time: 9.94 s


## 4. party 참여 횟수 변수 생성
- 8주간의 모든 파티 참여 횟수의 총합, week별 파티 참여 횟수의 총계 등 총 9개 변수 생성

### 4.1 party members acc id 전체를 리스트로 만들기

In [35]:
party_id_ls = get_party_ids(filtered_party)

### 4.2 party_cnt(파티 참여횟수) 변수 생성

In [10]:
def get_party_cnt(ls, merging_df):
    df_party_id = pd.DataFrame(ls, columns=["acc_id"])
    df_party_id = df_party_id.groupby('acc_id').size().reset_index(name='party_cnt')
    party_df = pd.merge(merging_df, df_party_id, how='left')
    party_df["party_cnt"].fillna(0, inplace=True)
    return party_df    

In [37]:
%%time
party_1 = get_party_cnt(party_id_ls, label)

Wall time: 9.97 s


In [38]:
party_1.tail()

Unnamed: 0,acc_id,label,party_total_member_count,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,mode_party_end_week,last_party_end_week,first_party_end_day,mode_party_end_day,last_party_end_day,party_cnt
99995,da6d33b03968d8e35821f6eb88ad22e12e37aa8867084e...,retained,1625.0,1.0,7.0,8.0,1.0,5.0,7.0,1.0,7.0,8.0,1.0,5.0,7.0,158.0
99996,676c944f4b6ae63818b3cad824a61233690f16a2275d5d...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,695e1f28e234fc4cc53085e332fa7a76d7895ca4cc745b...,retained,3032.0,1.0,3.0,8.0,1.0,1.0,7.0,1.0,3.0,8.0,1.0,1.0,7.0,318.0
99998,0c87fabaad5542e533f958a1d6fd739993b94e95e00989...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99999,47ff575cb94019df5695c5d81ec285b0d801607b2a8697...,retained,2299.0,1.0,8.0,8.0,1.0,5.0,7.0,1.0,8.0,9.0,1.0,5.0,7.0,212.0


In [39]:
# label 대신 party_1 사용
# %reset_selective -f label

### 4.3 week별 party_cnt
각 활동주 별 파티에 참여한 횟수의 합계.

#### (1) 파티 지속 기간(week) 확인

In [41]:
filtered_party["party_duration_week"] = filtered_party["party_end_week"] - filtered_party["party_start_week"]

In [42]:
filtered_party.groupby("party_duration_week").size().reset_index()

Unnamed: 0,party_duration_week,0
0,0,3325297
1,1,30183


#### (2) 다음 주로 넘어가는 경우 끝나는 요일 확인
- 모두 1일에 끝나므로 시작 week만으로 count하기로 함

In [43]:
dur_1w = filtered_party[filtered_party["party_duration_week"]==1]
dur_1w.groupby(dur_1w["party_end_day"]).size().reset_index()

Unnamed: 0,party_end_day,0
0,1,30183


#### (3) week별 party_cnt 구하기

In [11]:
# week별 id당 party cnt를 계산한 df를 만드는 함수
def week_cnt(week, merging_df, df = party):
    party = df[df["party_start_week"] == week]
    party_id = party["party_members_acc_id"].tolist()
    party_id = [x.split(',') for x in party_id]
    party_id = [item for sublist in party_id for item in sublist]
    print("week {} party id: {}".format(week, len(party_id)))
    party_id_df = pd.DataFrame(party_id, columns=["acc_id"])
    party_id_df = party_id_df.groupby('acc_id').size().reset_index(name = "party_cnt_w"+str(week))
    merged_df = pd.merge(merging_df, party_id_df, how='left')
    merged_df.fillna(0, inplace=True)
    return merged_df

In [45]:
%%time
for i in trange(1,9):
    party_1 = week_cnt(i, party_1, df = filtered_party)

  0%|          | 0/8 [00:00<?, ?it/s]

week 1 party id: 1803242


 12%|█▎        | 1/8 [00:02<00:20,  2.92s/it]

week 2 party id: 1964602


 25%|██▌       | 2/8 [00:05<00:17,  2.85s/it]

week 3 party id: 1970979


 38%|███▊      | 3/8 [00:08<00:14,  2.86s/it]

week 4 party id: 2128263


 50%|█████     | 4/8 [00:11<00:11,  2.88s/it]

week 5 party id: 2561052


 62%|██████▎   | 5/8 [00:15<00:09,  3.04s/it]

week 6 party id: 2540356


 75%|███████▌  | 6/8 [00:18<00:06,  3.13s/it]

week 7 party id: 4046387


 88%|████████▊ | 7/8 [00:23<00:03,  3.40s/it]

week 8 party id: 3766526


100%|██████████| 8/8 [00:29<00:00,  3.64s/it]


Wall time: 29.1 s


In [46]:
party_1.tail()

Unnamed: 0,acc_id,label,party_total_member_count,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,...,last_party_end_day,party_cnt,party_cnt_w1,party_cnt_w2,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8
99995,da6d33b03968d8e35821f6eb88ad22e12e37aa8867084e...,retained,1625.0,1.0,7.0,8.0,1.0,5.0,7.0,1.0,...,7.0,158.0,9.0,8.0,9.0,30.0,32.0,7.0,29.0,34.0
99996,676c944f4b6ae63818b3cad824a61233690f16a2275d5d...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,695e1f28e234fc4cc53085e332fa7a76d7895ca4cc745b...,retained,3032.0,1.0,3.0,8.0,1.0,1.0,7.0,1.0,...,7.0,318.0,39.0,32.0,56.0,57.0,34.0,23.0,40.0,37.0
99998,0c87fabaad5542e533f958a1d6fd739993b94e95e00989...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99999,47ff575cb94019df5695c5d81ec285b0d801607b2a8697...,retained,2299.0,1.0,8.0,8.0,1.0,5.0,7.0,1.0,...,7.0,212.0,30.0,22.0,10.0,3.0,27.0,38.0,36.0,46.0


## 5. party count 7-8주와 6-8주의 전체 중 비중 변수 생성
8주 동안의 파티 참여 횟수 중 7-8주와 6-8주의 파티참여 횟수가 차지하는 비율.

In [47]:
party_1["party_78_ratio"] = party_1.loc[:,"party_cnt_w7":"party_cnt_w8"].sum(axis=1) / party_1["party_cnt"]
party_1.fillna(0, inplace=True)
party_1.tail()

Unnamed: 0,acc_id,label,party_total_member_count,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,...,party_cnt,party_cnt_w1,party_cnt_w2,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8,party_78_ratio
99995,da6d33b03968d8e35821f6eb88ad22e12e37aa8867084e...,retained,1625.0,1.0,7.0,8.0,1.0,5.0,7.0,1.0,...,158.0,9.0,8.0,9.0,30.0,32.0,7.0,29.0,34.0,0.398734
99996,676c944f4b6ae63818b3cad824a61233690f16a2275d5d...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,695e1f28e234fc4cc53085e332fa7a76d7895ca4cc745b...,retained,3032.0,1.0,3.0,8.0,1.0,1.0,7.0,1.0,...,318.0,39.0,32.0,56.0,57.0,34.0,23.0,40.0,37.0,0.242138
99998,0c87fabaad5542e533f958a1d6fd739993b94e95e00989...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99999,47ff575cb94019df5695c5d81ec285b0d801607b2a8697...,retained,2299.0,1.0,8.0,8.0,1.0,5.0,7.0,1.0,...,212.0,30.0,22.0,10.0,3.0,27.0,38.0,36.0,46.0,0.386792


In [48]:
party_1["party_678_ratio"] = party_1.loc[:,"party_cnt_w6":"party_cnt_w8"].sum(axis=1) / party_1["party_cnt"]
party_1.fillna(0, inplace=True)
party_1.tail()

Unnamed: 0,acc_id,label,party_total_member_count,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,...,party_cnt_w1,party_cnt_w2,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8,party_78_ratio,party_678_ratio
99995,da6d33b03968d8e35821f6eb88ad22e12e37aa8867084e...,retained,1625.0,1.0,7.0,8.0,1.0,5.0,7.0,1.0,...,9.0,8.0,9.0,30.0,32.0,7.0,29.0,34.0,0.398734,0.443038
99996,676c944f4b6ae63818b3cad824a61233690f16a2275d5d...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,695e1f28e234fc4cc53085e332fa7a76d7895ca4cc745b...,retained,3032.0,1.0,3.0,8.0,1.0,1.0,7.0,1.0,...,39.0,32.0,56.0,57.0,34.0,23.0,40.0,37.0,0.242138,0.314465
99998,0c87fabaad5542e533f958a1d6fd739993b94e95e00989...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99999,47ff575cb94019df5695c5d81ec285b0d801607b2a8697...,retained,2299.0,1.0,8.0,8.0,1.0,5.0,7.0,1.0,...,30.0,22.0,10.0,3.0,27.0,38.0,36.0,46.0,0.386792,0.566038


## 6. week별 party count의 표준편차 변수 생성
유저가 활동주 별로 파티에 참여한 횟수의 표준편차 

In [49]:
party_1["party_cnt_std"] = party_1.loc[:,"party_cnt_w1":"party_cnt_w8"].std(axis=1)
party_1.fillna(0, inplace=True)
party_1.tail()

Unnamed: 0,acc_id,label,party_total_member_count,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,...,party_cnt_w2,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8,party_78_ratio,party_678_ratio,party_cnt_std
99995,da6d33b03968d8e35821f6eb88ad22e12e37aa8867084e...,retained,1625.0,1.0,7.0,8.0,1.0,5.0,7.0,1.0,...,8.0,9.0,30.0,32.0,7.0,29.0,34.0,0.398734,0.443038,12.395276
99996,676c944f4b6ae63818b3cad824a61233690f16a2275d5d...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,695e1f28e234fc4cc53085e332fa7a76d7895ca4cc745b...,retained,3032.0,1.0,3.0,8.0,1.0,1.0,7.0,1.0,...,32.0,56.0,57.0,34.0,23.0,40.0,37.0,0.242138,0.314465,11.609725
99998,0c87fabaad5542e533f958a1d6fd739993b94e95e00989...,retained,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99999,47ff575cb94019df5695c5d81ec285b0d801607b2a8697...,retained,2299.0,1.0,8.0,8.0,1.0,5.0,7.0,1.0,...,22.0,10.0,3.0,27.0,38.0,36.0,46.0,0.386792,0.566038,14.442002


## 7. 전체 파티 참여횟수 중 10분 이내 짧은 파티 참여의 비율 변수 생성
유저가 8주동안 참여한 모든 파티 횟수의 총합에서 10분 이내에 종료된 파티에 참여했던 횟수가 차지하는 비중

In [50]:
def time_filter_short(df):
    """
    지속시간이 10분 미만인 필터링
    """
    ten = timedelta(minutes = 10)
    return df[(df['duration_days'] < 3) & (df['duration_time'] < ten)]

def get_ratio(df, label):
    
    totalcnt = get_party_cnt(get_party_ids(df),label)
    totalcnt.rename(columns={'party_cnt':'totalcnt'}, inplace=True)
    
    shortcnt = get_party_cnt(get_party_ids(time_filter_short(df)), label)
    shortcnt.rename(columns={'party_cnt':'shortcnt'}, inplace=True)
    
    shortcnt['shortparty_ratio'] = round(shortcnt['shortcnt']/totalcnt['totalcnt'], 4)
    shortcnt['shortparty_ratio'].fillna(value=0, inplace=True)
    
    return shortcnt[['acc_id','shortparty_ratio']]

In [52]:
party_1 = pd.merge(party_1, get_ratio(party, party_1[['acc_id']]), how='left')

In [53]:
party_1.head()

Unnamed: 0,acc_id,label,party_total_member_count,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,...,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8,party_78_ratio,party_678_ratio,party_cnt_std,shortparty_ratio
0,b8fbf3f6a70e3f36843bffc70c18ff51a0d755a87616ec...,week,1175.0,8.0,8.0,8.0,5.0,7.0,7.0,8.0,...,0.0,0.0,0.0,0.0,0.0,11.0,1.0,1.0,3.889087,0.9495
1,ed500c4957956b3e99dc3985666850b582f812405eefb6...,week,639.0,5.0,5.0,8.0,1.0,2.0,7.0,5.0,...,0.0,0.0,4.0,0.0,17.0,35.0,0.928571,0.928571,12.75035,0.7241
2,acc6afa23a6bf15e18151e4794c7789225ef9d682f473c...,week,695.0,8.0,8.0,8.0,5.0,6.0,7.0,8.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.707107,0.9873
3,34095a3c9a2937ced3ea3fd75e22ce177dc5879d2a53f7...,week,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26f3db6e8817a93c4ceda9a16f0832945e43d950b95882...,week,237.0,6.0,8.0,8.0,1.0,4.0,7.0,6.0,...,0.0,0.0,0.0,0.0,0.0,33.0,1.0,1.0,11.667262,0.4211


In [54]:
party_1.columns

Index(['acc_id', 'label', 'party_total_member_count', 'first_party_start_week',
       'mode_party_start_week', 'last_party_start_week',
       'first_party_start_day', 'mode_party_start_day', 'last_party_start_day',
       'first_party_end_week', 'mode_party_end_week', 'last_party_end_week',
       'first_party_end_day', 'mode_party_end_day', 'last_party_end_day',
       'party_cnt', 'party_cnt_w1', 'party_cnt_w2', 'party_cnt_w3',
       'party_cnt_w4', 'party_cnt_w5', 'party_cnt_w6', 'party_cnt_w7',
       'party_cnt_w8', 'party_78_ratio', 'party_678_ratio', 'party_cnt_std',
       'shortparty_ratio'],
      dtype='object')

## 8. party network에서의 degree centrality 변수 생성
유저의 파티 네트워크 중심성 생성

### 8.1 생성해둔 party network 불러오기 

In [5]:
# G = nx.read_gpickle("train_party_network.gpickle")

node의 수가 2명 이상짜리 파티에 참여한 unique id 수와 동일함

In [14]:
# len(G.nodes())

223613

### 8.2 degree centrality 구하기 

In [23]:
# degree_centrality = nx.degree_centrality(G)
# type(degree_centrality)

dict

In [24]:
# centrality = pd.DataFrame(columns=["acc_id","degree_cent"])

In [25]:
# centrality["acc_id"] = degree_centrality.keys()

In [26]:
# centrality["degree_cent"] = degree_centrality.values()

변수의 단위가 너무 작아 100배씩 scaling함

In [36]:
# centrality["degree_cent"] = centrality["degree_cent"]*100

In [38]:
# party_1 = pd.merge(party_1, centrality, how='left').fillna(0, inplace = True)

## 9. 고정파티 최대 횟수 변수 생성

유저가 한 명 이상의 특정 유저와 반복해서 파티에 참여한 횟수의 최대값을 생성

In [4]:
len(filtered_party)

3355480

### 9.1 10분 이상 지속한 party에 참여한 acc_id 구하기

#### (1) party members acc id 전체를 리스트로 만들기

In [7]:
%%time
party_ids = get_party_ids(filtered_party)

CPU times: user 8.48 s, sys: 853 ms, total: 9.33 s
Wall time: 9.34 s


#### (2) party에 참여한 id 수

In [8]:
party_unique_ids = list(set(party_ids))

In [9]:
print("party에 참여한 id 수(중복카운트):", len(party_ids))
print(len(list(set(party_ids))))
print("party에 참여한 id 수(중복 없음):", len(party_unique_ids))

party에 참여한 id 수(중복카운트): 20781407
223621
party에 참여한 id 수(중복 없음): 223621


In [10]:
label_id = label["acc_id"].tolist()
len(label_id)

100000

#### (3) train data 유저 중 filtered party에 참여한 사람 수

In [11]:
label_party_id = list(set(label_id) & set(party_unique_ids))
len(label_party_id)

51745

### 9.2 고정파티 최대 횟수 구하기

#### (1) get_fix_party(): 한 유저가 특정 유저와 반복해서 party에 참여한 최대 횟수

In [12]:
def get_fix_party(base_id):
    '''
    기준 아이디와 타 유저의 고정 파티 횟수 중 최댓값을 찾는 함수
    input: base_id - party에 참여한 유저의 acc_id
    output: key가 "acc_id", "fix_party_max"인 dictionary
        - "acc_id": 유저의 acc_id
        - "fix_party_max": 타 유저와 반복해서 같은 party에 참여한 횟수 중 최댓값 
    '''
    # 기준 id가 참여한 party member 리스트 뽑기 (party_id는 이중리스트 형태)
    with_members = list(filter(lambda a: base_id in a, party_id))
    
    # 이중 리스트인 with_members를 flat list로 풀어주기
    with_members = [item for sublist in with_members for item in sublist]
    
    # 기준id 리스트에서 빼기
    with_members = list(filter(lambda a: a != base_id, with_members))
    
    # 함께한 횟수 df로 구하기
    df_party_id = pd.DataFrame(with_members, columns=["acc_id"])
    df_party_id = df_party_id.groupby('acc_id').size().reset_index(name='party_cnt')
    
    return {"acc_id": base_id,
            "fix_party_max": df_party_id["party_cnt"].max()}

In [13]:
# party member 전체 리스트 미리 받기
party_id = filtered_party["party_members_acc_id"].tolist()
party_id = [x.split(',') for x in party_id]

In [22]:
get_fix_party(label_party_id[0])

{'acc_id': 'cc3be3c6517858a3dbdc1702a374c23a3ebb60b096df8e835acde0f8b77e11f1',
 'fix_party_max': 166}

#### (2) label_party_id에 대하여 고정파티 최대 횟수 구하기

In [26]:
fix_party = pd.DataFrame(columns=["acc_id", "fix_party_max"])

In [27]:
for i in tqdm(range(10000)):
    fix_party.loc[len(fix_party)] = get_fix_party(label_party_id[i])

100%|██████████| 10000/10000 [2:27:36<00:00,  1.13it/s]


In [7]:
for i in tqdm(range(10000,20000)):
    fix_party.loc[len(fix_party)] = get_fix_party(label_party_id[i])

100%|██████████| 10000/10000 [2:25:27<00:00,  1.15it/s]


In [7]:
for i in tqdm(range(20000,30000)):
    fix_party.loc[len(fix_party)] = get_fix_party(label_party_id[i])

100%|██████████| 10000/10000 [2:25:44<00:00,  1.14it/s]


In [37]:
for i in tqdm(range(30000,40000)):
    fix_party.loc[len(fix_party)] = get_fix_party(label_party_id[i])

100%|██████████| 10000/10000 [2:46:31<00:00,  1.00it/s]


In [11]:
for i in tqdm(range(40000,len(label_party_id))):
    fix_party.loc[len(fix_party)] = get_fix_party(label_party_id[i])

100%|██████████| 11745/11745 [3:06:29<00:00,  1.05it/s]


In [42]:
len(fix_party)

51745

In [None]:
party_1 = party_1.merge(fix_party, how = "left").fillna(0)

## 10. 최종 party 변수 저장하기

In [55]:
pickle.dump(party_1,open('../data/merged_train_party.pkl','wb'))

---

# Test

In [23]:
activity = pd.read_csv('../data/test_activity.csv')
label = pd.DataFrame(list(activity['acc_id'].unique()))
label.columns = ['acc_id']

In [24]:
%%time
party = pd.read_csv("../data/new_test_party.csv", memory_map=True)

CPU times: user 18.3 s, sys: 3.37 s, total: 21.7 s
Wall time: 27.3 s


In [25]:
party.tail()

Unnamed: 0,party_start_week,party_start_day,party_start_time,party_end_week,party_end_day,party_end_time,hashed
4121507,2,2,11:14:12.477,2,2,11:34:35.997,d4843247de5b8a0f34d04b418b55bbde84fe7d31dc2192...
4121508,7,1,04:32:18.515,7,1,05:03:05.828,52f4a6a555803e8b239e8b69288d4787d39dd40c2a126e...
4121509,8,6,08:09:30.086,8,6,08:29:51.324,c87c2fad141edf323f3787335b54be22945a02fe052448...
4121510,7,6,23:43:52.265,7,6,23:47:50.285,02181a0c962f34f019bc9d5b582fb0ec79b1441f96aa4d...
4121511,6,7,06:25:13.121,6,7,06:39:47.036,a46cce8d986daaca5492e28cae8979c8601f1fe4ba898b...


In [26]:
party.rename(columns = {"hashed":"party_members_acc_id"}, inplace=True)
print(party.shape)
party.tail()

(4121512, 7)


Unnamed: 0,party_start_week,party_start_day,party_start_time,party_end_week,party_end_day,party_end_time,party_members_acc_id
4121507,2,2,11:14:12.477,2,2,11:34:35.997,d4843247de5b8a0f34d04b418b55bbde84fe7d31dc2192...
4121508,7,1,04:32:18.515,7,1,05:03:05.828,52f4a6a555803e8b239e8b69288d4787d39dd40c2a126e...
4121509,8,6,08:09:30.086,8,6,08:29:51.324,c87c2fad141edf323f3787335b54be22945a02fe052448...
4121510,7,6,23:43:52.265,7,6,23:47:50.285,02181a0c962f34f019bc9d5b582fb0ec79b1441f96aa4d...
4121511,6,7,06:25:13.121,6,7,06:39:47.036,a46cce8d986daaca5492e28cae8979c8601f1fe4ba898b...


## 1. party df에 column 추가

### 1.1 party 지속시간 계산
party 지속시간 구하기: make_duration(df)
- "duration_time" column: 초단위 시간
- "duration_days" column: 일단위 시간

In [27]:
%%time
make_duration(party)
print(party.shape)

(4121512, 9)
CPU times: user 25.3 s, sys: 3.21 s, total: 28.5 s
Wall time: 29.4 s


In [28]:
party.tail()

Unnamed: 0,party_start_week,party_start_day,party_start_time,party_end_week,party_end_day,party_end_time,party_members_acc_id,duration_time,duration_days
4121507,2,2,11:14:12.477,2,2,11:34:35.997,d4843247de5b8a0f34d04b418b55bbde84fe7d31dc2192...,00:20:23,1
4121508,7,1,04:32:18.515,7,1,05:03:05.828,52f4a6a555803e8b239e8b69288d4787d39dd40c2a126e...,00:30:47,1
4121509,8,6,08:09:30.086,8,6,08:29:51.324,c87c2fad141edf323f3787335b54be22945a02fe052448...,00:20:21,1
4121510,7,6,23:43:52.265,7,6,23:47:50.285,02181a0c962f34f019bc9d5b582fb0ec79b1441f96aa4d...,00:03:58,1
4121511,6,7,06:25:13.121,6,7,06:39:47.036,a46cce8d986daaca5492e28cae8979c8601f1fe4ba898b...,00:14:34,1


### 1.2 party당 참여 인원 수
- make_party_member_count(df)

In [29]:
make_party_member_count(party)
print(party.shape)
party.tail()

100%|██████████| 4121512/4121512 [00:03<00:00, 1038007.57it/s]


(4121512, 10)


Unnamed: 0,party_start_week,party_start_day,party_start_time,party_end_week,party_end_day,party_end_time,party_members_acc_id,duration_time,duration_days,party_member_count
4121507,2,2,11:14:12.477,2,2,11:34:35.997,d4843247de5b8a0f34d04b418b55bbde84fe7d31dc2192...,00:20:23,1,6
4121508,7,1,04:32:18.515,7,1,05:03:05.828,52f4a6a555803e8b239e8b69288d4787d39dd40c2a126e...,00:30:47,1,5
4121509,8,6,08:09:30.086,8,6,08:29:51.324,c87c2fad141edf323f3787335b54be22945a02fe052448...,00:20:21,1,6
4121510,7,6,23:43:52.265,7,6,23:47:50.285,02181a0c962f34f019bc9d5b582fb0ec79b1441f96aa4d...,00:03:58,1,5
4121511,6,7,06:25:13.121,6,7,06:39:47.036,a46cce8d986daaca5492e28cae8979c8601f1fe4ba898b...,00:14:34,1,6


## 2. party 관련 기본 변수

### 2.1 total party member count 변수 생성
- 유저가 참여한 파티의 멤버 수의 총 합을 구한다.

In [30]:
%%time

party_member_lists = [party['party_members_acc_id'][i].split(',') for i in trange(len(party['party_members_acc_id']))]
party_member_1D_lists = list(chain.from_iterable(party_member_lists))
member_id_value_count = pd.Series(party_member_1D_lists).value_counts()

100%|██████████| 4121512/4121512 [01:36<00:00, 42624.08it/s]


CPU times: user 1min 50s, sys: 10.9 s, total: 2min 1s
Wall time: 2min 2s


In [31]:
%%time 
increased_party_TMC = [[party['party_member_count'][i]]*party['party_member_count'][i] for i in trange(len(party))]
flat_increased_party_TMC = list(chain.from_iterable(increased_party_TMC))

100%|██████████| 4121512/4121512 [02:54<00:00, 23616.80it/s]


CPU times: user 2min 48s, sys: 6.41 s, total: 2min 54s
Wall time: 2min 56s


### 2.2 Party total retained minute 변수 생성
- 유저가 참여한 파티의 지속시간의 총합을 구한다.

In [32]:
%%time
all_id_and_party_TMC_df = pd.concat([pd.Series(party_member_1D_lists), pd.Series(flat_increased_party_TMC)],axis=1)
all_id_and_party_TMC_df.columns = ['acc_id','party_TMC']

member_party_TMC = all_id_and_party_TMC_df.groupby('acc_id')['party_TMC'].sum()
party_TMC_df = pd.DataFrame(member_party_TMC).reset_index()
party_TMC_df.columns = ['acc_id','party_total_member_count']

label = pd.merge(label, party_TMC_df, how='left', on='acc_id')
label['party_total_member_count'].fillna(0, inplace=True)

CPU times: user 16.7 s, sys: 4.94 s, total: 21.6 s
Wall time: 21.9 s


In [33]:
label.tail()

Unnamed: 0,acc_id,party_total_member_count
39995,d048b24f5205a50916339d24fc9f7b1c69352d2ee10f1c...,659.0
39996,ad5b3ef19e64aceb9ffea55310607ba62146218e616a83...,0.0
39997,972114a3bb57c377d162fb66b9198a2eec40de9576af51...,2119.0
39998,3d10d680df3d5fa2e370997c18274bad7355e3d0ff9fb0...,2065.0
39999,813ad8039e2692aecc1b7875f3a9b1e951477937ec30bd...,1064.0


### 2.3 party members acc id 전체를 리스트로 만들기

In [166]:
%%time
party_id_ls = get_party_ids(party)

print(len(party_id_ls))
print(len(list(set(party_id_ls))))

21849944
227710
CPU times: user 25.9 s, sys: 6.92 s, total: 32.8 s
Wall time: 37.9 s


### 2.4 party start week/day & end week/day의 first, mode, last 변수

In [169]:
%time label = make_first_mode_last_df_and_merge_with_label(party, 'party_start_week', label)

  0%|          | 2234/4121512 [00:00<03:04, 22327.25it/s]

start making all ID & party_start_week df


100%|██████████| 4121512/4121512 [02:55<00:00, 23534.25it/s]


end of making all ID & party_start_week df
start making party_start_week first & mode & last df
working first_df...
working mode_df...
working last_df...
end of making party_start_week first & mode & last df
CPU times: user 4min 18s, sys: 5.63 s, total: 4min 24s
Wall time: 4min 24s


In [170]:
%time label = make_first_mode_last_df_and_merge_with_label(party, 'party_start_day', label)

  0%|          | 2166/4121512 [00:00<03:10, 21652.84it/s]

start making all ID & party_start_day df


100%|██████████| 4121512/4121512 [02:50<00:00, 24227.63it/s]


end of making all ID & party_start_day df
start making party_start_day first & mode & last df
working first_df...
working mode_df...
working last_df...
end of making party_start_day first & mode & last df
CPU times: user 4min 21s, sys: 5.41 s, total: 4min 26s
Wall time: 4min 26s


In [171]:
%time label = make_first_mode_last_df_and_merge_with_label(party, 'party_end_week', label)

  0%|          | 2326/4121512 [00:00<02:57, 23254.87it/s]

start making all ID & party_end_week df


100%|██████████| 4121512/4121512 [02:49<00:00, 24350.95it/s]


end of making all ID & party_end_week df
start making party_end_week first & mode & last df
working first_df...
working mode_df...
working last_df...
end of making party_end_week first & mode & last df
CPU times: user 4min 15s, sys: 5.13 s, total: 4min 21s
Wall time: 4min 20s


In [172]:
%time label = make_first_mode_last_df_and_merge_with_label(party, 'party_end_day', label)

  0%|          | 2158/4121512 [00:00<03:11, 21563.62it/s]

start making all ID & party_end_day df


100%|██████████| 4121512/4121512 [02:56<00:00, 23411.67it/s]


end of making all ID & party_end_day df
start making party_end_day first & mode & last df
working first_df...
working mode_df...
working last_df...
end of making party_end_day first & mode & last df
CPU times: user 4min 25s, sys: 5.46 s, total: 4min 30s
Wall time: 4min 31s


In [173]:
label.tail()

Unnamed: 0,acc_id,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,mode_party_end_week,last_party_end_week,first_party_end_day,mode_party_end_day,last_party_end_day
39995,d048b24f5205a50916339d24fc9f7b1c69352d2ee10f1c...,8.0,8.0,8.0,5.0,6.0,7.0,8.0,8.0,8.0,5.0,6.0,7.0
39996,ad5b3ef19e64aceb9ffea55310607ba62146218e616a83...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39997,972114a3bb57c377d162fb66b9198a2eec40de9576af51...,2.0,7.0,8.0,1.0,4.0,7.0,2.0,7.0,8.0,1.0,4.0,7.0
39998,3d10d680df3d5fa2e370997c18274bad7355e3d0ff9fb0...,1.0,1.0,8.0,1.0,2.0,7.0,1.0,1.0,8.0,1.0,2.0,7.0
39999,813ad8039e2692aecc1b7875f3a9b1e951477937ec30bd...,1.0,3.0,8.0,1.0,2.0,7.0,1.0,3.0,8.0,1.0,2.0,7.0


## 3. 10분 이상 party만 남도록 data filtering

In [175]:
%%time
filtered_party = time_filter(party)
print(len(filtered_party))

2199093
CPU times: user 1.11 s, sys: 4.96 s, total: 6.07 s
Wall time: 9.07 s


## 4. party 참여 횟수 변수 생성
- 전체 횟수, week별 횟수 → 9개 변수

### 4.1 party members acc id 전체를 리스트로 만들기

In [177]:
party_id_ls = get_party_ids(filtered_party)

### 4.2 party_cnt(파티 참여횟수) 변수 생성

In [178]:
%%time
party_1 = get_party_cnt(party_id_ls, label)

CPU times: user 5.72 s, sys: 439 ms, total: 6.16 s
Wall time: 6.18 s


In [179]:
party_1.tail()

Unnamed: 0,acc_id,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,mode_party_end_week,last_party_end_week,first_party_end_day,mode_party_end_day,last_party_end_day,party_cnt
39995,d048b24f5205a50916339d24fc9f7b1c69352d2ee10f1c...,8.0,8.0,8.0,5.0,6.0,7.0,8.0,8.0,8.0,5.0,6.0,7.0,9.0
39996,ad5b3ef19e64aceb9ffea55310607ba62146218e616a83...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39997,972114a3bb57c377d162fb66b9198a2eec40de9576af51...,2.0,7.0,8.0,1.0,4.0,7.0,2.0,7.0,8.0,1.0,4.0,7.0,320.0
39998,3d10d680df3d5fa2e370997c18274bad7355e3d0ff9fb0...,1.0,1.0,8.0,1.0,2.0,7.0,1.0,1.0,8.0,1.0,2.0,7.0,52.0
39999,813ad8039e2692aecc1b7875f3a9b1e951477937ec30bd...,1.0,3.0,8.0,1.0,2.0,7.0,1.0,3.0,8.0,1.0,2.0,7.0,131.0


### 4.3 week별 party_cnt

#### (1) 파티 지속 기간(week) 확인

In [180]:
filtered_party["party_duration_week"] = filtered_party["party_end_week"] - filtered_party["party_start_week"]

In [181]:
filtered_party.groupby("party_duration_week").size().reset_index()

Unnamed: 0,party_duration_week,0
0,0,2176535
1,1,22558


#### (2) 다음 주로 넘어가는 경우 끝나는 요일 확인
- 모두 1일에 끝나므로 시작 week만으로 count하기로 함

In [182]:
dur_1w = filtered_party[filtered_party["party_duration_week"]==1]
dur_1w.groupby(dur_1w["party_end_day"]).size().reset_index()

Unnamed: 0,party_end_day,0
0,1,22558


#### (3) week별 party_cnt 구하기

In [184]:
%%time
for i in trange(1,9):
    party_1 = week_cnt(i, party_1, df = filtered_party)

  0%|          | 0/8 [00:00<?, ?it/s]

week 1 party id: 1198772


 12%|█▎        | 1/8 [00:01<00:13,  1.92s/it]

week 2 party id: 1307784


 25%|██▌       | 2/8 [00:04<00:12,  2.15s/it]

week 3 party id: 1311707


 38%|███▊      | 3/8 [00:06<00:10,  2.13s/it]

week 4 party id: 1418090


 50%|█████     | 4/8 [00:08<00:08,  2.17s/it]

week 5 party id: 1709974


 62%|██████▎   | 5/8 [00:11<00:06,  2.25s/it]

week 6 party id: 1720101


 75%|███████▌  | 6/8 [00:13<00:04,  2.29s/it]

week 7 party id: 2713731


 88%|████████▊ | 7/8 [00:17<00:02,  2.55s/it]

week 8 party id: 2549254


100%|██████████| 8/8 [00:21<00:00,  2.72s/it]

CPU times: user 20.8 s, sys: 848 ms, total: 21.7 s
Wall time: 21.8 s





In [185]:
party_1.tail()

Unnamed: 0,acc_id,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,mode_party_end_week,last_party_end_week,...,last_party_end_day,party_cnt,party_cnt_w1,party_cnt_w2,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8
39995,d048b24f5205a50916339d24fc9f7b1c69352d2ee10f1c...,8.0,8.0,8.0,5.0,6.0,7.0,8.0,8.0,8.0,...,7.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
39996,ad5b3ef19e64aceb9ffea55310607ba62146218e616a83...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39997,972114a3bb57c377d162fb66b9198a2eec40de9576af51...,2.0,7.0,8.0,1.0,4.0,7.0,2.0,7.0,8.0,...,7.0,320.0,0.0,0.0,0.0,0.0,0.0,90.0,132.0,98.0
39998,3d10d680df3d5fa2e370997c18274bad7355e3d0ff9fb0...,1.0,1.0,8.0,1.0,2.0,7.0,1.0,1.0,8.0,...,7.0,52.0,8.0,7.0,4.0,8.0,1.0,7.0,8.0,9.0
39999,813ad8039e2692aecc1b7875f3a9b1e951477937ec30bd...,1.0,3.0,8.0,1.0,2.0,7.0,1.0,3.0,8.0,...,7.0,131.0,12.0,18.0,23.0,19.0,15.0,11.0,20.0,13.0


## 5. party count 7-8주와 6-8주의 전체 중 비중 변수 생성

In [186]:
party_1["party_78_ratio"] = party_1.loc[:,"party_cnt_w7":"party_cnt_w8"].sum(axis=1) / party_1["party_cnt"]
party_1.fillna(0, inplace=True)
party_1.tail()

Unnamed: 0,acc_id,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,mode_party_end_week,last_party_end_week,...,party_cnt,party_cnt_w1,party_cnt_w2,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8,party_78_ratio
39995,d048b24f5205a50916339d24fc9f7b1c69352d2ee10f1c...,8.0,8.0,8.0,5.0,6.0,7.0,8.0,8.0,8.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0
39996,ad5b3ef19e64aceb9ffea55310607ba62146218e616a83...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39997,972114a3bb57c377d162fb66b9198a2eec40de9576af51...,2.0,7.0,8.0,1.0,4.0,7.0,2.0,7.0,8.0,...,320.0,0.0,0.0,0.0,0.0,0.0,90.0,132.0,98.0,0.71875
39998,3d10d680df3d5fa2e370997c18274bad7355e3d0ff9fb0...,1.0,1.0,8.0,1.0,2.0,7.0,1.0,1.0,8.0,...,52.0,8.0,7.0,4.0,8.0,1.0,7.0,8.0,9.0,0.326923
39999,813ad8039e2692aecc1b7875f3a9b1e951477937ec30bd...,1.0,3.0,8.0,1.0,2.0,7.0,1.0,3.0,8.0,...,131.0,12.0,18.0,23.0,19.0,15.0,11.0,20.0,13.0,0.251908


In [187]:
party_1["party_678_ratio"] = party_1.loc[:,"party_cnt_w6":"party_cnt_w8"].sum(axis=1) / party_1["party_cnt"]
party_1.fillna(0, inplace=True)
party_1.tail()

Unnamed: 0,acc_id,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,mode_party_end_week,last_party_end_week,...,party_cnt_w1,party_cnt_w2,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8,party_78_ratio,party_678_ratio
39995,d048b24f5205a50916339d24fc9f7b1c69352d2ee10f1c...,8.0,8.0,8.0,5.0,6.0,7.0,8.0,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,1.0
39996,ad5b3ef19e64aceb9ffea55310607ba62146218e616a83...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39997,972114a3bb57c377d162fb66b9198a2eec40de9576af51...,2.0,7.0,8.0,1.0,4.0,7.0,2.0,7.0,8.0,...,0.0,0.0,0.0,0.0,0.0,90.0,132.0,98.0,0.71875,1.0
39998,3d10d680df3d5fa2e370997c18274bad7355e3d0ff9fb0...,1.0,1.0,8.0,1.0,2.0,7.0,1.0,1.0,8.0,...,8.0,7.0,4.0,8.0,1.0,7.0,8.0,9.0,0.326923,0.461538
39999,813ad8039e2692aecc1b7875f3a9b1e951477937ec30bd...,1.0,3.0,8.0,1.0,2.0,7.0,1.0,3.0,8.0,...,12.0,18.0,23.0,19.0,15.0,11.0,20.0,13.0,0.251908,0.335878


## 6. week별 party count의 표준편차

In [188]:
party_1["party_cnt_std"] = party_1.loc[:,"party_cnt_w1":"party_cnt_w8"].std(axis=1)
party_1.fillna(0, inplace=True)
party_1.tail()

Unnamed: 0,acc_id,first_party_start_week,mode_party_start_week,last_party_start_week,first_party_start_day,mode_party_start_day,last_party_start_day,first_party_end_week,mode_party_end_week,last_party_end_week,...,party_cnt_w2,party_cnt_w3,party_cnt_w4,party_cnt_w5,party_cnt_w6,party_cnt_w7,party_cnt_w8,party_78_ratio,party_678_ratio,party_cnt_std
39995,d048b24f5205a50916339d24fc9f7b1c69352d2ee10f1c...,8.0,8.0,8.0,5.0,6.0,7.0,8.0,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,1.0,3.181981
39996,ad5b3ef19e64aceb9ffea55310607ba62146218e616a83...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39997,972114a3bb57c377d162fb66b9198a2eec40de9576af51...,2.0,7.0,8.0,1.0,4.0,7.0,2.0,7.0,8.0,...,0.0,0.0,0.0,0.0,90.0,132.0,98.0,0.71875,1.0,56.477556
39998,3d10d680df3d5fa2e370997c18274bad7355e3d0ff9fb0...,1.0,1.0,8.0,1.0,2.0,7.0,1.0,1.0,8.0,...,7.0,4.0,8.0,1.0,7.0,8.0,9.0,0.326923,0.461538,2.672612
39999,813ad8039e2692aecc1b7875f3a9b1e951477937ec30bd...,1.0,3.0,8.0,1.0,2.0,7.0,1.0,3.0,8.0,...,18.0,23.0,19.0,15.0,11.0,20.0,13.0,0.251908,0.335878,4.274091


## 7. 전체 파티 참여횟수 중 10분 이내 짧은 파티 참여의 비율 변수 생성

In [52]:
party_1 = pd.merge(party_1, get_ratio(party, party_1[['acc_id']]), how='left')

## 8. party network에서의 degree centrality

In [5]:
# G = nx.read_gpickle("data/test_party_network.gpickle")

node의 수가 2명 이상짜리 파티에 참여한 unique id 수와 동일함

In [6]:
# len(G.nodes())

192120

### 중심성 계산하기

In [7]:
# degree_centrality = nx.degree_centrality(G)
# type(degree_centrality)

dict

In [8]:
# centrality = pd.DataFrame(columns=["acc_id","degree_cent"])

In [9]:
# centrality["acc_id"] = degree_centrality.keys()

In [10]:
# centrality["degree_cent"] = degree_centrality.values()

변수의 단위가 너무 작아 100배씩 scaling함

In [36]:
# centrality["degree_cent"] = centrality["degree_cent"]*100

In [38]:
# party_1 = pd.merge(party_1, centrality, how='left').fillna(0, inplace = True)

## 9. 고정파티 최대 횟수 변수 생성

In [57]:
len(filtered_party)

2192953

### 9.1 10분 이상 지속한 party에 참여한 acc_id 구하기

#### (1) party members acc id 전체를 리스트로 만들기

In [64]:
%%time
party_ids = get_party_ids(party_filtered)
len(party_ids)

CPU times: user 9.31 s, sys: 11.4 s, total: 20.7 s
Wall time: 28 s


#### (2) party에 참여한 id 수

In [65]:
party_unique_ids = list(set(party_ids))

In [66]:
print("party에 참여한 id 수(중복카운트):", len(party_ids))
print(len(list(set(party_ids))))
print("party에 참여한 id 수(중복 없음):", len(party_unique_ids))

party에 참여한 id 수(중복카운트): 13896820
192124
party에 참여한 id 수(중복 없음): 192124


In [67]:
label_id = label["acc_id"].tolist()
len(label_id)

40000

#### (3) test data 유저 중 filtered party에 참여한 사람 수

In [68]:
label_party_id = list(set(label_id) & set(party_unique_ids))
len(label_party_id)

20828

### 9.2 고정파티 최대 횟수 구하기

#### (1) get_fix_party(): 한 유저가 특정 유저와 반복해서 party에 참여한 최대 횟수

In [69]:
# party member 전체 리스트 미리 받기
party_id = party_filtered["party_members_acc_id"].tolist()
party_id = [x.split(',') for x in party_id]

In [70]:
# 함수 체크
get_fix_party(label_party_id[0])

{'acc_id': '7d77e300a46b2ab78ec82e6f1f2a41ed95eedb452f2691dad03512a121c958ab',
 'fix_party_max': 3}

#### (2) label_party_id에 대하여 고정파티 최대 횟수 구하기

In [71]:
fix_party = pd.DataFrame(columns=["acc_id", "fix_party_max"])

In [72]:
for i in tqdm(range(len(label_party_id))):
    fix_party.loc[len(fix_party)] = get_fix_party(label_party_id[i])

100%|██████████| 20828/20828 [3:14:57<00:00,  1.78it/s]


In [None]:
# test label과 merge하기
party_1 = party_1.merge(fix_party, how = "left").fillna(0)

## 10. 최종 party 변수 저장하기

In [55]:
pickle.dump(party_1, open('../data/merged_test_party.pkl','wb'))