In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


plt.rc('font', family='Malgun Gothic') # For Windows
print(plt.rcParams['font.family'])

%matplotlib inline

['Malgun Gothic']


In [2]:
# 지수표현 제거
pd.options.display.float_format = '{:.5f}'.format

In [3]:
log_data = pd.read_csv('data/log_data.csv')

# 1. 로그 데이터 전처리

## 1.1. log data 전처리
- mp_os, mp_app_version 열 제거
- timestamp에서 시간만 추출

In [4]:
log_data.head()

Unnamed: 0,user_id,event,timestamp,mp_os,mp_app_version,date_cd
0,576409,StartLoanApply,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
1,576409,ViewLoanApplyIntro,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
2,72878,EndLoanApply,2022-03-25 11:14:44,Android,3.8.4,2022-03-25
3,645317,OpenApp,2022-03-25 11:15:09,iOS,3.6.1,2022-03-25
4,645317,UseLoanManage,2022-03-25 11:15:11,iOS,3.6.1,2022-03-25


In [5]:
# 중복된 데이터 제거
log_data = log_data.drop_duplicates().reset_index(drop=True)

# mp_os, mp_app_version 제거
log_data = log_data.drop(['mp_os', 'mp_app_version'], axis=1)

In [6]:
# user_id, 시간대별로 정렬해준다
log_data_sort = log_data.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)


In [7]:
from tqdm import tqdm
from datetime import datetime

In [8]:
# 각 user_id별로 전체 로그 데이터를 검사하기 위해 user_id 찾아서 넣어준다
log_data_users = log_data_sort['user_id'].unique()

## 1.2. 군집용 log데이터 생성
- 1. 각 이벤트들의 간격이 15분(900초) 이상이라면 다른 행동이라고 가정
- 2. 따라서 15분 이내에 발생한 user의 이벤트(event)와 시간(timestamp)들을 하나의 user의 데이터로 합침

In [9]:
# 군집용 데이터프레임 틀을 만들어준다
user_activity_df = pd.DataFrame(columns=['user_id', 'date', 'events', 'timestamp'])
user_activity_df

Unnamed: 0,user_id,date,events,timestamp


In [10]:
file_index = 1
iteration = 0
checkpoint_start = 0
checkpoint_end = 0
for user_id in tqdm(log_data_users):
    
    # log_data_sort의 row를 다 돌때까지
    while checkpoint_end < len(log_data_sort):
        # 이미 정렬해놓은 데이터프레임에서, 같은 user끼리의 임시 데이터프레임을 만들기 위함
        if log_data_sort.loc[checkpoint_end, :]['user_id'] == user_id:
            checkpoint_end += 1
        else:
            break
            
    # 임시 데이터프레임
    tmp = log_data_sort.iloc[checkpoint_start:checkpoint_end+1, :].reset_index(drop=True)
    
    pivot = tmp.iloc[0, :]
    event_save = pivot['event']
    time_save = pivot['timestamp']
    
    
    # 처음 df 초기화
    tmp_df = pd.DataFrame( {'user_id': [user_id], 'date': [pivot['date_cd']], 'events': ['']} )
    # 시간 check
    for row_index in range(1, len(tmp)):
        row = tmp.iloc[row_index, :]
        dt1 = datetime.strptime(pivot['timestamp'], '%Y-%m-%d %H:%M:%S')
        dt2 = datetime.strptime(row['timestamp'], '%Y-%m-%d %H:%M:%S')
        
        # 두 timestamp의 시간차가 15분이 되질 않는다면
        if (dt2 - dt1).seconds < 900:
            event_save += ("," + row['event'])
            time_save += ("," + row['timestamp'])
            
        else:
            # 이제까지 한 거 저장
            tmp_df['events'] = event_save
            tmp_df['timestamp'] = time_save
            # 임시 데이터프레임과 기존 데이터프레임을 concat
            user_activity_df = pd.concat([user_activity_df, tmp_df], ignore_index = True)
            # 임시 데이터프레임 초기화
            tmp_df = pd.DataFrame( {'user_id': [user_id], 'date': [row['date_cd']], 'events': ['']} )
            event_save = row['event']
            time_save = row['timestamp']
        pivot = row
        
    # 다음 user_id를 선택하기 위해 checkpoint start 갱신
    checkpoint_start = checkpoint_end
    iteration += 1
    
    # 데이터프레임을 계속해서 concat해서 커지게 되면, concat할 때마다 overhead가 발생해서 processing 속도 느려짐
    # 따라서 3000개의 user_id데이터를 처리할 때마다 임시파일로 저장해둔다
    if iteration > 0 and iteration % 3000 == 0:
        user_activity_df.to_csv(f'data/new_log/log_events_{file_index}.csv', index=False)
        user_activity_df = pd.DataFrame(columns=['user_id', 'date', 'events', 'timestamp'])
        print(f'log_events_{file_index}.csv completed!')
        file_index += 1
        
user_activity_df.to_csv(f'data/new_log/log_events_{file_index}.csv', index=False)



  1%|          | 3013/584636 [01:05<3:24:10, 47.48it/s]

log_events_1.csv completed!


  1%|          | 6012/584636 [02:08<2:52:28, 55.92it/s]

log_events_2.csv completed!


  2%|▏         | 9008/584636 [03:20<5:08:14, 31.12it/s] 

log_events_3.csv completed!


  2%|▏         | 12007/584636 [04:51<6:39:54, 23.87it/s] 

log_events_4.csv completed!


  3%|▎         | 15009/584636 [06:41<5:41:42, 27.78it/s] 

log_events_5.csv completed!


  3%|▎         | 18000/584636 [08:20<6:37:43, 23.74it/s] 

log_events_6.csv completed!


  4%|▎         | 21002/584636 [09:56<5:27:10, 28.71it/s] 

log_events_7.csv completed!


  4%|▍         | 24005/584636 [11:25<10:25:02, 14.95it/s]

log_events_8.csv completed!


  5%|▍         | 27001/584636 [13:02<5:58:26, 25.93it/s] 

log_events_9.csv completed!


  5%|▌         | 30009/584636 [14:36<6:23:37, 24.10it/s] 

log_events_10.csv completed!


  6%|▌         | 33005/584636 [16:11<5:48:49, 26.36it/s] 

log_events_11.csv completed!


  6%|▌         | 36013/584636 [17:44<5:04:17, 30.05it/s] 

log_events_12.csv completed!


  7%|▋         | 39006/584636 [19:18<5:03:50, 29.93it/s] 

log_events_13.csv completed!


  7%|▋         | 42009/584636 [20:52<6:05:32, 24.74it/s] 

log_events_14.csv completed!


  8%|▊         | 45006/584636 [22:31<6:53:32, 21.75it/s] 

log_events_15.csv completed!


  8%|▊         | 48013/584636 [23:56<3:32:05, 42.17it/s] 

log_events_16.csv completed!


  9%|▊         | 51005/584636 [25:35<6:33:23, 22.61it/s] 

log_events_17.csv completed!


  9%|▉         | 54014/584636 [27:06<4:17:30, 34.34it/s] 

log_events_18.csv completed!


 10%|▉         | 57009/584636 [28:42<3:57:10, 37.08it/s] 

log_events_19.csv completed!


 10%|█         | 60005/584636 [30:08<5:01:14, 29.03it/s] 

log_events_20.csv completed!


 11%|█         | 63005/584636 [31:31<3:16:19, 44.28it/s]

log_events_21.csv completed!


 11%|█▏        | 66017/584636 [32:57<2:24:40, 59.74it/s] 

log_events_22.csv completed!


 12%|█▏        | 69011/584636 [34:29<3:44:18, 38.31it/s] 

log_events_23.csv completed!


 12%|█▏        | 72008/584636 [36:01<4:24:00, 32.36it/s] 

log_events_24.csv completed!


 13%|█▎        | 75006/584636 [37:36<4:25:02, 32.05it/s] 

log_events_25.csv completed!


 13%|█▎        | 78016/584636 [39:07<3:22:03, 41.79it/s] 

log_events_26.csv completed!


 14%|█▍        | 81002/584636 [40:31<7:23:54, 18.91it/s] 

log_events_27.csv completed!


 14%|█▍        | 84008/584636 [42:00<5:27:25, 25.48it/s] 

log_events_28.csv completed!


 15%|█▍        | 86999/584636 [43:28<4:15:24, 32.47it/s] 

log_events_29.csv completed!


 15%|█▌        | 90001/584636 [45:04<6:02:59, 22.71it/s] 

log_events_30.csv completed!


 16%|█▌        | 93004/584636 [46:31<5:19:20, 25.66it/s] 

log_events_31.csv completed!


 16%|█▋        | 96005/584636 [48:00<4:39:56, 29.09it/s] 

log_events_32.csv completed!


 17%|█▋        | 99004/584636 [49:30<5:03:09, 26.70it/s]

log_events_33.csv completed!


 17%|█▋        | 102000/584636 [50:58<5:16:23, 25.42it/s] 

log_events_34.csv completed!


 18%|█▊        | 105011/584636 [52:25<4:24:44, 30.19it/s] 

log_events_35.csv completed!


 18%|█▊        | 108010/584636 [53:57<3:48:38, 34.74it/s] 

log_events_36.csv completed!


 19%|█▉        | 111004/584636 [55:29<4:07:08, 31.94it/s] 

log_events_37.csv completed!


 20%|█▉        | 114007/584636 [57:06<3:46:15, 34.67it/s] 

log_events_38.csv completed!


 20%|██        | 117018/584636 [58:37<3:50:48, 33.77it/s] 

log_events_39.csv completed!


 21%|██        | 120012/584636 [1:00:06<3:37:41, 35.57it/s]

log_events_40.csv completed!


 21%|██        | 123006/584636 [1:01:40<4:21:35, 29.41it/s] 

log_events_41.csv completed!


 22%|██▏       | 126000/584636 [1:03:14<4:30:47, 28.23it/s] 

log_events_42.csv completed!


 22%|██▏       | 129008/584636 [1:04:50<3:54:43, 32.35it/s] 

log_events_43.csv completed!


 23%|██▎       | 132001/584636 [1:06:23<6:47:38, 18.51it/s] 

log_events_44.csv completed!


 23%|██▎       | 135000/584636 [1:07:58<4:32:27, 27.50it/s] 

log_events_45.csv completed!


 24%|██▎       | 138003/584636 [1:09:32<5:41:55, 21.77it/s] 

log_events_46.csv completed!


 24%|██▍       | 141005/584636 [1:11:08<4:30:56, 27.29it/s] 

log_events_47.csv completed!


 25%|██▍       | 144004/584636 [1:12:29<4:50:08, 25.31it/s]

log_events_48.csv completed!


 25%|██▌       | 147010/584636 [1:13:49<2:47:19, 43.59it/s]

log_events_49.csv completed!


 26%|██▌       | 150010/584636 [1:15:11<3:51:27, 31.30it/s]

log_events_50.csv completed!


 26%|██▌       | 153005/584636 [1:16:28<4:17:14, 27.97it/s] 

log_events_51.csv completed!


 27%|██▋       | 156005/584636 [1:17:49<4:34:34, 26.02it/s] 

log_events_52.csv completed!


 27%|██▋       | 159001/584636 [1:19:03<4:31:45, 26.10it/s]

log_events_53.csv completed!


 28%|██▊       | 162010/584636 [1:20:24<2:39:39, 44.12it/s]

log_events_54.csv completed!


 28%|██▊       | 165009/584636 [1:21:41<2:41:33, 43.29it/s]

log_events_55.csv completed!


 29%|██▊       | 168009/584636 [1:22:56<3:47:37, 30.51it/s]

log_events_56.csv completed!


 29%|██▉       | 171010/584636 [1:24:10<3:15:45, 35.22it/s]

log_events_57.csv completed!


 30%|██▉       | 174007/584636 [1:25:24<3:51:23, 29.58it/s]

log_events_58.csv completed!


 30%|███       | 177020/584636 [1:26:41<2:25:03, 46.83it/s]

log_events_59.csv completed!


 31%|███       | 180000/584636 [1:27:55<3:37:19, 31.03it/s]

log_events_60.csv completed!


 31%|███▏      | 183007/584636 [1:29:14<3:03:46, 36.42it/s]

log_events_61.csv completed!


 32%|███▏      | 186005/584636 [1:30:32<4:58:42, 22.24it/s] 

log_events_62.csv completed!


 32%|███▏      | 189007/584636 [1:31:48<5:57:47, 18.43it/s]

log_events_63.csv completed!


 33%|███▎      | 192011/584636 [1:33:00<2:18:20, 47.30it/s]

log_events_64.csv completed!


 33%|███▎      | 195004/584636 [1:34:18<5:18:52, 20.37it/s]

log_events_65.csv completed!


 34%|███▍      | 198009/584636 [1:35:29<3:49:05, 28.13it/s]

log_events_66.csv completed!


 34%|███▍      | 201013/584636 [1:36:45<2:02:53, 52.03it/s]

log_events_67.csv completed!


 35%|███▍      | 204005/584636 [1:37:56<4:07:12, 25.66it/s]

log_events_68.csv completed!


 35%|███▌      | 207012/584636 [1:39:09<2:22:04, 44.30it/s]

log_events_69.csv completed!


 36%|███▌      | 210004/584636 [1:40:27<2:46:39, 37.46it/s]

log_events_70.csv completed!


 36%|███▋      | 213009/584636 [1:41:46<2:39:05, 38.93it/s]

log_events_71.csv completed!


 37%|███▋      | 216001/584636 [1:43:00<2:50:58, 35.93it/s]

log_events_72.csv completed!


 37%|███▋      | 219010/584636 [1:44:14<2:26:28, 41.60it/s]

log_events_73.csv completed!


 38%|███▊      | 222007/584636 [1:45:25<2:06:22, 47.82it/s]

log_events_74.csv completed!


 38%|███▊      | 225012/584636 [1:46:40<3:12:18, 31.17it/s]

log_events_75.csv completed!


 39%|███▉      | 228012/584636 [1:47:46<2:34:11, 38.55it/s]

log_events_76.csv completed!


 40%|███▉      | 231007/584636 [1:48:51<2:10:37, 45.12it/s]

log_events_77.csv completed!


 40%|████      | 234017/584636 [1:49:56<1:44:29, 55.93it/s]

log_events_78.csv completed!


 41%|████      | 237012/584636 [1:51:01<1:43:24, 56.03it/s]

log_events_79.csv completed!


 41%|████      | 240012/584636 [1:52:06<1:36:10, 59.72it/s]

log_events_80.csv completed!


 42%|████▏     | 243011/584636 [1:53:13<2:48:13, 33.84it/s]

log_events_81.csv completed!


 42%|████▏     | 246014/584636 [1:54:19<1:59:04, 47.40it/s]

log_events_82.csv completed!


 43%|████▎     | 249013/584636 [1:55:25<1:41:12, 55.27it/s]

log_events_83.csv completed!


 43%|████▎     | 252015/584636 [1:56:38<1:55:47, 47.88it/s] 

log_events_84.csv completed!


 44%|████▎     | 255009/584636 [1:57:42<2:14:26, 40.86it/s]

log_events_85.csv completed!


 44%|████▍     | 258000/584636 [1:58:46<2:14:55, 40.35it/s]

log_events_86.csv completed!


 45%|████▍     | 261016/584636 [1:59:53<1:31:13, 59.12it/s]

log_events_87.csv completed!


 45%|████▌     | 264006/584636 [2:00:59<3:53:13, 22.91it/s]

log_events_88.csv completed!


 46%|████▌     | 267007/584636 [2:02:12<2:40:01, 33.08it/s]

log_events_89.csv completed!


 46%|████▌     | 270011/584636 [2:03:16<1:55:51, 45.26it/s]

log_events_90.csv completed!


 47%|████▋     | 273005/584636 [2:04:18<2:11:55, 39.37it/s]

log_events_91.csv completed!


 47%|████▋     | 276010/584636 [2:05:24<2:00:35, 42.65it/s]

log_events_92.csv completed!


 48%|████▊     | 279011/584636 [2:06:27<1:41:16, 50.29it/s]

log_events_93.csv completed!


 48%|████▊     | 282008/584636 [2:07:30<1:45:17, 47.91it/s]

log_events_94.csv completed!


 49%|████▊     | 285009/584636 [2:08:36<1:58:02, 42.30it/s] 

log_events_95.csv completed!


 49%|████▉     | 288007/584636 [2:09:43<2:13:01, 37.17it/s]

log_events_96.csv completed!


 50%|████▉     | 291012/584636 [2:10:47<1:53:44, 43.03it/s]

log_events_97.csv completed!


 50%|█████     | 294006/584636 [2:11:54<1:48:59, 44.44it/s]

log_events_98.csv completed!


 51%|█████     | 297010/584636 [2:13:00<1:38:32, 48.65it/s]

log_events_99.csv completed!


 51%|█████▏    | 300000/584636 [2:14:08<2:12:00, 35.94it/s]

log_events_100.csv completed!


 52%|█████▏    | 303011/584636 [2:15:12<1:41:03, 46.44it/s]

log_events_101.csv completed!


 52%|█████▏    | 306001/584636 [2:16:15<1:49:14, 42.51it/s]

log_events_102.csv completed!


 53%|█████▎    | 309009/584636 [2:17:19<1:32:42, 49.55it/s]

log_events_103.csv completed!


 53%|█████▎    | 312004/584636 [2:18:20<1:56:17, 39.07it/s]

log_events_104.csv completed!


 54%|█████▍    | 315011/584636 [2:19:31<1:25:23, 52.62it/s]

log_events_105.csv completed!


 54%|█████▍    | 318015/584636 [2:20:33<1:10:41, 62.86it/s]

log_events_106.csv completed!


 55%|█████▍    | 321009/584636 [2:21:39<1:46:38, 41.20it/s]

log_events_107.csv completed!


 55%|█████▌    | 324009/584636 [2:22:42<1:35:11, 45.63it/s]

log_events_108.csv completed!


 56%|█████▌    | 327015/584636 [2:23:44<1:10:34, 60.83it/s]

log_events_109.csv completed!


 56%|█████▋    | 330007/584636 [2:24:55<1:26:04, 49.30it/s] 

log_events_110.csv completed!


 57%|█████▋    | 333015/584636 [2:26:01<1:24:50, 49.43it/s]

log_events_111.csv completed!


 57%|█████▋    | 336012/584636 [2:27:08<1:40:09, 41.37it/s]

log_events_112.csv completed!


 58%|█████▊    | 339007/584636 [2:28:09<1:34:23, 43.37it/s]

log_events_113.csv completed!


 58%|█████▊    | 342010/584636 [2:29:17<1:37:26, 41.50it/s]

log_events_114.csv completed!


 59%|█████▉    | 345007/584636 [2:30:28<1:41:00, 39.54it/s]

log_events_115.csv completed!


 60%|█████▉    | 348009/584636 [2:31:31<1:41:36, 38.82it/s]

log_events_116.csv completed!


 60%|██████    | 351007/584636 [2:32:33<1:24:38, 46.01it/s]

log_events_117.csv completed!


 61%|██████    | 354014/584636 [2:33:37<1:09:44, 55.12it/s]

log_events_118.csv completed!


 61%|██████    | 357004/584636 [2:34:42<1:28:28, 42.88it/s]

log_events_119.csv completed!


 62%|██████▏   | 360008/584636 [2:35:45<1:03:04, 59.36it/s]

log_events_120.csv completed!


 62%|██████▏   | 363004/584636 [2:36:50<2:13:01, 27.77it/s]

log_events_121.csv completed!


 63%|██████▎   | 366008/584636 [2:37:58<1:21:44, 44.57it/s]

log_events_122.csv completed!


 63%|██████▎   | 369003/584636 [2:39:04<2:46:27, 21.59it/s]

log_events_123.csv completed!


 64%|██████▎   | 372008/584636 [2:40:05<1:39:54, 35.47it/s]

log_events_124.csv completed!


 64%|██████▍   | 375007/584636 [2:41:12<2:23:47, 24.30it/s]

log_events_125.csv completed!


 65%|██████▍   | 378007/584636 [2:42:18<1:34:20, 36.50it/s]

log_events_126.csv completed!


 65%|██████▌   | 381012/584636 [2:43:23<1:29:00, 38.13it/s]

log_events_127.csv completed!


 66%|██████▌   | 384000/584636 [2:44:26<2:10:06, 25.70it/s]

log_events_128.csv completed!


 66%|██████▌   | 387004/584636 [2:45:36<1:31:21, 36.06it/s]

log_events_129.csv completed!


 67%|██████▋   | 390023/584636 [2:46:40<47:50, 67.80it/s]  

log_events_130.csv completed!


 67%|██████▋   | 393019/584636 [2:47:44<54:25, 58.67it/s]  

log_events_131.csv completed!


 68%|██████▊   | 396006/584636 [2:48:45<1:28:21, 35.58it/s]

log_events_132.csv completed!


 68%|██████▊   | 399001/584636 [2:49:42<1:11:26, 43.31it/s]

log_events_133.csv completed!


 69%|██████▉   | 402014/584636 [2:50:50<1:03:59, 47.57it/s]

log_events_134.csv completed!


 69%|██████▉   | 405005/584636 [2:51:52<1:24:07, 35.59it/s]

log_events_135.csv completed!


 70%|██████▉   | 408007/584636 [2:52:58<58:08, 50.63it/s]  

log_events_136.csv completed!


 70%|███████   | 411012/584636 [2:54:01<53:38, 53.94it/s]  

log_events_137.csv completed!


 71%|███████   | 414015/584636 [2:55:07<57:47, 49.21it/s]  

log_events_138.csv completed!


 71%|███████▏  | 417013/584636 [2:56:08<57:11, 48.85it/s]  

log_events_139.csv completed!


 72%|███████▏  | 420011/584636 [2:57:14<55:52, 49.10it/s]  

log_events_140.csv completed!


 72%|███████▏  | 423014/584636 [2:58:15<45:30, 59.19it/s]  

log_events_141.csv completed!


 73%|███████▎  | 426007/584636 [2:59:16<53:05, 49.80it/s]  

log_events_142.csv completed!


 73%|███████▎  | 429010/584636 [3:00:17<1:00:58, 42.54it/s]

log_events_143.csv completed!


 74%|███████▍  | 432015/584636 [3:01:23<51:35, 49.31it/s]  

log_events_144.csv completed!


 74%|███████▍  | 435006/584636 [3:02:34<58:04, 42.94it/s]  

log_events_145.csv completed!


 75%|███████▍  | 438007/584636 [3:03:39<1:13:26, 33.27it/s]

log_events_146.csv completed!


 75%|███████▌  | 441009/584636 [3:04:44<1:07:49, 35.30it/s]

log_events_147.csv completed!


 76%|███████▌  | 444018/584636 [3:05:49<39:16, 59.66it/s]  

log_events_148.csv completed!


 76%|███████▋  | 447007/584636 [3:06:53<1:12:25, 31.67it/s]

log_events_149.csv completed!


 77%|███████▋  | 450003/584636 [3:07:59<1:10:59, 31.60it/s]

log_events_150.csv completed!


 77%|███████▋  | 453013/584636 [3:09:08<42:31, 51.59it/s]  

log_events_151.csv completed!


 78%|███████▊  | 456006/584636 [3:10:11<47:24, 45.21it/s]  

log_events_152.csv completed!


 79%|███████▊  | 459008/584636 [3:11:17<39:25, 53.10it/s]  

log_events_153.csv completed!


 79%|███████▉  | 462014/584636 [3:12:20<38:05, 53.66it/s]  

log_events_154.csv completed!


 80%|███████▉  | 465000/584636 [3:13:23<51:07, 39.00it/s]  

log_events_155.csv completed!


 80%|████████  | 468021/584636 [3:14:26<29:26, 66.00it/s]  

log_events_156.csv completed!


 81%|████████  | 471010/584636 [3:15:30<40:25, 46.84it/s]  

log_events_157.csv completed!


 81%|████████  | 474011/584636 [3:16:30<45:32, 40.48it/s]  

log_events_158.csv completed!


 82%|████████▏ | 477006/584636 [3:17:32<47:01, 38.14it/s]  

log_events_159.csv completed!


 82%|████████▏ | 480013/584636 [3:18:34<31:28, 55.40it/s]  

log_events_160.csv completed!


 83%|████████▎ | 483002/584636 [3:19:40<34:54, 48.52it/s]  

log_events_161.csv completed!


 83%|████████▎ | 486012/584636 [3:20:42<31:57, 51.42it/s]  

log_events_162.csv completed!


 84%|████████▎ | 489007/584636 [3:21:45<40:04, 39.76it/s]  

log_events_163.csv completed!


 84%|████████▍ | 492011/584636 [3:22:47<31:06, 49.63it/s]  

log_events_164.csv completed!


 85%|████████▍ | 495010/584636 [3:23:51<37:03, 40.32it/s]  

log_events_165.csv completed!


 85%|████████▌ | 498015/584636 [3:24:56<37:39, 38.34it/s]  

log_events_166.csv completed!


 86%|████████▌ | 501012/584636 [3:26:00<27:55, 49.92it/s]  

log_events_167.csv completed!


 86%|████████▌ | 504019/584636 [3:27:01<22:45, 59.04it/s]

log_events_168.csv completed!


 87%|████████▋ | 507007/584636 [3:28:08<37:53, 34.15it/s]  

log_events_169.csv completed!


 87%|████████▋ | 510015/584636 [3:29:17<21:22, 58.21it/s]  

log_events_170.csv completed!


 88%|████████▊ | 513019/584636 [3:30:19<19:26, 61.40it/s]  

log_events_171.csv completed!


 88%|████████▊ | 516005/584636 [3:31:25<25:18, 45.21it/s]

log_events_172.csv completed!


 89%|████████▉ | 519012/584636 [3:32:32<22:44, 48.09it/s]  

log_events_173.csv completed!


 89%|████████▉ | 522018/584636 [3:33:37<17:15, 60.44it/s]  

log_events_174.csv completed!


 90%|████████▉ | 525015/584636 [3:34:39<18:15, 54.41it/s]

log_events_175.csv completed!


 90%|█████████ | 528012/584636 [3:35:41<26:33, 35.54it/s]  

log_events_176.csv completed!


 91%|█████████ | 531000/584636 [3:36:42<21:20, 41.89it/s]

log_events_177.csv completed!


 91%|█████████▏| 534006/584636 [3:38:43<43:08, 19.56it/s]  

log_events_178.csv completed!


 92%|█████████▏| 537005/584636 [3:40:49<29:18, 27.09it/s]  

log_events_179.csv completed!


 92%|█████████▏| 540007/584636 [3:42:52<26:52, 27.67it/s]  

log_events_180.csv completed!


 93%|█████████▎| 543001/584636 [3:44:49<37:25, 18.54it/s]  

log_events_181.csv completed!


 93%|█████████▎| 546009/584636 [3:47:04<25:19, 25.43it/s]  

log_events_182.csv completed!


 94%|█████████▍| 549001/584636 [3:49:10<50:55, 11.66it/s]  

log_events_183.csv completed!


 94%|█████████▍| 552005/584636 [3:51:14<30:59, 17.54it/s]  

log_events_184.csv completed!


 95%|█████████▍| 555005/584636 [3:53:29<28:24, 17.38it/s]  

log_events_185.csv completed!


 95%|█████████▌| 558003/584636 [3:57:25<1:01:18,  7.24it/s]

log_events_186.csv completed!


 96%|█████████▌| 561002/584636 [4:01:09<41:28,  9.50it/s]  

log_events_187.csv completed!


 96%|█████████▋| 564001/584636 [4:04:59<53:39,  6.41it/s]  

log_events_188.csv completed!


 97%|█████████▋| 566999/584636 [4:07:55<12:20, 23.82it/s]

log_events_189.csv completed!


 97%|█████████▋| 570002/584636 [4:10:00<16:59, 14.36it/s]

log_events_190.csv completed!


 98%|█████████▊| 573000/584636 [4:12:03<08:18, 23.33it/s]

log_events_191.csv completed!


 99%|█████████▊| 576005/584636 [4:14:08<06:38, 21.66it/s]

log_events_192.csv completed!


 99%|█████████▉| 579005/584636 [4:16:11<04:38, 20.24it/s]

log_events_193.csv completed!


100%|█████████▉| 582012/584636 [4:18:10<01:27, 30.15it/s]

log_events_194.csv completed!


100%|██████████| 584636/584636 [4:19:58<00:00, 37.48it/s]


In [12]:
# 임시파일을 다시 하나로 합쳐주는 작업을 한다
log_event_df = pd.DataFrame(columns=['user_id', 'date', 'events', 'timestamp'])
for i in range(1, 195+1):
    tmp_df = pd.read_csv(f"data/new_log/log_events_{i}.csv")
    log_event_df = pd.concat([log_event_df, tmp_df]).reset_index(drop=True)

# 중간 저장
log_event_df.to_csv('data/new_log_events.csv', index=False)
    

## 1.3 군집을 위한 추가 전처리작업
- OpenApp 제거(너무 당연한 로그라서(어플을 키는 것 자체가 OpenApp을 수반하기 때문) 일종의 불용어 처리
- 각 row의 timestamp에서 event 지속시간 추출

In [13]:
# 중간 저장 파일 로드
log_event_df = pd.read_csv('data/new_log_events.csv')
log_event_df

Unnamed: 0,user_id,date,events,timestamp
0,1,2022-05-03,"GetCreditInfo,GetCreditInfo","2022-05-03 14:52:28,2022-05-03 14:52:35"
1,1,2022-06-16,"UseLoanManage,Login,GetCreditInfo","2022-06-16 23:58:41,2022-06-16 23:58:41,2022-0..."
2,7,2022-05-22,GetCreditInfo,2022-05-22 16:39:49
3,9,2022-05-21,"GetCreditInfo,GetCreditInfo,GetCreditInfo","2022-05-21 23:37:58,2022-05-21 23:43:33,2022-0..."
4,11,2022-03-24,"OpenApp,GetCreditInfo,UseLoanManage,GetCreditI...","2022-03-24 10:53:59,2022-03-24 10:54:07,2022-0..."
...,...,...,...,...
3608000,879693,2022-06-29,OpenApp,2022-06-29 17:20:45
3608001,879694,2022-03-31,"StartLoanApply,ViewLoanApplyIntro,CompleteIDCe...","2022-03-31 20:07:23,2022-03-31 20:07:23,2022-0..."
3608002,879695,2022-05-27,"StartLoanApply,ViewLoanApplyIntro,CompleteIDCe...","2022-05-27 12:48:32,2022-05-27 12:48:32,2022-0..."
3608003,879696,2022-03-14,"Login,UseLoanManage,GetCreditInfo,GetCreditInf...","2022-03-14 05:35:34,2022-03-14 05:35:41,2022-0..."


In [14]:
# 시간 길이 측정하기
from datetime import datetime

def check_time_length(row):
    time_list = row.split(',')
    
    # event가 1개인 경우일 때는 0초로 처리
    if len(time_list) < 2:
        return 0
    else:
        start_time = time_list[0]
        end_time   = time_list[-1]
        sec = ( datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S') - datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') ).seconds
        return sec

In [15]:
# 각 행마다 이벤트 지속시간 측정(초단위)
log_event_df['time'] = log_event_df['timestamp'].apply(lambda x: check_time_length(x))
log_event_df.head()

Unnamed: 0,user_id,date,events,timestamp,time
0,1,2022-05-03,"GetCreditInfo,GetCreditInfo","2022-05-03 14:52:28,2022-05-03 14:52:35",7
1,1,2022-06-16,"UseLoanManage,Login,GetCreditInfo","2022-06-16 23:58:41,2022-06-16 23:58:41,2022-0...",1
2,7,2022-05-22,GetCreditInfo,2022-05-22 16:39:49,0
3,9,2022-05-21,"GetCreditInfo,GetCreditInfo,GetCreditInfo","2022-05-21 23:37:58,2022-05-21 23:43:33,2022-0...",354
4,11,2022-03-24,"OpenApp,GetCreditInfo,UseLoanManage,GetCreditI...","2022-03-24 10:53:59,2022-03-24 10:54:07,2022-0...",692


In [16]:
# openapp을 events에서 제거
log_event_df['events'] = log_event_df['events'].apply(lambda x: x.replace('OpenApp,', ''))
log_event_df['events'] = log_event_df['events'].apply(lambda x: x.replace(',OpenApp', ''))
log_event_df['events'] = log_event_df['events'].apply(lambda x: x.replace('OpenApp', ''))

log_event_df.head()

Unnamed: 0,user_id,date,events,timestamp,time
0,1,2022-05-03,"GetCreditInfo,GetCreditInfo","2022-05-03 14:52:28,2022-05-03 14:52:35",7
1,1,2022-06-16,"UseLoanManage,Login,GetCreditInfo","2022-06-16 23:58:41,2022-06-16 23:58:41,2022-0...",1
2,7,2022-05-22,GetCreditInfo,2022-05-22 16:39:49,0
3,9,2022-05-21,"GetCreditInfo,GetCreditInfo,GetCreditInfo","2022-05-21 23:37:58,2022-05-21 23:43:33,2022-0...",354
4,11,2022-03-24,"GetCreditInfo,UseLoanManage,GetCreditInfo,UseP...","2022-03-24 10:53:59,2022-03-24 10:54:07,2022-0...",692


In [17]:
only_openapp = log_event_df[log_event_df['events'].apply(lambda x: len(x)) == 0].reset_index(drop=True)
# openapp time은 0으로 통일
only_openapp['time'] = 0
only_openapp.head()

Unnamed: 0,user_id,date,events,timestamp,time
0,20,2022-03-15,,2022-03-15 16:21:32,0
1,20,2022-03-15,,2022-03-15 17:47:51,0
2,20,2022-03-17,,2022-03-17 17:14:47,0
3,20,2022-04-21,,2022-04-21 20:20:49,0
4,20,2022-05-05,,2022-05-05 15:25:31,0


In [18]:
# 기존 데이터(log_event_df) OpenApp만 있는 경우의 수 제거!
new_log_event_df = log_event_df.drop(log_event_df[log_event_df['events'].apply(lambda x: len(x)) == 0].index).reset_index(drop=True)
new_log_event_df

Unnamed: 0,user_id,date,events,timestamp,time
0,1,2022-05-03,"GetCreditInfo,GetCreditInfo","2022-05-03 14:52:28,2022-05-03 14:52:35",7
1,1,2022-06-16,"UseLoanManage,Login,GetCreditInfo","2022-06-16 23:58:41,2022-06-16 23:58:41,2022-0...",1
2,7,2022-05-22,GetCreditInfo,2022-05-22 16:39:49,0
3,9,2022-05-21,"GetCreditInfo,GetCreditInfo,GetCreditInfo","2022-05-21 23:37:58,2022-05-21 23:43:33,2022-0...",354
4,11,2022-03-24,"GetCreditInfo,UseLoanManage,GetCreditInfo,UseP...","2022-03-24 10:53:59,2022-03-24 10:54:07,2022-0...",692
...,...,...,...,...,...
3055454,879693,2022-06-27,"StartLoanApply,ViewLoanApplyIntro,CompleteIDCe...","2022-06-27 10:13:48,2022-06-27 10:13:54,2022-0...",24
3055455,879694,2022-03-31,"StartLoanApply,ViewLoanApplyIntro,CompleteIDCe...","2022-03-31 20:07:23,2022-03-31 20:07:23,2022-0...",655
3055456,879695,2022-05-27,"StartLoanApply,ViewLoanApplyIntro,CompleteIDCe...","2022-05-27 12:48:32,2022-05-27 12:48:32,2022-0...",171
3055457,879696,2022-03-14,"Login,UseLoanManage,GetCreditInfo,GetCreditInf...","2022-03-14 05:35:34,2022-03-14 05:35:41,2022-0...",400


## 1.4. Glove 활용
- 카운트 기반과 예측 기반 모두 사용하는 방법론
- events의 문자열은 10여개의 단어로만 구성된 문자열
- events를 일일히 전처리하기 보다는, 임베딩 벡터를 활용
- 2단계로 임베딩함 : 10여개의 단어 임베딩 -> 단어 임베딩을 바탕으로 문장(events)를 통채로 임베딩

In [19]:
# # https://velog.io/@ann9902/Glove-%EC%84%A4%EC%B9%98-%EC%97%90%EB%9F%AC-%ED%95%B4%EA%B2%B0%EB%B2%95
# ! pip install glove_python_binary

In [20]:
from glove import Corpus, Glove

In [21]:
# 단어 단위 split
result = [events.split(',') for events in new_log_event_df.loc[:, 'events']]

corpus = Corpus()
corpus.fit(result, window=5)

In [22]:
# 훈련
## no_components는 임베딩 벡터 dimension
glove = Glove(no_components=32, learning_rate=0.01)     # 0.05
%time glove.fit(corpus.matrix, epochs=100, no_threads=4, verbose=False)    # Wall time: 8min 32s
glove.add_dictionary(corpus.dictionary)




Wall time: 24 ms


In [23]:
# # model save
# glove.save('glove_32_epoch100.model')

In [24]:
#model load
glove_model = Glove.load('glove_32_epoch100.model')

In [25]:
# word dict 생성

In [26]:
word_dict = {}
for word in  glove_model.dictionary.keys():
    word_dict[word] = glove_model.word_vectors[glove_model.dictionary[word]]
print('[Success !] Lengh of word dict... : ', len(word_dict))

[Success !] Lengh of word dict... :  10


In [27]:
# 훈련시킨 glove model의 단어 및 대응되는 임베딩 벡터 표시
word_dict

{'GetCreditInfo': array([-0.51256538, -0.51179434, -0.49176959,  0.51354897, -0.51007658,
         0.53767853,  0.5127457 ,  0.50309013,  0.50984219,  0.49872453,
         0.50695612,  0.50368135, -0.52142305, -0.51020081,  0.52667964,
         0.52412489,  0.50351871,  0.48969901,  0.52049258, -0.50656359,
        -0.50241932, -0.53583172, -0.52180443, -0.50781765,  0.51770425,
         0.50871608, -0.49971704, -0.50146263,  0.48732218, -0.51015109,
         0.52351302, -0.5303776 ]),
 'UseLoanManage': array([-0.49470323, -0.50163492, -0.491423  ,  0.50558585, -0.51842026,
         0.50611289,  0.51407371,  0.48189495,  0.48874736,  0.5070971 ,
         0.49648585,  0.50529098, -0.51998235, -0.50707956,  0.51682825,
         0.50781484,  0.51227228,  0.47680176,  0.50457247, -0.51463778,
        -0.50713232, -0.51849126, -0.51313478, -0.49172867,  0.52452718,
         0.48606383, -0.49986497, -0.49087542,  0.47712826, -0.49803301,
         0.51246036, -0.52210129]),
 'Login': array([-

In [28]:
def sent2vec_glove(tokens, embedding_dim=32):
    '''문장 token 리스트를 받아서 임베딩 시킨다.'''
  
    size = len(tokens)
    matrix = np.zeros((size, embedding_dim))
    word_table = word_dict     # glove word_dict

    for i, token in enumerate(tokens):
        vector = np.array([
            word_table[t] for t in token
            if t in word_table
        ])

        if vector.size != 0:
            final_vector = np.mean(vector, axis=0)
            matrix[i] = final_vector

    return matrix

In [29]:
sentence_glove = sent2vec_glove(result)
sentence_glove[0]

array([-0.51256538, -0.51179434, -0.49176959,  0.51354897, -0.51007658,
        0.53767853,  0.5127457 ,  0.50309013,  0.50984219,  0.49872453,
        0.50695612,  0.50368135, -0.52142305, -0.51020081,  0.52667964,
        0.52412489,  0.50351871,  0.48969901,  0.52049258, -0.50656359,
       -0.50241932, -0.53583172, -0.52180443, -0.50781765,  0.51770425,
        0.50871608, -0.49971704, -0.50146263,  0.48732218, -0.51015109,
        0.52351302, -0.5303776 ])

# 2. k-means 군집화
- 우선 events에서 32차원으로 임베딩벡터를 추출한 값을 토대로 k-means를 활용해 log데이터를 우선 군집분석
- log데이터에서 유저의 행동양식을 분류하기 위함

In [30]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

## 2.1 전체 데이터 군집화 (k=5)

In [31]:
# clustering
# 전체 데이터

k = 5
kmeans = KMeans(n_clusters=k, random_state=2022, init = 'k-means++')
y_pred = kmeans.fit_predict(sentence_glove)

In [32]:
new_log_event_df['cluster'] = y_pred
new_log_event_df.head()

Unnamed: 0,user_id,date,events,timestamp,time,cluster
0,1,2022-05-03,"GetCreditInfo,GetCreditInfo","2022-05-03 14:52:28,2022-05-03 14:52:35",7,1
1,1,2022-06-16,"UseLoanManage,Login,GetCreditInfo","2022-06-16 23:58:41,2022-06-16 23:58:41,2022-0...",1,1
2,7,2022-05-22,GetCreditInfo,2022-05-22 16:39:49,0,1
3,9,2022-05-21,"GetCreditInfo,GetCreditInfo,GetCreditInfo","2022-05-21 23:37:58,2022-05-21 23:43:33,2022-0...",354,1
4,11,2022-03-24,"GetCreditInfo,UseLoanManage,GetCreditInfo,UseP...","2022-03-24 10:53:59,2022-03-24 10:54:07,2022-0...",692,1


In [33]:
# 0번 클러스터 : 기타
print(len(new_log_event_df[new_log_event_df['cluster'] == 0]))
new_log_event_df[new_log_event_df['cluster'] == 0]['events'].value_counts()[:30]

731618


Login,GetCreditInfo                                                                                                       154574
Login,GetCreditInfo,GetCreditInfo                                                                                          21562
GetCreditInfo,Login                                                                                                        14160
Login,UseLoanManage,GetCreditInfo,StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply                   13336
Login,GetCreditInfo,StartLoanApply,ViewLoanApplyIntro                                                                      11411
Login,GetCreditInfo,UseLoanManage,StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply                    7748
Login,UseLoanManage,GetCreditInfo,ViewLoanApplyIntro                                                                        5745
Login,UseLoanManage,GetCreditInfo,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply        

In [34]:
# 1번 클러스터 : 부가서비스(getcredit , useloan)
print(len(new_log_event_df[new_log_event_df['cluster'] == 1]))
new_log_event_df[new_log_event_df['cluster'] == 1]['events'].value_counts()[:30]

902713


GetCreditInfo                                                    179540
Login,UseLoanManage,GetCreditInfo                                122819
Login,GetCreditInfo,UseLoanManage                                 63477
GetCreditInfo,GetCreditInfo                                       56647
Login,UseLoanManage                                               53499
GetCreditInfo,UseLoanManage                                       43560
UseLoanManage,GetCreditInfo                                       36187
Login,GetCreditInfo,UseLoanManage,GetCreditInfo                   30408
Login,UseLoanManage,GetCreditInfo,GetCreditInfo                   25556
UseLoanManage,Login,GetCreditInfo                                 16714
GetCreditInfo,GetCreditInfo,GetCreditInfo                         16264
GetCreditInfo,Login,UseLoanManage                                 12503
Login,GetCreditInfo,GetCreditInfo,UseLoanManage                    9581
UseLoanManage                                                   

In [35]:
# 2번 클러스터 : start - endloan
print(len(new_log_event_df[new_log_event_df['cluster'] == 2]))
new_log_event_df[new_log_event_df['cluster'] == 2]['events'].value_counts()[:30]

1229283


Login,StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply                                      217816
StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply                                            115784
Login,StartLoanApply,EndLoanApply                                                                                  99071
StartLoanApply,EndLoanApply                                                                                        45835
Login,ViewLoanApplyIntro                                                                                           39653
Login,GetCreditInfo,StartLoanApply,ViewLoanApplyIntro,CompleteIDCertification,EndLoanApply                         39375
EndLoanApply                                                                                                       36063
Login,StartLoanApply,ViewLoanApplyIntro                                                                            35850
Login,StartLoanApply,ViewLoanApp

In [36]:
# 3번 클러스터 : 신규회원가입
print(len(new_log_event_df[new_log_event_df['cluster'] == 3]))
new_log_event_df[new_log_event_df['cluster'] == 3]['events'].value_counts()[:30]

18755


CompleteIDCertification,SignUp                                  7951
GetCreditInfo,CompleteIDCertification,SignUp                    1191
SignUp                                                          1035
SignUp,GetCreditInfo                                             501
SignUp,UseLoanManage                                             445
GetCreditInfo,UseLoanManage,UsePrepayCalc                        426
GetCreditInfo,UseLoanManage,UseDSRCalc                           412
Login,SignUp                                                     252
SignUp,UseLoanManage,GetCreditInfo                               233
UseLoanManage,GetCreditInfo,CompleteIDCertification,SignUp       140
SignUp,ViewLoanApplyIntro                                        140
SignUp,GetCreditInfo,UseLoanManage                               138
Login,SignUp,UseLoanManage                                       132
GetCreditInfo,UseLoanManage,CompleteIDCertification,SignUp       124
SignUp,GetCreditInfo,UseLoanManage

In [37]:
# 4번클러스터 : 주로 login
print(len(new_log_event_df[new_log_event_df['cluster'] == 4]))
new_log_event_df[new_log_event_df['cluster'] == 4]['events'].value_counts()[:30]

173090


Login                                                                159373
Login,Login                                                            6286
Login,Login,Login                                                      1221
Login,Login,StartLoanApply,ViewLoanApplyIntro                           980
Login,StartLoanApply,ViewLoanApplyIntro,Login                           624
Login,Login,Login,Login                                                 564
Login,ViewLoanApplyIntro,Login                                          369
Login,Login,Login,Login,Login                                           348
Login,GetCreditInfo,Login,Login                                         304
Login,Login,ViewLoanApplyIntro                                          261
Login,Login,Login,Login,Login,Login                                     246
Login,Login,Login,Login,Login,Login,Login                               144
Login,Login,Login,GetCreditInfo                                         128
Login,Login,

In [38]:
# 어플만 킨 경우(openapp) cluster = 5로 해주자
only_openapp['cluster'] = 5
only_openapp.head()

Unnamed: 0,user_id,date,events,timestamp,time,cluster
0,20,2022-03-15,,2022-03-15 16:21:32,0,5
1,20,2022-03-15,,2022-03-15 17:47:51,0,5
2,20,2022-03-17,,2022-03-17 17:14:47,0,5
3,20,2022-04-21,,2022-04-21 20:20:49,0,5
4,20,2022-05-05,,2022-05-05 15:25:31,0,5


In [39]:
# 기존 df와 openapp만 있는 df을 합쳐준다
new_log_event_df_merge = pd.concat([new_log_event_df, only_openapp]).reset_index(drop=True)
new_log_event_df_merge.head()

Unnamed: 0,user_id,date,events,timestamp,time,cluster
0,1,2022-05-03,"GetCreditInfo,GetCreditInfo","2022-05-03 14:52:28,2022-05-03 14:52:35",7,1
1,1,2022-06-16,"UseLoanManage,Login,GetCreditInfo","2022-06-16 23:58:41,2022-06-16 23:58:41,2022-0...",1,1
2,7,2022-05-22,GetCreditInfo,2022-05-22 16:39:49,0,1
3,9,2022-05-21,"GetCreditInfo,GetCreditInfo,GetCreditInfo","2022-05-21 23:37:58,2022-05-21 23:43:33,2022-0...",354,1
4,11,2022-03-24,"GetCreditInfo,UseLoanManage,GetCreditInfo,UseP...","2022-03-24 10:53:59,2022-03-24 10:54:07,2022-0...",692,1


In [40]:
# user_id, date 기준으로 정렬해준다
new_log_event_df_merge = new_log_event_df_merge.sort_values(by=['user_id', 'date']).reset_index(drop=True)
new_log_event_df_merge.head()

Unnamed: 0,user_id,date,events,timestamp,time,cluster
0,1,2022-05-03,"GetCreditInfo,GetCreditInfo","2022-05-03 14:52:28,2022-05-03 14:52:35",7,1
1,1,2022-06-16,"UseLoanManage,Login,GetCreditInfo","2022-06-16 23:58:41,2022-06-16 23:58:41,2022-0...",1,1
2,7,2022-05-22,GetCreditInfo,2022-05-22 16:39:49,0,1
3,9,2022-05-21,"GetCreditInfo,GetCreditInfo,GetCreditInfo","2022-05-21 23:37:58,2022-05-21 23:43:33,2022-0...",354,1
4,11,2022-03-24,"GetCreditInfo,UseLoanManage,GetCreditInfo,UseP...","2022-03-24 10:53:59,2022-03-24 10:54:07,2022-0...",692,1


In [41]:
# event, timestamp는 더이상 필요없으므로 삭제하기
new_log_event_df_merge.drop(['events', 'timestamp'], axis=1, inplace=True)

In [42]:
new_log_event_df_merge

Unnamed: 0,user_id,date,time,cluster
0,1,2022-05-03,7,1
1,1,2022-06-16,1,1
2,7,2022-05-22,0,1
3,9,2022-05-21,354,1
4,11,2022-03-24,692,1
...,...,...,...,...
3608000,879693,2022-06-29,0,5
3608001,879694,2022-03-31,655,2
3608002,879695,2022-05-27,171,2
3608003,879696,2022-03-14,400,0


In [43]:
new_log_event_df_merge.to_csv('data/new_log_event_cluster.csv', index=False)