# 비회원 고객 학습 데이터 구성
- 트래커 모델에 의해 비회원 고객의 고객 유형(`clnt_type`)이 예측되었습니다.
- 상품 추천 모델에 맞게 비회원 고객 학습 데이터를 구성합니다.

# import modules & load data

In [1]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_rows", 500)            
pd.set_option("display.max_columns", 500)

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib
import missingno as msno
import seaborn as sns

# 그래프 마이너스 기호 깨짐 방지 '-'
matplotlib.rcParams['axes.unicode_minus'] = False
# 그래프 한글 깨짐 방지
font_path = 'C:/Windows/Fonts/malgun.ttf'
fontprop = fm.FontProperties(fname=font_path, size=18)

In [2]:
cks_dtype = {'clnt_id':'int64','sess_id':'int64','hit_seq':'int64','action_type':'int64','biz_unit':'category','sess_dt':'object','hit_tm':'object',
             'hit_pss_tm':'int64','trans_id':'float64','sech_kwd':'object','tot_pag_view_ct':'float64','tot_sess_hr_v':'float64','trfc_src':'category',
             'dvc_ctg_nm':'object','pd_c':'object','de_dt':'object','de_tm':'object','buy_am':'int64','buy_ct':'int64','clnt_gender':'category'}
df1online = pd.read_csv('./DATA/온라인 행동 정보(CKS).csv', dtype=cks_dtype)
df2trade = pd.read_csv('./DATA/거래 정보(CKS).csv', dtype=cks_dtype)
df3custom = pd.read_csv('./DATA/고객 정보(CKS).csv', dtype=cks_dtype)
df4item = pd.read_csv('./DATA/상품분류 정보(CKS).csv', dtype=cks_dtype)

df2trade.sort_values(by = ['clnt_id', 'trans_seq'], inplace=True)
df2trade.reset_index(drop = True, inplace = True)
df2trade.head()

Unnamed: 0,clnt_id,trans_id,trans_seq,biz_unit,pd_c,de_dt,de_tm,buy_am,buy_ct
0,2,42449.0,1,A02,1015.0,20190704,15:34,46430,1
1,2,62037.0,1,A03,92.0,20190729,23:47,36000,20
2,2,64691.0,1,A03,186.0,20190731,21:25,3790,1
3,2,65505.0,1,A03,92.0,20190801,22:00,9000,5
4,2,72330.0,1,A02,857.0,20190809,10:14,34970,1


# 네비게이터 모델을 위한 데이터프레임 생성
- 네비게이터 모델은 맞춤 상품 추천 모델의 별칭입니다.

### for_navi_raw
- 본격적으로 네비게이터 모델에 필요한 데이터형태의 시초가 되는 데이터입니다.

In [3]:
df3_unknown = df3custom[df3custom['clnt_gender'].isnull()] # 비회원만을 가져옵니다.
df3_unknown = df3_unknown[df3_unknown['clnt_age'] != 10].reset_index(drop = True) # 10대를 제거합니다.
print(df3_unknown.shape)
display(df3_unknown.head())

(60154, 3)


Unnamed: 0,clnt_id,clnt_gender,clnt_age
0,1,,
1,3,,
2,4,,
3,5,,
4,6,,


In [72]:
for_navi_raw = df1online[df1online['clnt_id'].isin(df3_unknown['clnt_id'].to_list())]
for_navi_raw = for_navi_raw.sort_values(by = ['clnt_id', 'sess_id', 'hit_seq']).reset_index(drop = True)
for_navi_raw.head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,pv_hr
0,1,1,1,0,A01,20190911,16:14,11880,,과일선물세트,14.0,124.0,unknown,mobile_app,0.112903
1,1,1,2,0,A01,20190911,16:15,22432,,과일선물세트 백화점,14.0,124.0,unknown,mobile_app,0.112903
2,1,1,3,0,A01,20190911,16:15,36140,,과일바구니,14.0,124.0,unknown,mobile_app,0.112903
3,1,2,1,0,A01,20190922,14:09,41584,,초등가을잠바,45.0,424.0,unknown,mobile_app,0.106132
4,1,2,2,0,A01,20190922,14:10,56113,,초등가을점퍼,45.0,424.0,unknown,mobile_app,0.106132


- 데이터 저장

In [73]:
for_navi_raw[['clnt_id','sess_id']].to_csv('./navigator/# 식별화된 비회원 데이터_id정보.csv', index=False)

#### `buy_id`변수 생성
- 기존 온라인행동데이터는 고객이 로그인을 하는 시점에서 로그아웃의 시점까지 데이터가 기록됩니다.
- `buy_id`변수는 고객 행동을 로그인/로그아웃이 아닌, 상품 구매를 기점으로 행동을 구분하기 위한 컬럼입니다.
- 상품 구매에 맞추어진 데이터로 맞춤 상품을 추천합니다.
- `buy_id`를 고려하여 아래의 `realtime`, `latest` 키워드를 생성합니다.

In [74]:
def buy_id(df):
    trans_list = list(df[~df['trans_id'].isnull()]['trans_id'].unique())
    if len(trans_list) > 0:
        idx_list = [df.index[0]]
        for i in trans_list:
            idx_list.append(df[df['trans_id'] == i].index.max())
        for j in range(len(idx_list)-1):
            if j == 0:
                df.loc[idx_list[j] : idx_list[j+1], 'buy_id'] = j+1
            else:
                df.loc[ idx_list[j]+1 :idx_list[j+1], 'buy_id'] = j+1
    return df

In [75]:
for_navi_raw['buy_id'] = 0
for_navi_raw = for_navi_raw.groupby(['clnt_id', 'sess_id']).apply(lambda x: buy_id(x))
for_navi_raw.reset_index(drop = True, inplace = True)
for_navi_raw.head(3)

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,pv_hr,buy_id
0,1,1,1,0,A01,20190911,16:14,11880,,과일선물세트,14.0,124.0,unknown,mobile_app,0.112903,0
1,1,1,2,0,A01,20190911,16:15,22432,,과일선물세트 백화점,14.0,124.0,unknown,mobile_app,0.112903,0
2,1,1,3,0,A01,20190911,16:15,36140,,과일바구니,14.0,124.0,unknown,mobile_app,0.112903,0


#### `hit_tm_class`변수 생성
- `22:05`, `17:17` 등 시각으로 표현되던 `hit_tm` 변수에서 시간대만을 가져와 카테고리화 합니다.

In [76]:
def time_to_int(x):
    return int(x.split(':')[0])

In [77]:
for_navi_raw['hit_tm_class'] = for_navi_raw['hit_tm'].apply(lambda x: time_to_int(x))
for_navi_raw.head(3)

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,pv_hr,buy_id,hit_tm_class
0,1,1,1,0,A01,20190911,16:14,11880,,과일선물세트,14.0,124.0,unknown,mobile_app,0.112903,0,16
1,1,1,2,0,A01,20190911,16:15,22432,,과일선물세트 백화점,14.0,124.0,unknown,mobile_app,0.112903,0,16
2,1,1,3,0,A01,20190911,16:15,36140,,과일바구니,14.0,124.0,unknown,mobile_app,0.112903,0,16


#### `realtime`변수 생성
- 고객의 실시간 행동 정보를 반영하는 변수입니다.

##### `realtime_hit_pss_tm`
- 구매 발생까지 걸린 시간(밀리초)

In [78]:
def realtime_hit_pss_tm(df):
    if df['buy_id'].nunique() > 1:
        id_list = list(df['buy_id'].unique())
        for i in range(len(id_list)-1):
            if i > 1:
                temp_idx = df[df['buy_id'] == id_list[i]].index
                tail_value = df[df['buy_id'] == id_list[i+1]]['hit_pss_tm'].iloc[-1]
                df.loc[temp_idx, 'realtime_hit_pss_tm'] = df.loc[temp_idx, 'hit_pss_tm'] - tail_value
            else:
                temp_idx = df[df['buy_id'] == i].index
                df.loc[temp_idx, 'realtime_hit_pss_tm'] = df.loc[temp_idx, 'hit_pss_tm']
    else:
        df['realtime_hit_pss_tm'] = df['hit_pss_tm']
    return df

In [79]:
for_navi_raw = for_navi_raw.groupby(['clnt_id', 'sess_id']).apply(lambda x: realtime_hit_pss_tm(x))
for_navi_raw.head(3)

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,pv_hr,buy_id,hit_tm_class,realtime_hit_pss_tm
0,1,1,1,0,A01,20190911,16:14,11880,,과일선물세트,14.0,124.0,unknown,mobile_app,0.112903,0,16,11880.0
1,1,1,2,0,A01,20190911,16:15,22432,,과일선물세트 백화점,14.0,124.0,unknown,mobile_app,0.112903,0,16,22432.0
2,1,1,3,0,A01,20190911,16:15,36140,,과일바구니,14.0,124.0,unknown,mobile_app,0.112903,0,16,36140.0


##### `realtime_hit_seq`
- 구매 발생까지의 히트시퀀스

In [80]:
def realtime_hit_seq(df):
    if df['buy_id'].nunique() > 1:
        id_list = df['buy_id'].unique()
        for j, i in enumerate(id_list):
            if j > 1:
                temp_idx = df[df['buy_id'] == id_list[j]].index
                tail_value = df[df['buy_id'] == id_list[j-1]]['hit_seq'].iloc[-1]
                df.loc[temp_idx, 'realtime_hit_seq'] = df.loc[temp_idx, 'hit_seq'] - tail_value
            else:
                temp_idx = df[df['buy_id'] == i].index
                df.loc[temp_idx, 'realtime_hit_seq'] = df.loc[temp_idx, 'hit_seq']
    else:
        df['realtime_tot_pag_v'] = df['hit_seq']
    return df

In [81]:
for_navi_raw = for_navi_raw.groupby(['clnt_id', 'sess_id']).apply(lambda x: realtime_hit_seq(x))
for_navi_raw.head(3)

Unnamed: 0,action_type,biz_unit,buy_id,clnt_id,dvc_ctg_nm,hit_pss_tm,hit_seq,hit_tm,hit_tm_class,pv_hr,realtime_hit_pss_tm,realtime_hit_seq,realtime_tot_pag_v,sech_kwd,sess_dt,sess_id,tot_pag_view_ct,tot_sess_hr_v,trans_id,trfc_src
0,0,A01,0,1,mobile_app,11880,1,16:14,16,0.112903,11880.0,,1.0,과일선물세트,20190911,1,14.0,124.0,,unknown
1,0,A01,0,1,mobile_app,22432,2,16:15,16,0.112903,22432.0,,2.0,과일선물세트 백화점,20190911,1,14.0,124.0,,unknown
2,0,A01,0,1,mobile_app,36140,3,16:15,16,0.112903,36140.0,,3.0,과일바구니,20190911,1,14.0,124.0,,unknown


##### `realtime_num_kwds`
- 세션 내 구매가 발생하기까지 검색 행동을 한 횟수입니다.
- 세션에서 구매완료 행동을 하기 까지 3번의 검색을 했다 => `realtime_num_kwds` = 3

In [82]:
for_navi_raw['realtime_num_kwds'] = 0 # 초기값을 넣어준다

In [83]:
def realtime_num_kwds(df):
    for i in range(len(df)): # buy_id 씩 끊어서 반복문 
        if i >= 1: # i = 0인건 제외하고 
            temp_df = df.iloc[ : i , :] # i 이전까지에 대하여 검색어가 있는 로우 개수가 i번째 로우값에 들어가 
            df.iloc[i, -1] = len(temp_df[~temp_df['sech_kwd'].isnull()])
    return df

In [84]:
# for_navi_raw = for_navi_raw.groupby(['clnt_id', 'sess_id', 'buy_id']).apply(lambda x: realtime_num_kwds(x))
for_navi_raw[['clnt_id', 'sess_id', 'hit_seq', 'action_type', 'sech_kwd', 'realtime_num_kwds']].head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,sech_kwd,realtime_num_kwds
0,1,1,1,0,과일선물세트,0
1,1,1,2,0,과일선물세트 백화점,0
2,1,1,3,0,과일바구니,0
3,1,2,1,0,초등가을잠바,0
4,1,2,2,0,초등가을점퍼,0


#### `latest`변수 생성
- 최근 세션에 기록된 행동 정보를 반영하는 변수입니다.
- `latest_kwd_1`, `latest_kwd_2`, `latest_kwd_3`, `latest_kwd_4`, `latest_kwd_5`, `latest_kwd_6`
    - 최근 검색한 6개의 검색어를 반영합니다.
- `latest_pv_hr_1`, `latest_pv_hr_2`, `latest_pv_hr_3`
    - 최근 3번의 쇼핑 동안의 페이지 서핑 속도를 반영합니다.

##### `latest_kwd`

In [85]:
def latest_kwds(df):
    for j in range(len(df)):
        if j > 0:
            temp_df = df.iloc[ : j , :]
            temp_idx = temp_df.tail(1).index
            kwd_list = temp_df[~temp_df['sech_kwd'].isnull()]['sech_kwd'].tail(6).tolist()
            if len(kwd_list) > 0:
                kwd_list.reverse()
                for idx, i in enumerate(kwd_list) :
                    df.loc[temp_idx+1, f'latest_kwd_{idx+1}'] = i
    return df

In [86]:
# 먼저 latest_kwd_x 컬럼 생성
for i in range(1,7):
    for_navi_raw[f'latest_kwd_{i}'] = -1

In [87]:
for_navi_raw = for_navi_raw.groupby(['clnt_id', 'sess_id', 'buy_id']).apply(lambda x: latest_kwds(x))
for_navi_raw[['clnt_id', 'sess_id', 'hit_seq', 'action_type', 'sech_kwd', 'latest_kwd_1', 'latest_kwd_2', 'latest_kwd_3', 'latest_kwd_4', 'latest_kwd_5', 'latest_kwd_6']].head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,sech_kwd,latest_kwd_1,latest_kwd_2,latest_kwd_3,latest_kwd_4,latest_kwd_5,latest_kwd_6
0,1,1,1,0,과일선물세트,-1,-1,-1,-1,-1,-1
1,1,1,2,0,과일선물세트 백화점,과일선물세트,-1,-1,-1,-1,-1
2,1,1,3,0,과일바구니,과일선물세트 백화점,과일선물세트,-1,-1,-1,-1
3,1,2,1,0,초등가을잠바,-1,-1,-1,-1,-1,-1
4,1,2,2,0,초등가을점퍼,초등가을잠바,-1,-1,-1,-1,-1


##### `latest_pv_hr`

- 온라인행동데이터에 `pv_hr`변수를 생성합니다
- `pv_hr`은 세션내 총페이지뷰수/총 세션시간 으로, 단위 페이지당 머무른 시간, 즉, 서핑속도를 의미합니다.
- `latest_pv_hr`은 고객의 최근 세 페이지 서핑 속도를 반영합니다.

In [88]:
df1online['pv_hr'] = df1online['tot_pag_view_ct'] /  df1online['tot_sess_hr_v']
temp = df1online[['clnt_id', 'sess_id', 'tot_pag_view_ct', 'tot_sess_hr_v', 'pv_hr']] 
df_pv_hr = temp.groupby(['clnt_id', 'sess_id'], as_index = False).first().reset_index(drop = True)
df_pv_hr.head()

Unnamed: 0,clnt_id,sess_id,tot_pag_view_ct,tot_sess_hr_v,pv_hr
0,1,1,14.0,124.0,0.112903
1,1,2,45.0,424.0,0.106132
2,2,1,61.0,911.0,0.066959
3,2,2,12.0,134.0,0.089552
4,2,3,12.0,942.0,0.012739


In [89]:
for_navi_raw = for_navi_raw.merge(df_pv_hr, how = 'left')
for_navi_raw['latest_pv_hr_1'] = 0
for_navi_raw['latest_pv_hr_2'] = 0
for_navi_raw['latest_pv_hr_3'] = 0
for_navi_raw.head()

Unnamed: 0,action_type,biz_unit,buy_id,clnt_id,dvc_ctg_nm,hit_pss_tm,hit_seq,hit_tm,hit_tm_class,pv_hr,realtime_hit_pss_tm,realtime_hit_seq,realtime_tot_pag_v,sech_kwd,sess_dt,sess_id,tot_pag_view_ct,tot_sess_hr_v,trans_id,trfc_src,realtime_num_kwds,latest_kwd_1,latest_kwd_2,latest_kwd_3,latest_kwd_4,latest_kwd_5,latest_kwd_6,latest_pv_hr_1,latest_pv_hr_2,latest_pv_hr_3
0,0,A01,0,1,mobile_app,11880,1,16:14,16,0.112903,11880.0,,1.0,과일선물세트,20190911,1,14.0,124.0,,unknown,0,-1,-1,-1,-1,-1,-1,0,0,0
1,0,A01,0,1,mobile_app,22432,2,16:15,16,0.112903,22432.0,,2.0,과일선물세트 백화점,20190911,1,14.0,124.0,,unknown,0,과일선물세트,-1,-1,-1,-1,-1,0,0,0
2,0,A01,0,1,mobile_app,36140,3,16:15,16,0.112903,36140.0,,3.0,과일바구니,20190911,1,14.0,124.0,,unknown,0,과일선물세트 백화점,과일선물세트,-1,-1,-1,-1,0,0,0
3,0,A01,0,1,mobile_app,41584,1,14:09,14,0.106132,41584.0,,1.0,초등가을잠바,20190922,2,45.0,424.0,,unknown,0,-1,-1,-1,-1,-1,-1,0,0,0
4,0,A01,0,1,mobile_app,56113,2,14:10,14,0.106132,56113.0,,2.0,초등가을점퍼,20190922,2,45.0,424.0,,unknown,0,초등가을잠바,-1,-1,-1,-1,-1,0,0,0


In [90]:
# clnt_id로 그룹화
def latest_pv_hr(df):
    temp_df = df[['sess_id', 'pv_hr']].groupby('sess_id').first()
    for s in df['sess_id'].unique():
        temp = temp_df[temp_df.index < s].tail(3)
        temp_idx = df[df['sess_id'] == s].index
        for i in range(len(temp)):
              df.loc[temp_idx, f'latest_pv_hr_{i+1}'] = temp.iloc[-(i+1), 0]
    return df    

In [91]:
for_navi_raw = for_navi_raw.groupby('clnt_id').apply(lambda x : latest_pv_hr(x))
for_navi_raw.head()

Unnamed: 0,action_type,biz_unit,buy_id,clnt_id,dvc_ctg_nm,hit_pss_tm,hit_seq,hit_tm,hit_tm_class,pv_hr,realtime_hit_pss_tm,realtime_hit_seq,realtime_tot_pag_v,sech_kwd,sess_dt,sess_id,tot_pag_view_ct,tot_sess_hr_v,trans_id,trfc_src,realtime_num_kwds,latest_kwd_1,latest_kwd_2,latest_kwd_3,latest_kwd_4,latest_kwd_5,latest_kwd_6,latest_pv_hr_1,latest_pv_hr_2,latest_pv_hr_3
0,0,A01,0,1,mobile_app,11880,1,16:14,16,0.112903,11880.0,,1.0,과일선물세트,20190911,1,14.0,124.0,,unknown,0,-1,-1,-1,-1,-1,-1,0.0,0.0,0.0
1,0,A01,0,1,mobile_app,22432,2,16:15,16,0.112903,22432.0,,2.0,과일선물세트 백화점,20190911,1,14.0,124.0,,unknown,0,과일선물세트,-1,-1,-1,-1,-1,0.0,0.0,0.0
2,0,A01,0,1,mobile_app,36140,3,16:15,16,0.112903,36140.0,,3.0,과일바구니,20190911,1,14.0,124.0,,unknown,0,과일선물세트 백화점,과일선물세트,-1,-1,-1,-1,0.0,0.0,0.0
3,0,A01,0,1,mobile_app,41584,1,14:09,14,0.106132,41584.0,,1.0,초등가을잠바,20190922,2,45.0,424.0,,unknown,0,-1,-1,-1,-1,-1,-1,0.112903,0.0,0.0
4,0,A01,0,1,mobile_app,56113,2,14:10,14,0.106132,56113.0,,2.0,초등가을점퍼,20190922,2,45.0,424.0,,unknown,0,초등가을잠바,-1,-1,-1,-1,-1,0.112903,0.0,0.0


#### 외부데이터 join 및 요일변수 생성
- 더위체감지수: `scorch`
- 미세먼지: `pm2.5`, `pm10`
- `sess_dt`값에 따른 요일 변수 생성

In [92]:
df_hot = pd.read_csv('./DATA/외부데이터_더위체감지수(정오기준).csv')
df_dust = pd.read_csv('./DATA/외부데이터_미세먼지.csv')

In [93]:
for_navi_raw.loc[:, 'sess_dt'] = for_navi_raw.loc[:, 'sess_dt'].astype('int')
for_navi_raw = for_navi_raw.merge(df_hot, how ='left', on = 'sess_dt')
for_navi_raw = for_navi_raw.merge(df_dust, how = 'left', on = 'sess_dt')
for_navi_raw['weekday'] = pd.to_datetime(for_navi_raw['sess_dt'], format='%Y%m%d').apply(lambda x: x.weekday())
for_navi_raw.head()

Unnamed: 0,action_type,biz_unit,buy_id,clnt_id,dvc_ctg_nm,hit_pss_tm,hit_seq,hit_tm,hit_tm_class,pv_hr,realtime_hit_pss_tm,realtime_hit_seq,realtime_tot_pag_v,sech_kwd,sess_dt,sess_id,tot_pag_view_ct,tot_sess_hr_v,trans_id,trfc_src,realtime_num_kwds,latest_kwd_1,latest_kwd_2,latest_kwd_3,latest_kwd_4,latest_kwd_5,latest_kwd_6,latest_pv_hr_1,latest_pv_hr_2,latest_pv_hr_3,scorch,pm2.5,pm10,weekday
0,0,A01,0,1,mobile_app,11880,1,16:14,16,0.112903,11880.0,,1.0,과일선물세트,20190911,1,14.0,124.0,,unknown,0,-1,-1,-1,-1,-1,-1,0.0,0.0,0.0,25,4,8,2
1,0,A01,0,1,mobile_app,22432,2,16:15,16,0.112903,22432.0,,2.0,과일선물세트 백화점,20190911,1,14.0,124.0,,unknown,0,과일선물세트,-1,-1,-1,-1,-1,0.0,0.0,0.0,25,4,8,2
2,0,A01,0,1,mobile_app,36140,3,16:15,16,0.112903,36140.0,,3.0,과일바구니,20190911,1,14.0,124.0,,unknown,0,과일선물세트 백화점,과일선물세트,-1,-1,-1,-1,0.0,0.0,0.0,25,4,8,2
3,0,A01,0,1,mobile_app,41584,1,14:09,14,0.106132,41584.0,,1.0,초등가을잠바,20190922,2,45.0,424.0,,unknown,0,-1,-1,-1,-1,-1,-1,0.112903,0.0,0.0,16,3,6,6
4,0,A01,0,1,mobile_app,56113,2,14:10,14,0.106132,56113.0,,2.0,초등가을점퍼,20190922,2,45.0,424.0,,unknown,0,초등가을잠바,-1,-1,-1,-1,-1,0.112903,0.0,0.0,16,3,6,6


#### df3_unknown과 병합
- 트래커 모델에 의해 예측된 고객 유형(`clnt_type`)을 merge합니다.

In [94]:
tracked = pd.read_csv('./tracker/tracked.csv')
tracked = tracked.groupby('clnt_id').first()['clnt_type'].reset_index()
tracked.head()

Unnamed: 0,clnt_id,clnt_type
0,1,2
1,3,2
2,4,2
3,5,2
4,6,2


In [95]:
for_navi_raw = for_navi_raw.merge(tracked, how = 'left', on = 'clnt_id')
for_navi_raw = for_navi_raw[['clnt_id', 'clnt_type', 'sess_id', 'hit_seq', 'action_type', 'biz_unit', 'sess_dt',
       'hit_tm', 'hit_pss_tm', 'trans_id', 'sech_kwd', 'tot_pag_view_ct',
       'tot_sess_hr_v', 'trfc_src', 'dvc_ctg_nm', 'buy_id', 'hit_tm_class',
       'realtime_num_kwds', 'realtime_hit_pss_tm', 'latest_kwd_1',
       'latest_kwd_2', 'latest_kwd_3', 'latest_kwd_4', 'latest_kwd_5',
       'latest_kwd_6', 'pv_hr', 'latest_pv_hr_1', 'latest_pv_hr_2',
       'latest_pv_hr_3', 'realtime_hit_seq', 'scorch', 'pm2.5', 'pm10',
       'weekday']]

- 학습에 참여하시키지 않는 변수는 제거합니다.

In [96]:
for_navi = for_navi_raw.drop(['trans_id', 'hit_tm', 'sech_kwd', 'sess_dt', 'clnt_id', 'tot_pag_view_ct', 'tot_sess_hr_v', 'buy_id', 'pv_hr'], axis = 1)
for_navi.head()

Unnamed: 0,clnt_type,sess_id,hit_seq,action_type,biz_unit,hit_pss_tm,trfc_src,dvc_ctg_nm,hit_tm_class,realtime_num_kwds,realtime_hit_pss_tm,latest_kwd_1,latest_kwd_2,latest_kwd_3,latest_kwd_4,latest_kwd_5,latest_kwd_6,latest_pv_hr_1,latest_pv_hr_2,latest_pv_hr_3,realtime_hit_seq,scorch,pm2.5,pm10,weekday
0,2,1,1,0,A01,11880,unknown,mobile_app,16,0,11880.0,-1,-1,-1,-1,-1,-1,0.0,0.0,0.0,,25,4,8,2
1,2,1,2,0,A01,22432,unknown,mobile_app,16,0,22432.0,과일선물세트,-1,-1,-1,-1,-1,0.0,0.0,0.0,,25,4,8,2
2,2,1,3,0,A01,36140,unknown,mobile_app,16,0,36140.0,과일선물세트 백화점,과일선물세트,-1,-1,-1,-1,0.0,0.0,0.0,,25,4,8,2
3,2,2,1,0,A01,41584,unknown,mobile_app,14,0,41584.0,-1,-1,-1,-1,-1,-1,0.112903,0.0,0.0,,16,3,6,6
4,2,2,2,0,A01,56113,unknown,mobile_app,14,0,56113.0,초등가을잠바,-1,-1,-1,-1,-1,0.112903,0.0,0.0,,16,3,6,6


In [97]:
from sklearn.preprocessing import LabelEncoder

# 키워드 인코딩
encoder_kwd = LabelEncoder()
encoder_kwd.fit(np.load('./navigator/encoder_kwd.npy', allow_pickle = True))

for i in range(1,7):
    temp = for_navi_raw[(for_navi[f'latest_kwd_{i}']!='-1') &( for_navi[f'latest_kwd_{i}']!=-1)].index
    for_navi.loc[temp, f'latest_kwd_{i}'] = encoder_kwd.transform(for_navi.loc[temp, f'latest_kwd_{i}'])

# 키워드 정수화
for i in range(1, 7):
    for_navi.loc[:, f'latest_kwd_{i}'] = for_navi.loc[:, f'latest_kwd_{i}'].astype('int')

# dvc_ctg_nm 인코딩
encoder_dvc = LabelEncoder()
encoder_dvc.fit(np.load('./navigator/encoder_dvc_ctg_nm.npy', allow_pickle = True))
for_navi.loc[:, 'dvc_ctg_nm'] = encoder_dvc.transform(for_navi.loc[:, 'dvc_ctg_nm'])

# trfc_src 인코딩
encoder_trfc = LabelEncoder()
encoder_trfc.fit(np.load('./navigator/encoder_trfc_src.npy', allow_pickle = True))
for_navi.loc[:, 'trfc_src'] = encoder_trfc.transform(for_navi.loc[:, 'trfc_src'])

# biz_unit 인코딩
encoder_biz = LabelEncoder()
encoder_biz.fit(np.load('./navigator/encoder_biz_unit.npy', allow_pickle = True))
for_navi.loc[: , 'biz_unit'] = encoder_biz.transform(for_navi.loc[:, 'biz_unit'])

In [98]:
for_navi.head()

Unnamed: 0,clnt_type,sess_id,hit_seq,action_type,biz_unit,hit_pss_tm,trfc_src,dvc_ctg_nm,hit_tm_class,realtime_num_kwds,realtime_hit_pss_tm,latest_kwd_1,latest_kwd_2,latest_kwd_3,latest_kwd_4,latest_kwd_5,latest_kwd_6,latest_pv_hr_1,latest_pv_hr_2,latest_pv_hr_3,realtime_hit_seq,scorch,pm2.5,pm10,weekday
0,2,1,1,0,0,11880,6,1,16,0,11880.0,-1,-1,-1,-1,-1,-1,0.0,0.0,0.0,,25,4,8,2
1,2,1,2,0,0,22432,6,1,16,0,22432.0,15133,-1,-1,-1,-1,-1,0.0,0.0,0.0,,25,4,8,2
2,2,1,3,0,0,36140,6,1,16,0,36140.0,15134,15133,-1,-1,-1,-1,0.0,0.0,0.0,,25,4,8,2
3,2,2,1,0,0,41584,6,1,14,0,41584.0,-1,-1,-1,-1,-1,-1,0.112903,0.0,0.0,,16,3,6,6
4,2,2,2,0,0,56113,6,1,14,0,56113.0,81935,-1,-1,-1,-1,-1,0.112903,0.0,0.0,,16,3,6,6


- 데이터 저장

In [None]:
for_navi.to_csv('./navigator/# 식별화된 비회원 데이터_final_for_predict.csv', index = False)