# 컴피티션 링크
- https://www.kaggle.com/t/2e45abe9f1434b59a3358365432a48bb

# 고객별 백화점 구매기록 데이터셋
- 미혼(1) or 기혼(0) 고객을 예측
- 학습용 구매기록 데이터
    - https://drive.google.com/file/d/1tg41qXu02FK55bFa8P1Gx2URhwPUCoQQ/view?usp=sharing
- 학습용 정답 데이터
    - https://drive.google.com/file/d/1-9AcU9nAoO4SzSmqdCOYTvWNpA5Pdt3q/view?usp=sharing
- 테스트용 구매기록 데이터
    - https://drive.google.com/file/d/1-AwDfGlHm9rNtpnHIWOK96jBJYy3f2SZ/view?usp=sharing
- 제출 양식 데이터
    - https://drive.google.com/file/d/1-Qv7SlsY5Eu3bRR7Z0IUJyLkOO1Fl3y6/view?usp=sharing

In [818]:
# from google.colab import drive
# drive.mount('/content/drive')

- 데이터 경로 변수

In [819]:
# DATA_PATH = "/content/drive/MyDrive/파이널프로젝트/data/"
# DATA_PATH

In [820]:
DATA_PATH = "data/"
DATA_PATH

'data/'

- 데이터 불러오기

In [821]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

In [822]:
train_tr.describe()

Unnamed: 0,브랜드코드,구매가격
count,523105.0,523105.0
mean,5391.568913,98734.69
std,380.975185,313402.6
min,5100.0,-30820000.0
25%,5111.0,22520.0
50%,5237.0,52000.0
75%,5522.0,119700.0
max,6996.0,30820000.0


# 특성 공학(Feature Engineering)

## 날짜 형식으로 변환

In [823]:
train_tr["구매일시"] = pd.to_datetime(train_tr["구매일시"])
test_tr["구매일시"] = pd.to_datetime(test_tr["구매일시"])

## 날짜, 시간 컬럼 분할

In [824]:
train_tr["구매날짜"] = train_tr["구매일시"].dt.date
test_tr["구매날짜"] = test_tr["구매일시"].dt.date

train_tr["구매시각"] = train_tr["구매일시"].dt.time
test_tr["구매시각"] = test_tr["구매일시"].dt.time

train_tr["구매날짜"] = pd.to_datetime(train_tr["구매날짜"], errors="coerce")
test_tr["구매날짜"] = pd.to_datetime(test_tr["구매날짜"], errors="coerce")

train_tr.shape, test_tr.shape

((523105, 9), (441196, 9))

## 환불한 내역에 대한 구매내역 및 환불내역 삭제

In [825]:
purchase_train = train_tr[train_tr["구매가격"] > 0].reset_index()
purchase_test = test_tr[test_tr["구매가격"] > 0].reset_index()

refund_train = train_tr[train_tr["구매가격"] < 0].reset_index()
refund_test = test_tr[test_tr["구매가격"] < 0].reset_index()

In [826]:
# 환불 금액의 절대값과 동일한 구매 내역 찾기
refund_pairs_train = pd.merge(
    refund_train,
    purchase_train,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"],
    suffixes=("_refund", "_purchase")
)

# 환불 금액이 일치하는 경우만 선택
refund_pairs_train = refund_pairs_train[
    refund_pairs_train["구매가격_refund"].abs() == refund_pairs_train["구매가격_purchase"]
].copy()

# 환불 날짜 이후에 환불 금액과 일치하는 구매건
refund_pairs_train = refund_pairs_train[
    refund_pairs_train["구매날짜_purchase"] > refund_pairs_train["구매날짜_refund"]
]

repurchase_count_train = refund_pairs_train.groupby("ID").size().reset_index(name='환불후재구매브랜드_count')

repurchase_count_train

Unnamed: 0,ID,환불후재구매브랜드_count
0,train_10049,1
1,train_1006,1
2,train_10087,1
3,train_10106,1
4,train_10110,2
...,...,...
497,train_9943,1
498,train_9950,2
499,train_9956,1
500,train_998,1


In [827]:
# 테스트 데이터
refund_pairs_test = pd.merge(
    refund_test,
    purchase_test,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"],
    suffixes=("_refund", "_purchase")
)

refund_pairs_test = refund_pairs_test[
    refund_pairs_test["구매가격_refund"].abs() == refund_pairs_test["구매가격_purchase"]
].copy()

refund_pairs_test = refund_pairs_test[
    refund_pairs_test["구매날짜_purchase"] > refund_pairs_test["구매날짜_refund"]
]

repurchase_count_test = refund_pairs_test.groupby("ID").size().reset_index(name='환불후재구매브랜드_count')

repurchase_count_test

Unnamed: 0,ID,환불후재구매브랜드_count
0,test_10004,1
1,test_10019,2
2,test_10042,3
3,test_1005,1
4,test_10074,1
...,...,...
429,test_9932,1
430,test_9940,1
431,test_995,1
432,test_9974,2


In [828]:
# ID, 지점코드, 대분류, 중분류, 브랜드코드가 같은 구매내역과 환불내역 선택
refund_pairs_train = pd.merge(
    refund_train,
    purchase_train,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"], 
    suffixes=("_refund", "_purchase")
)

refund_pairs_test = pd.merge(
    refund_test,
    purchase_test,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"],
    suffixes=("_refund", "_purchase")
)

# 구매금액과 환불 금액이 일치하는 경우만 선택
refund_pairs_train = refund_pairs_train[
    refund_pairs_train["구매가격_refund"].abs() == refund_pairs_train["구매가격_purchase"]
].copy()

refund_pairs_test = refund_pairs_test[
    refund_pairs_test["구매가격_refund"].abs() == refund_pairs_test["구매가격_purchase"]
].copy()

# 환불 날짜와 시간이 구매 날짜와 시간보다 이후인 경우만 선택
refund_pairs_train = refund_pairs_train[
    (refund_pairs_train["구매날짜_refund"] > refund_pairs_train["구매날짜_purchase"]) |
    ((refund_pairs_train["구매날짜_refund"] == refund_pairs_train["구매날짜_purchase"]) & 
     (refund_pairs_train["구매시각_refund"] > refund_pairs_train["구매시각_purchase"]))
]

refund_pairs_test = refund_pairs_test[
    (refund_pairs_test["구매날짜_refund"] > refund_pairs_test["구매날짜_purchase"]) |
    ((refund_pairs_test["구매날짜_refund"] == refund_pairs_test["구매날짜_purchase"]) & 
     (refund_pairs_test["구매시각_refund"] > refund_pairs_test["구매시각_purchase"]))
]

# 환불 날짜/시각과 구매 날짜/시각의 차이를 계산
refund_pairs_train['time_diff'] = (
    pd.to_datetime(refund_pairs_train['구매날짜_refund'].astype(str) + ' ' + refund_pairs_train['구매시각_refund'].astype(str)) - 
    pd.to_datetime(refund_pairs_train['구매날짜_purchase'].astype(str) + ' ' + refund_pairs_train['구매시각_purchase'].astype(str))
)

refund_pairs_test['time_diff'] = (
    pd.to_datetime(refund_pairs_test['구매날짜_refund'].astype(str) + ' ' + refund_pairs_test['구매시각_refund'].astype(str)) - 
    pd.to_datetime(refund_pairs_test['구매날짜_purchase'].astype(str) + ' ' + refund_pairs_test['구매시각_purchase'].astype(str))
)

# _purchase 컬럼을 제외한 컬럼으로 그룹화하고 time_diff가 가장 작은 행만 선택
cols = [col for col in refund_pairs_train.columns if '_purchase' not in col and col != 'time_diff']
refund_pairs_train = refund_pairs_train.loc[refund_pairs_train.groupby(cols)['time_diff'].idxmin()]

cols = [col for col in refund_pairs_test.columns if '_purchase' not in col and col != 'time_diff']
refund_pairs_test = refund_pairs_test.loc[refund_pairs_test.groupby(cols)['time_diff'].idxmin()]

# time_diff 컬럼 제거
refund_pairs_train = refund_pairs_train.drop('time_diff', axis=1)
refund_pairs_test = refund_pairs_test.drop('time_diff', axis=1)

# 환불과 구매 인덱스 추출
index_refund_train = refund_pairs_train['index_refund'].values
index_purchase_train = refund_pairs_train['index_purchase'].values

index_refund_test = refund_pairs_test['index_refund'].values 
index_purchase_test = refund_pairs_test['index_purchase'].values

# train_tr, test_tr에서 해당 인덱스 제거
train_tr_clean = train_tr.drop(index=np.concatenate([index_refund_train, index_purchase_train]))
test_tr_clean = test_tr.drop(index=np.concatenate([index_refund_test, index_purchase_test]))

# 구매가격이 0보다 작은 행 제거
train_tr_clean = train_tr_clean[train_tr_clean['구매가격'] > 0]
test_tr_clean = test_tr_clean[test_tr_clean['구매가격'] > 0]

train_tr_clean.shape, test_tr_clean.shape

((456484, 9), (384916, 9))

## 중분류 컬럼 값 중 공백 포함 값에 대해 공백제거

In [829]:
train_tr_clean["중분류"] = train_tr_clean["중분류"].str.replace(" ", "")
test_tr_clean["중분류"] = test_tr_clean["중분류"].str.replace(" ", "")

## 새로 만든 feature와 병합할 고객ID로만 이루어진 데이터프레임 생성

In [830]:
train_ft = train_target[["ID"]]
test_ft = submit[["ID"]]

train_ft.shape, test_ft.shape

((14940, 1), (12225, 1))

## 구매날짜를 이용한 특성생성

In [831]:
# train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
# test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

In [832]:
train_today = train_tr_clean["구매날짜"].max() + pd.Timedelta(days=1)
test_today = test_tr_clean["구매날짜"].max() + pd.Timedelta(days=1)
train_today, test_today

(Timestamp('2005-04-30 00:00:00'), Timestamp('2005-04-30 00:00:00'))

In [833]:
agg_list = [
    # 일
    ("총방문일수", "nunique"),
    # ("첫구매날짜", 'min'),
    # ("마지막구매날짜", 'max'),
    ("백화점이용기간", lambda x: (x.max() - x.min()).days + 1),
    ("구매주기", lambda x: int(((x.max() - x.min()).days + 1) / x.dt.date.nunique())),
    # ("주말방문일수", lambda x: x[x.dt.weekday > 4].nunique()),
    ("일별평균구매건수", lambda x: x.count() / x.dt.date.nunique()),
    ("구매간격_표준편차", lambda x: np.std(np.diff([d.toordinal() for d in sorted(x)])) if len(x) > 1 else 0),
    
    # 요일
    ('평균구매요일', lambda x: x.dt.weekday.mean()),
    ("주구매요일", lambda x: x.dt.weekday.mode()[0]),
    ("월요일_구매비율", lambda x: np.mean(x.dt.weekday == 0)),
    ("화요일_구매비율", lambda x: np.mean(x.dt.weekday == 1)),
    ("수요일_구매비율", lambda x: np.mean(x.dt.weekday == 2)),
    ("목요일_구매비율", lambda x: np.mean(x.dt.weekday == 3)),
    ("금요일_구매비율", lambda x: np.mean(x.dt.weekday == 4)),
    ("토요일_구매비율", lambda x: np.mean(x.dt.weekday == 5)),
    ("일요일_구매비율", lambda x: np.mean(x.dt.weekday == 6)),
    
    # 월
    # ("1월_구매비율", lambda x: np.mean(x.dt.month == 1)),
    # ("2월_구매비율", lambda x: np.mean(x.dt.month == 2)),
    # ("3월_구매비율", lambda x: np.mean(x.dt.month == 3)),
    # ("4월_구매비율", lambda x: np.mean(x.dt.month == 4)),
    # ("5월_구매비율", lambda x: np.mean(x.dt.month == 5)),
    # ("6월_구매비율", lambda x: np.mean(x.dt.month == 6)),
    # ("7월_구매비율", lambda x: np.mean(x.dt.month == 7)),
    # ("8월_구매비율", lambda x: np.mean(x.dt.month == 8)),
    # ("9월_구매비율", lambda x: np.mean(x.dt.month == 9)),
    # ("10월_구매비율", lambda x: np.mean(x.dt.month == 10)),
    # ("11월_구매비율", lambda x: np.mean(x.dt.month == 11)),
    # ("12월_구매비율", lambda x: np.mean(x.dt.month == 12)),
    ("거래개월수", lambda x: x.dt.date.astype(str).str[:-3].nunique()),
    ("월별평균구매건수", lambda x: x.count() / x.dt.month.nunique()),
    ("월초구매비율", lambda x: np.mean(x.dt.day <= 10)),
    ("월중순구매비율", lambda x: np.mean((x.dt.day > 10) & (x.dt.day <= 20))),
    ("월말구매비율", lambda x: np.mean(x.dt.day >= 21)),
    
    # ("구매횟수_상반기", lambda x: np.mean((x.dt.month >= 1) & (x.dt.month <= 6))),
    # ("구매횟수_하반기", lambda x: np.mean((x.dt.month >= 7) & (x.dt.month <= 12))),
    
    # 성수기
    # ("추석성수기_구매비율", lambda x: np.mean((x.dt.month == 9) & (x.dt.day >= 18) | (x.dt.month == 10) & (x.dt.day <= 3))),
    # ("연말연시_구매비율", lambda x: np.mean((x.dt.month == 12) | (x.dt.month == 1) & (x.dt.day <= 10))),
    # ("설날성수기_구매비율", lambda x: np.mean((x.dt.month == 2) & (x.dt.day >= 1) & (x.dt.day <= 15))),
    
    # # 준성수기
    # ("여름시즌_구매비율", lambda x: np.mean((x.dt.month == 6) & (x.dt.day >= 15) | (x.dt.month == 7) | (x.dt.month == 8) & (x.dt.day <= 15))),
    # ("신학기_구매비율", lambda x: np.mean((x.dt.month == 3) & (x.dt.day >= 1) & (x.dt.day <= 15))),
    
    # # 중간기
    # ("여름끝추석전_구매비율", lambda x: np.mean((x.dt.month == 8) & (x.dt.day >= 16) | (x.dt.month == 9) & (x.dt.day <= 17))),
    # ("연말연시끝설날전_구매비율", lambda x: np.mean((x.dt.month == 1) & (x.dt.day >= 11) & (x.dt.day <= 31))),
    # ("신학기후_구매비율", lambda x: np.mean((x.dt.month == 3) & (x.dt.day >= 16) | (x.dt.month == 4) & (x.dt.day <= 10))),
    
    # # 비수기
    # ("초여름_구매비율", lambda x: np.mean((x.dt.month == 5) & (x.dt.day >= 17) | (x.dt.month == 6) & (x.dt.day <= 14))),
    # ("추석후연말전_구매비율", lambda x: np.mean((x.dt.month == 10) & (x.dt.day >= 4) | (x.dt.month == 11))),
    # ("봄철비수기_구매비율", lambda x: np.mean((x.dt.month == 4) & (x.dt.day >= 11) & (x.dt.day <= 29))),
    
    # 계절
    ('봄_구매비율', lambda x: np.mean(x.dt.month.isin([3,4,5]))),
    ('여름_구매비율', lambda x: np.mean(x.dt.month.isin([6,7,8]))),
    ('가을_구매비율', lambda x: np.mean(x.dt.month.isin([9,10,11]))),
    ('겨울_구매비율', lambda x: np.mean(x.dt.month.isin([1,2,12]))),
    
    #분기
    # ("1분기_구매비율", lambda x: np.mean(x.dt.quarter == 1)),
    # ("2분기_구매비율", lambda x: np.mean(x.dt.quarter == 2)),
    # ("3분기_구매비율", lambda x: np.mean(x.dt.quarter == 3)),
    # ("4분기_구매비율", lambda x: np.mean(x.dt.quarter == 4)),
    
    # 년
    # ("2004년_구매비율", lambda x: np.mean(x.dt.year == 2004)),
    # ("2005년_구매비율", lambda x: np.mean(x.dt.year == 2005)),
    ("연도별평균구매건수", lambda x: x.count() / x.dt.year.nunique()),
]

train_agg_list = agg_list + [("마지막구매후_경과일", lambda x: (pd.Timestamp(train_today) - x.max()).days)]
# tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(train_agg_list).reset_index()
tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,거래개월수,월별평균구매건수,월초구매비율,월중순구매비율,월말구매비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,연도별평균구매건수
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,6,2.333333,0.571429,0.214286,0.214286,0.071429,0.357143,0.285714,0.285714,7.0
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,11,3.818182,0.285714,0.285714,0.428571,0.357143,0.166667,0.357143,0.119048,21.0
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,12,8.5,0.323529,0.245098,0.431373,0.45098,0.117647,0.196078,0.235294,51.0
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,12,15.416667,0.410811,0.378378,0.210811,0.383784,0.194595,0.2,0.221622,92.5
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,10,3.6,0.416667,0.388889,0.194444,0.111111,0.472222,0.305556,0.111111,18.0


In [834]:
test_agg_list = agg_list + [("마지막구매후_경과일", lambda x: (pd.Timestamp(test_today) - x.max()).days)]
# tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(test_agg_list).reset_index()
tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,거래개월수,월별평균구매건수,월초구매비율,월중순구매비율,월말구매비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,연도별평균구매건수
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,4,1.75,0.285714,0.428571,0.285714,0.285714,0.285714,0.428571,0.0,7.0
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,3,1.333333,0.0,0.75,0.25,0.0,0.0,0.75,0.25,2.0
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,2,2.5,0.4,0.6,0.0,0.0,0.6,0.4,0.0,5.0
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,7,3.428571,0.25,0.208333,0.541667,0.083333,0.458333,0.375,0.083333,12.0
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,4,3.75,0.533333,0.133333,0.333333,0.266667,0.266667,0.333333,0.133333,7.5


In [835]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

## 구매시각을 이용한 특성생성

In [836]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

- 구매시간대 컬럼생성

In [837]:
# def shopping_time(df):
#     # 날짜별 쇼핑시간 계산
#     shopping_by_date = df.groupby(['ID', '구매날짜']).agg({
#         '구매시각': lambda x: (max(x).hour * 60 + max(x).minute - (min(x).hour * 60 + min(x).minute)) / 60
#     }).reset_index()
#     shopping_by_date.columns = ['ID', '구매날짜', '쇼핑시간']
    
#     # 쇼핑시간이 있는 경우만 통계 계산
#     shopping_stats = shopping_by_date[shopping_by_date['쇼핑시간'] > 0].groupby('ID').agg({
#         '쇼핑시간': ['mean', 'std', 'max', 'min', 'count']
#     }).reset_index()
    
#     shopping_stats.columns = [
#         'ID', 
#         '평균쇼핑시간', 
#         '쇼핑시간_표준편차', 
#         '최대쇼핑시간', 
#         '최소쇼핑시간',
#         '쇼핑일수'
#     ]
    
#     return shopping_stats

# # train/test 적용
# train_shopping = shopping_time(train_tr_clean)

# # 기존 feature에 병합
# train_ft = train_ft.merge(train_shopping, on='ID', how='left')

# # NaN값 처리
# shopping_cols = ['평균쇼핑시간', '쇼핑시간_표준편차', '최대쇼핑시간', '최소쇼핑시간', '쇼핑일수']
# train_ft[shopping_cols] = train_ft[shopping_cols].fillna(0)
# train_ft.head()

In [838]:
# test_shopping = shopping_time(test_tr_clean)
# test_ft = test_ft.merge(test_shopping, on='ID', how='left')
# test_ft[shopping_cols] = test_ft[shopping_cols].fillna(0)
# test_ft.head()

In [839]:
agg_list = [
    ("18시이전_구매비율", lambda x: np.mean([t.hour < 18 for t in x])),
    ("18시이후_구매비율", lambda x: np.mean([t.hour >= 18 for t in x])),
    ("오전_구매비율", lambda x: np.mean([t.hour < 12 for t in x])),
    ("오후_구매비율", lambda x: np.mean([t.hour >= 12 for t in x])),
    ("주구매시간대", lambda x: pd.Series([t.hour for t in x]).mode()[0]),
    ("평균구매시각", lambda x: np.mean([t.hour + t.minute/60 for t in x])),
    # ("야간구매비율", lambda x: np.mean([(t.hour >= 21) | (t.hour <= 23) for t in x])),
    # ("새벽구매비율", lambda x: np.mean([(t.hour >= 0) & (t.hour < 6) for t in x])),
    ("아침_구매비율", lambda x: np.mean([6 <= t.hour < 11 for t in x])),
    ("점심_구매비율", lambda x: np.mean([11 <= t.hour < 14 for t in x])),
    ("저녁_구매비율", lambda x: np.mean([17 <= t.hour < 21 for t in x])),
]

tmp = train_tr_clean.groupby("ID")["구매시각"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,연도별평균구매건수,18시이전_구매비율,18시이후_구매비율,오전_구매비율,오후_구매비율,주구매시간대,평균구매시각,아침_구매비율,점심_구매비율,저녁_구매비율
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,7.0,0.642857,0.357143,0.0,1.0,12,16.125,0.0,0.357143,0.571429
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,21.0,0.690476,0.309524,0.047619,0.952381,18,15.615873,0.02381,0.380952,0.428571
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,51.0,0.852941,0.147059,0.068627,0.931373,15,15.555392,0.019608,0.196078,0.254902
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,92.5,0.778378,0.221622,0.064865,0.935135,15,15.72,0.010811,0.248649,0.372973
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,18.0,0.75,0.25,0.111111,0.888889,16,15.833333,0.027778,0.194444,0.361111


In [840]:
tmp = test_tr_clean.groupby("ID")["구매시각"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,연도별평균구매건수,18시이전_구매비율,18시이후_구매비율,오전_구매비율,오후_구매비율,주구매시간대,평균구매시각,아침_구매비율,점심_구매비율,저녁_구매비율
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,7.0,1.0,0.0,0.0,1.0,14,14.078571,0.0,0.428571,0.0
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,2.0,1.0,0.0,0.25,0.75,11,15.095833,0.0,0.25,0.25
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,5.0,1.0,0.0,0.4,0.6,17,15.066667,0.0,0.4,0.6
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,12.0,0.458333,0.541667,0.0,1.0,18,17.884722,0.0,0.0,0.708333
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,7.5,0.533333,0.466667,0.0,1.0,19,17.278889,0.0,0.066667,0.6


In [841]:
agg_list = [
    # ("평일_18시이전_구매비율", lambda x: np.mean((x.dt.weekday < 5) & (x.dt.hour < 18))),
    # ("평일_18시이후_구매비율", lambda x: np.mean((x.dt.weekday < 5) & (x.dt.hour >= 18))),
    # ("주말_18시이전_구매비율", lambda x: np.mean((x.dt.weekday >= 5) & (x.dt.hour < 18))),
    # ("주말_18시이후_구매비율", lambda x: np.mean((x.dt.weekday >= 5) & (x.dt.hour >= 18))),
    ("주말구매비율", lambda x: np.mean(x.dt.weekday >= 5))
]

tmp = train_tr_clean.groupby('ID')['구매일시'].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left', on='ID')

tmp = test_tr_clean.groupby('ID')['구매일시'].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left', on='ID')

train_ft.shape, test_ft.shape

((14940, 35), (12225, 35))

In [842]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

## 지점을 이용한 특성생성

In [843]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

In [844]:
# agg_list = [
#     ("방문지점수", "nunique"),
#     ("주구매지점", lambda x: x.mode()[0]),
#     ("주구매지점_이용비율", lambda x: x[x == x.mode()[0]].count() / x.count()),
# ]

# tmp = train_tr_clean.groupby("ID")["지점코드"].agg(agg_list).reset_index()
# train_ft = train_ft.merge(tmp, how="left", on="ID")
# train_ft.head()

In [845]:
# tmp = test_tr_clean.groupby('ID')["지점코드"].agg(agg_list).reset_index()
# test_ft = test_ft.merge(tmp, how='left',on="ID")
# test_ft.head()

In [846]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

## 브랜드코드를 이용한 특성생성

In [847]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

In [848]:
agg_list = [
    ("브랜드코드_nunique", "nunique"),
    ("선호브랜드코드", lambda x: x.mode()[0]),
    ("선호브랜드코드_구매비율", lambda x: x[x == x.mode()[0]].count() / x.count())
]

tmp = train_tr_clean.groupby("ID")["브랜드코드"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,오후_구매비율,주구매시간대,평균구매시각,아침_구매비율,점심_구매비율,저녁_구매비율,주말구매비율,브랜드코드_nunique,선호브랜드코드,선호브랜드코드_구매비율
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,1.0,12,16.125,0.0,0.357143,0.571429,0.214286,13,5405,0.142857
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,0.952381,18,15.615873,0.02381,0.380952,0.428571,0.02381,26,5100,0.142857
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,0.931373,15,15.555392,0.019608,0.196078,0.254902,0.205882,58,5159,0.117647
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,0.935135,15,15.72,0.010811,0.248649,0.372973,0.178378,99,5217,0.043243
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,0.888889,16,15.833333,0.027778,0.194444,0.361111,0.361111,16,5100,0.472222


In [849]:
tmp = test_tr_clean.groupby('ID')["브랜드코드"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left',on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,오후_구매비율,주구매시간대,평균구매시각,아침_구매비율,점심_구매비율,저녁_구매비율,주말구매비율,브랜드코드_nunique,선호브랜드코드,선호브랜드코드_구매비율
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,1.0,14,14.078571,0.0,0.428571,0.0,0.571429,5,5100,0.428571
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,0.75,11,15.095833,0.0,0.25,0.25,0.25,4,5149,0.25
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,0.6,17,15.066667,0.0,0.4,0.6,0.0,5,5111,0.2
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,1.0,18,17.884722,0.0,0.0,0.708333,0.041667,18,5956,0.166667
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,1.0,19,17.278889,0.0,0.066667,0.6,0.466667,12,5100,0.133333


In [850]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

## 중분류를 이용한 특성생성

In [851]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

In [852]:
agg_list = [
    ("중분류_nunique", "nunique"),
    ("주구매_중분류", lambda x: x.mode()[0]),
    
    ('주구매_중분류_아동용품', lambda x: (x.str.contains("아동|유아|신생아|완구|팬시|주니어", regex=True)).mean()),
    ('주구매_중분류_취미용품', lambda x: (x.str.contains('아웃도어|골프|스포츠|취미|수예|레포츠', regex=True)).mean()),
    # ('주구매_중분류_가전제품', lambda x: (x.str.contains('TV|냉장고|취사', regex=True)).mean()),
    # ('주구매_중분류_가구', lambda x: (x.str.contains('가구|식탁|쇼파|소파|침대|침구|홈', regex=True)).mean()),
    # ('주구매_중분류_사치품', lambda x: (x.str.contains('보석|모피,', regex=True)).mean()),
    ("주구매_중분류_악세서리", lambda x: (x.str.contains("악세사리|액세서리|보석|핸드백|장신구|시계|ACC", regex=True)).mean()),
    # ("주구매_중분류_명품", lambda x: (x.str.contains("명품|부띠끄|로얄|부틱|엘레강스", regex=True)).mean()),
    # ("주구매_중분류_화장품", lambda x: (x.str.contains("화장품|향수", regex=True)).mean()),
    ("주구매_중분류_영패션", lambda x: (x.str.contains("영|캐쥬얼", regex=True)).mean()),
    # ('주구매_중분류_중장년타겟제품', lambda x: (x.str.contains('건강식품|머플러|양말|도자기', regex=True)).mean()),
    # ("주구매_중분류_선물용품", lambda x: x.str.contains("차류|스카프|손수건|머플러", regex=True).mean()),
    ("주구매_중분류_주방용품", lambda x: (x.str.contains("야채|용기보증|주방용품|생선", regex=True)).mean()),
]

tmp = train_tr_clean.groupby("ID")["중분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left')
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,브랜드코드_nunique,선호브랜드코드,선호브랜드코드_구매비율,중분류_nunique,주구매_중분류,주구매_중분류_아동용품,주구매_중분류_취미용품,주구매_중분류_악세서리,주구매_중분류_영패션,주구매_중분류_주방용품
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,13,5405,0.142857,11,골프웨어,0.142857,0.214286,0.0,0.0,0.0
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,26,5100,0.142857,24,영캐주얼,0.0,0.071429,0.0,0.190476,0.02381
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,58,5159,0.117647,35,아웃도어,0.0,0.254902,0.029412,0.098039,0.039216
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,99,5217,0.043243,60,수입종합화장품,0.016216,0.054054,0.064865,0.210811,0.021622
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,16,5100,0.472222,19,용기보증,0.027778,0.0,0.0,0.027778,0.194444


In [853]:
tmp = test_tr_clean.groupby('ID')["중분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left')
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,브랜드코드_nunique,선호브랜드코드,선호브랜드코드_구매비율,중분류_nunique,주구매_중분류,주구매_중분류_아동용품,주구매_중분류_취미용품,주구매_중분류_악세서리,주구매_중분류_영패션,주구매_중분류_주방용품
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,5,5100,0.428571,4,용기보증,0.285714,0.0,0.0,0.0,0.428571
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,4,5149,0.25,4,남성구두,0.0,0.0,0.0,0.0,0.0
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,5,5111,0.2,4,수입종합화장품,0.0,0.0,0.0,0.2,0.0
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,18,5956,0.166667,11,영캐주얼,0.0,0.125,0.041667,0.25,0.0
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,12,5100,0.133333,11,칼라드래디셔널,0.0,0.0,0.0,0.266667,0.066667


In [854]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

## 대분류를 이용한 특성생성

In [855]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

In [856]:
agg_list = [
    ("대분류_nunique", "nunique"),
    ("주구매_대분류", lambda x: x.mode()[0]),
    
    # ("주구매_대분류_아동용품", lambda x: (x.str.contains("아동")).mean()),
    # ("주구매_대분류_잡화", lambda x: (x.str.contains("영|잡화", regex=True)).mean()), # 추가고려
    ('주구매_대분류_스포츠', lambda x: (x.str.contains('골프|스포츠', regex=True)).mean()),
    # ("주구매_대분류_가정용품", lambda x: (x.str.contains("가정용품")).mean()),
    ("주구매_대분류_명품", lambda x: (x.str.contains("로얄|명품", regex=True)).mean()),
    # ("주구매_대분류_생식품", lambda x: (x.str.contains("생식품")).mean()),
]

tmp = train_tr_clean.groupby("ID")["대분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left')
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,주구매_중분류,주구매_중분류_아동용품,주구매_중분류_취미용품,주구매_중분류_악세서리,주구매_중분류_영패션,주구매_중분류_주방용품,대분류_nunique,주구매_대분류,주구매_대분류_스포츠,주구매_대분류_명품
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,골프웨어,0.142857,0.214286,0.0,0.0,0.0,9,패션잡화,0.214286,0.0
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,영캐주얼,0.0,0.071429,0.0,0.190476,0.02381,12,영플라자,0.071429,0.047619
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,아웃도어,0.0,0.254902,0.029412,0.098039,0.039216,14,명품잡화,0.411765,0.264706
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,수입종합화장품,0.016216,0.054054,0.064865,0.210811,0.021622,16,케주얼_구두_아동,0.108108,0.021622
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,용기보증,0.027778,0.0,0.0,0.027778,0.194444,8,공산품파트,0.0,0.194444


In [857]:
tmp = test_tr_clean.groupby('ID')["대분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left')
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,주구매_중분류,주구매_중분류_아동용품,주구매_중분류_취미용품,주구매_중분류_악세서리,주구매_중분류_영패션,주구매_중분류_주방용품,대분류_nunique,주구매_대분류,주구매_대분류_스포츠,주구매_대분류_명품
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,용기보증,0.285714,0.0,0.0,0.0,0.428571,5,공산품,0.0,0.0
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,남성구두,0.0,0.0,0.0,0.0,0.0,3,명품잡화,0.0,0.5
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,수입종합화장품,0.0,0.0,0.0,0.2,0.0,3,잡화파트,0.0,0.0
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,영캐주얼,0.0,0.125,0.041667,0.25,0.0,4,영플라자,0.041667,0.0
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,칼라드래디셔널,0.0,0.0,0.0,0.266667,0.066667,6,여성캐주얼,0.066667,0.0


In [858]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

## 구매가격을 이용한 특성생성

In [859]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

- 구매가격 사분위수

In [860]:
high_purchase = train_tr_clean["구매가격"].quantile(0.95)
low_purchase = train_tr_clean["구매가격"].quantile(0.05)

high_purchase, low_purchase

(390000.0, 9729.150000000001)

- 환불가격 사분위수

In [861]:
low_refund, high_refund = refund_train["구매가격"].quantile(0.95), refund_train["구매가격"].quantile(0.05)
low_refund, high_refund

(-20800.0, -832255.0)

- 순수 구매 관련 컬럼

In [862]:
agg_list = [
    ("총구매금액", "sum"),
    ("구매건수", "count"),
    ("평균구매금액", "mean"),
    ("구매금액_median", 'median'),
    ("최대구매금액", 'max'),
    ("최소구매금액", 'min'),
    ("구매금액표준편차", 'std'),
    ("구매금액_skew", 'skew'),
    ("구매금액_kurtosis", lambda x: x.kurtosis()),
    ("고가제품_구매비율", lambda x: np.mean(x > high_purchase)),
    ("중가제품_구매비율", lambda x: np.mean((x >= low_purchase) & (x <= high_purchase))),
    ("저가제품_구매비율", lambda x: np.mean(x < low_purchase)),
    ("구매금액_변동성", lambda x: x.std() / x.mean() if x.mean() != 0 else 0),
]

tmp = train_tr_clean.groupby("ID")["구매가격"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, "left", "ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,구매금액_median,최대구매금액,최소구매금액,구매금액표준편차,구매금액_skew,구매금액_kurtosis,고가제품_구매비율,중가제품_구매비율,저가제품_구매비율,구매금액_변동성
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,187000.0,1000000,20000,313025.1891,1.670489,1.990929,0.214286,0.785714,0.0,1.043119
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,90450.0,698000,5600,126592.626144,2.651821,9.771083,0.02381,0.952381,0.02381,1.054306
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,93500.0,2770000,4400,415371.770475,4.393399,21.956275,0.107843,0.872549,0.019608,1.884243
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,120000.0,1492000,10000,239163.307349,2.64347,8.320466,0.140541,0.859459,0.0,1.234529
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,41210.5,900000,7640,179263.263965,3.01761,10.550421,0.055556,0.916667,0.027778,1.617873


In [863]:
tmp = test_tr_clean.groupby('ID')["구매가격"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, 'left', "ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,구매금액_median,최대구매금액,최소구매금액,구매금액표준편차,구매금액_skew,구매금액_kurtosis,고가제품_구매비율,중가제품_구매비율,저가제품_구매비율,구매금액_변동성
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,58536.0,110000,26643,32293.148313,0.702551,-1.103688,0.0,1.0,0.0,0.526218
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,67500.0,118000,37600,33658.431336,0.856705,1.438447,0.0,1.0,0.0,0.463296
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,95000.0,326000,38000,117233.954126,1.969409,4.075918,0.0,1.0,0.0,0.965683
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,55250.0,403000,7040,102440.792115,1.507265,2.087113,0.041667,0.916667,0.041667,1.066286
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,79000.0,376000,16200,99879.156875,1.839077,3.277987,0.0,1.0,0.0,0.955056


- 환불된 데이터에 대한 컬럼

In [864]:
agg_list = [
    ("환불건수", "count"),
    ("총환불금액", "sum"),
    ("평균환불금액", "mean"),
    ("최소환불금액", "max"),
    ("최대환불금액", "min"),
    ("환불금액표준편차", "std"),
    ("환불금액_skew", "skew"), 
    ("환불금액_kurtosis", lambda x: x.kurtosis()),
    ("고가제품환불비율", lambda x: np.mean(x < high_refund)),
    ("중가제품환불비율", lambda x: np.mean((x >= high_refund) & (x <= low_refund))),
    ("저가제품환불비율", lambda x: np.mean(x > low_refund)),
]

tmp = refund_train.groupby("ID")["구매가격"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, "left", "ID").fillna(0)
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,총환불금액,평균환불금액,최소환불금액,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,-2517000.0,-839000.0,-205000.0,-1236000.0,554857.6394,1.571433,0.0,0.666667,0.333333,0.0
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,-1072500.0,-153214.285714,-4500.0,-288000.0,110664.604829,0.310886,-1.234108,0.0,0.714286,0.285714
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,-2867800.0,-204842.857143,-58000.0,-768000.0,194802.324715,-2.219921,5.204875,0.0,1.0,0.0
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,-6954400.0,-534953.846154,-49000.0,-1204000.0,379184.767221,-0.381901,-1.126703,0.307692,0.692308,0.0


In [865]:
tmp = refund_test.groupby("ID")["구매가격"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, "left", "ID").fillna(0)
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,총환불금액,평균환불금액,최소환불금액,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,-1092000.0,-273000.0,-140000.0,-403000.0,137663.841779,0.014171,-5.642846,0.0,1.0,0.0
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,-372200.0,-124066.666667,-38000.0,-196200.0,80015.081912,0.759841,0.0,0.0,1.0,0.0


- 추가/환불 후 재구매

In [866]:
train_ft = train_ft.merge(repurchase_count_train, "left", "ID").fillna(0)
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,평균환불금액,최소환불금액,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율,환불후재구매브랜드_count
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,-839000.0,-205000.0,-1236000.0,554857.6394,1.571433,0.0,0.666667,0.333333,0.0,0.0
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,-153214.285714,-4500.0,-288000.0,110664.604829,0.310886,-1.234108,0.0,0.714286,0.285714,0.0
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,-204842.857143,-58000.0,-768000.0,194802.324715,-2.219921,5.204875,0.0,1.0,0.0,1.0
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,-534953.846154,-49000.0,-1204000.0,379184.767221,-0.381901,-1.126703,0.307692,0.692308,0.0,0.0


In [867]:
test_ft = test_ft.merge(repurchase_count_test, "left", "ID").fillna(0)
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,평균환불금액,최소환불금액,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율,환불후재구매브랜드_count
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,-273000.0,-140000.0,-403000.0,137663.841779,0.014171,-5.642846,0.0,1.0,0.0,0.0
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,-124066.666667,-38000.0,-196200.0,80015.081912,0.759841,0.0,0.0,1.0,0.0,0.0


In [868]:
train_ft["환불후재구매비율"] = np.where((train_ft["환불건수"] + train_ft["환불후재구매브랜드_count"]) == 0, 0, (train_ft["환불후재구매브랜드_count"] / (train_ft["환불건수"] + train_ft["환불후재구매브랜드_count"])))
test_ft["환불후재구매비율"] = np.where((test_ft["환불건수"] + test_ft["환불후재구매브랜드_count"]) == 0, 0, (test_ft["환불후재구매브랜드_count"] / (test_ft["환불건수"] + test_ft["환불후재구매브랜드_count"])))
train_ft["환불후재구매비율"].isnull().sum(), test_ft["환불후재구매비율"].isnull().sum()

(0, 0)

### 구매가격을 이용한 등급 특성 생성

In [869]:
def categorize_customer(total_amount):
    if total_amount >= 20_000_000:
        return 'Diamond'
    elif total_amount >= 15_000_000:
        return 'Platinum'
    elif total_amount >= 10_000_000:
        return 'Gold'
    elif total_amount >= 6_000_000:
        return 'Black'
    else:
        return 'Normal'
    
train_ft['고객등급'] = train_ft['총구매금액'].apply(categorize_customer)
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율,환불후재구매브랜드_count,환불후재구매비율,고객등급
0,train_0,9,255,28,1.555556,22.744789,3.0,2,0.071429,0.0,...,-1236000.0,554857.6394,1.571433,0.0,0.666667,0.333333,0.0,0.0,0.0,Normal
1,train_1,21,352,16,2.0,11.739134,2.309524,3,0.047619,0.261905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
2,train_2,55,356,6,1.854545,5.019987,2.333333,0,0.254902,0.186275,...,-288000.0,110664.604829,0.310886,-1.234108,0.0,0.714286,0.285714,0.0,0.0,Diamond
3,train_3,90,345,3,2.055556,3.137146,3.081081,4,0.075676,0.113514,...,-768000.0,194802.324715,-2.219921,5.204875,0.0,1.0,0.0,1.0,0.066667,Diamond
4,train_4,24,313,13,1.5,11.962731,3.861111,4,0.083333,0.055556,...,-1204000.0,379184.767221,-0.381901,-1.126703,0.307692,0.692308,0.0,0.0,0.0,Normal


In [870]:
test_ft['고객등급'] = test_ft['총구매금액'].apply(categorize_customer)
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,일별평균구매건수,구매간격_표준편차,평균구매요일,주구매요일,월요일_구매비율,화요일_구매비율,...,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율,환불후재구매브랜드_count,환불후재구매비율,고객등급
0,test_0,5,164,32,1.4,30.416096,3.428571,1,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
1,test_1,3,166,55,1.333333,57.879185,1.75,0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
2,test_2,2,18,9,2.5,7.361216,1.2,0,0.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
3,test_3,14,272,19,1.714286,17.715031,3.0,3,0.083333,0.083333,...,-403000.0,137663.841779,0.014171,-5.642846,0.0,1.0,0.0,0.0,0.0,Normal
4,test_4,7,265,37,2.142857,36.517818,4.0,6,0.066667,0.2,...,-196200.0,80015.081912,0.759841,0.0,0.0,1.0,0.0,0.0,0.0,Normal


In [871]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

## pivot_table을 이용한 특성 생성

In [872]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

### ID, 지점코드별

- 구매횟수

In [873]:
train_tmp = pd.pivot_table(
    train_tr_clean,
    index="ID",
    columns="지점코드",
    values="구매가격",
    aggfunc="count",
    fill_value=0,
).add_prefix("지점코드_구매횟수_")

train_tmp

지점코드,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train_0,8,6,0,0
train_1,31,2,0,9
train_10,0,118,10,0
train_100,5,9,0,6
train_1000,0,2,0,13
...,...,...,...,...
train_9995,1,0,0,0
train_9996,1,22,0,0
train_9997,15,0,0,0
train_9998,24,0,0,5


In [874]:
test_tmp = pd.pivot_table(
    test_tr_clean,
    index="ID",
    columns="지점코드",
    values="구매가격",
    aggfunc="count",
    fill_value=0,
).add_prefix("지점코드_구매횟수_")

test_tmp

지점코드,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test_0,0,0,7,0
test_1,1,0,0,3
test_10,28,0,0,0
test_100,103,3,0,5
test_1000,0,0,0,3
...,...,...,...,...
test_9995,16,0,0,0
test_9996,0,0,44,0
test_9997,0,0,0,21
test_9998,0,0,62,0


- 구매금액

In [875]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="지점코드",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("지점코드_구매합계_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

지점코드,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000
0,train_0,8,6,0,0,1242800,2958400,0,0
1,train_1,31,2,0,9,4172100,50400,0,820525
2,train_10,0,118,10,0,0,10006771,1228099,0
3,train_100,5,9,0,6,919000,594320,0,362567
4,train_1000,0,2,0,13,0,155200,0,602300
...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,36000,0,0,0
14936,train_9996,1,22,0,0,192000,4582410,0,0
14937,train_9997,15,0,0,0,795700,0,0,0
14938,train_9998,24,0,0,5,1782325,0,0,287000


In [876]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="지점코드",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("지점코드_구매합계_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

지점코드,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000
0,test_0,0,0,7,0,0,0,429579,0
1,test_1,1,0,0,3,62000,0,0,228600
2,test_10,28,0,0,0,2265788,0,0,0
3,test_100,103,3,0,5,2363841,350000,0,112550
4,test_1000,0,0,0,3,0,0,0,227000
...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,1061128,0,0,0
12221,test_9996,0,0,44,0,0,0,1624893,0
12222,test_9997,0,0,0,21,0,0,0,1635600
12223,test_9998,0,0,62,0,0,0,6730872,0


### ID, 중분류별

- 구매횟수

In [877]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="중분류",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("중분류_구매횟수_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,중분류_구매횟수_행사슈즈,중분류_구매횟수_행사핸드백,중분류_구매횟수_향수,중분류_구매횟수_헤어ACC,중분류_구매횟수_헤어악세사리,중분류_구매횟수_헤어액세사리,중분류_구매횟수_홈데코,중분류_구매횟수_화장잡화,중분류_구매횟수_화장품,중분류_구매횟수_훼미닌부틱
0,train_0,8,6,0,0,1242800,2958400,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,train_1,31,2,0,9,4172100,50400,0,820525,0,...,0,0,0,0,0,0,0,0,0,0
2,train_10,0,118,10,0,0,10006771,1228099,0,0,...,0,0,0,0,0,0,0,0,0,0
3,train_100,5,9,0,6,919000,594320,0,362567,0,...,0,0,0,0,0,0,0,0,0,0
4,train_1000,0,2,0,13,0,155200,0,602300,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,36000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14936,train_9996,1,22,0,0,192000,4582410,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14937,train_9997,15,0,0,0,795700,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14938,train_9998,24,0,0,5,1782325,0,0,287000,0,...,0,0,0,0,0,0,0,0,0,0


In [878]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="중분류",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("중분류_구매횟수_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,중분류_구매횟수_행사슈즈,중분류_구매횟수_행사핸드백,중분류_구매횟수_향수,중분류_구매횟수_헤어ACC,중분류_구매횟수_헤어악세사리,중분류_구매횟수_헤어액세사리,중분류_구매횟수_홈데코,중분류_구매횟수_화장잡화,중분류_구매횟수_화장품,중분류_구매횟수_훼미닌부틱
0,test_0,0,0,7,0,0,0,429579,0,0,...,0,0,0,0,0,0,0,0,0,0
1,test_1,1,0,0,3,62000,0,0,228600,0,...,0,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,2265788,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test_100,103,3,0,5,2363841,350000,0,112550,0,...,0,0,1,0,0,0,0,0,0,0
4,test_1000,0,0,0,3,0,0,0,227000,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,1061128,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
12221,test_9996,0,0,44,0,0,0,1624893,0,0,...,0,0,0,0,1,0,0,0,0,0
12222,test_9997,0,0,0,21,0,0,0,1635600,0,...,0,0,0,0,0,2,0,0,0,0
12223,test_9998,0,0,62,0,0,0,6730872,0,0,...,0,0,0,0,0,0,0,0,0,0


- 구매금액

In [879]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="중분류",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("중분류_구매합계_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,중분류_구매합계_행사슈즈,중분류_구매합계_행사핸드백,중분류_구매합계_향수,중분류_구매합계_헤어ACC,중분류_구매합계_헤어악세사리,중분류_구매합계_헤어액세사리,중분류_구매합계_홈데코,중분류_구매합계_화장잡화,중분류_구매합계_화장품,중분류_구매합계_훼미닌부틱
0,train_0,8,6,0,0,1242800,2958400,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,train_1,31,2,0,9,4172100,50400,0,820525,0,...,0,0,0,0,0,0,0,0,0,0
2,train_10,0,118,10,0,0,10006771,1228099,0,0,...,0,0,0,0,0,0,0,0,0,0
3,train_100,5,9,0,6,919000,594320,0,362567,0,...,0,0,0,0,0,0,0,0,0,0
4,train_1000,0,2,0,13,0,155200,0,602300,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,36000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14936,train_9996,1,22,0,0,192000,4582410,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14937,train_9997,15,0,0,0,795700,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14938,train_9998,24,0,0,5,1782325,0,0,287000,0,...,0,0,0,0,0,0,0,0,0,0


In [880]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="중분류",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("중분류_구매합계_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,중분류_구매합계_행사슈즈,중분류_구매합계_행사핸드백,중분류_구매합계_향수,중분류_구매합계_헤어ACC,중분류_구매합계_헤어악세사리,중분류_구매합계_헤어액세사리,중분류_구매합계_홈데코,중분류_구매합계_화장잡화,중분류_구매합계_화장품,중분류_구매합계_훼미닌부틱
0,test_0,0,0,7,0,0,0,429579,0,0,...,0,0,0,0,0,0,0,0,0,0
1,test_1,1,0,0,3,62000,0,0,228600,0,...,0,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,2265788,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test_100,103,3,0,5,2363841,350000,0,112550,0,...,0,0,22000,0,0,0,0,0,0,0
4,test_1000,0,0,0,3,0,0,0,227000,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,1061128,0,0,0,0,...,0,0,0,0,0,0,0,0,145700,0
12221,test_9996,0,0,44,0,0,0,1624893,0,0,...,0,0,0,0,46000,0,0,0,0,0
12222,test_9997,0,0,0,21,0,0,0,1635600,0,...,0,0,0,0,0,30000,0,0,0,0
12223,test_9998,0,0,62,0,0,0,6730872,0,0,...,0,0,0,0,0,0,0,0,0,0


### ID, 대분류별

- 구매횟수

In [881]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="대분류",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("대분류_구매횟수_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,대분류_구매횟수_여성캐주얼,대분류_구매횟수_여성캐쥬얼,대분류_구매횟수_영라이브,대분류_구매횟수_영어덜트캐쥬얼,대분류_구매횟수_영캐릭터,대분류_구매횟수_영플라자,대분류_구매횟수_잡화,대분류_구매횟수_잡화파트,대분류_구매횟수_케주얼_구두_아동,대분류_구매횟수_패션잡화
0,train_0,8,6,0,0,1242800,2958400,0,0,0,...,0,0,0,0,0,0,0,1,1,3
1,train_1,31,2,0,9,4172100,50400,0,820525,0,...,6,0,0,0,4,11,0,0,1,8
2,train_10,0,118,10,0,0,10006771,1228099,0,0,...,0,0,0,0,0,0,2,19,5,0
3,train_100,5,9,0,6,919000,594320,0,362567,0,...,0,0,0,0,0,1,0,3,4,1
4,train_1000,0,2,0,13,0,155200,0,602300,0,...,2,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,36000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14936,train_9996,1,22,0,0,192000,4582410,0,0,0,...,0,0,0,0,0,1,0,2,12,0
14937,train_9997,15,0,0,0,795700,0,0,0,0,...,2,0,0,0,2,7,0,0,0,1
14938,train_9998,24,0,0,5,1782325,0,0,287000,0,...,1,0,0,0,0,0,0,0,0,9


In [882]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="대분류",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("대분류_구매횟수_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,대분류_구매횟수_여성캐주얼,대분류_구매횟수_여성캐쥬얼,대분류_구매횟수_영라이브,대분류_구매횟수_영어덜트캐쥬얼,대분류_구매횟수_영캐릭터,대분류_구매횟수_영플라자,대분류_구매횟수_잡화,대분류_구매횟수_잡화파트,대분류_구매횟수_케주얼_구두_아동,대분류_구매횟수_패션잡화
0,test_0,0,0,7,0,0,0,429579,0,0,...,0,1,0,0,0,0,1,0,0,0
1,test_1,1,0,0,3,62000,0,0,228600,0,...,1,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,2265788,0,0,0,0,...,0,0,0,0,4,3,0,0,0,11
3,test_100,103,3,0,5,2363841,350000,0,112550,0,...,5,0,0,0,1,1,0,2,0,16
4,test_1000,0,0,0,3,0,0,0,227000,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,1061128,0,0,0,0,...,2,0,0,0,3,3,0,0,0,2
12221,test_9996,0,0,44,0,0,0,1624893,0,0,...,0,2,4,4,0,0,4,0,0,0
12222,test_9997,0,0,0,21,0,0,0,1635600,0,...,2,0,0,0,0,0,0,0,0,0
12223,test_9998,0,0,62,0,0,0,6730872,0,0,...,0,6,10,1,0,0,8,0,0,0


- 구매금액

In [883]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="대분류",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("대분류_구매합계_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,대분류_구매합계_여성캐주얼,대분류_구매합계_여성캐쥬얼,대분류_구매합계_영라이브,대분류_구매합계_영어덜트캐쥬얼,대분류_구매합계_영캐릭터,대분류_구매합계_영플라자,대분류_구매합계_잡화,대분류_구매합계_잡화파트,대분류_구매합계_케주얼_구두_아동,대분류_구매합계_패션잡화
0,train_0,8,6,0,0,1242800,2958400,0,0,0,...,0,0,0,0,0,0,0,320000,35000,377500
1,train_1,31,2,0,9,4172100,50400,0,820525,0,...,1360500,0,0,0,689000,1376700,0,0,39000,710000
2,train_10,0,118,10,0,0,10006771,1228099,0,0,...,0,0,0,0,0,0,166000,4154600,371300,0
3,train_100,5,9,0,6,919000,594320,0,362567,0,...,0,0,0,0,0,45000,0,163000,302800,59000
4,train_1000,0,2,0,13,0,155200,0,602300,0,...,83300,0,0,0,0,0,0,130000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,36000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36000
14936,train_9996,1,22,0,0,192000,4582410,0,0,0,...,0,0,0,0,0,192000,0,133000,1379800,0
14937,train_9997,15,0,0,0,795700,0,0,0,0,...,16000,0,0,0,217000,361500,0,0,0,52000
14938,train_9998,24,0,0,5,1782325,0,0,287000,0,...,24000,0,0,0,0,0,0,0,0,465000


In [884]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="대분류",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("대분류_구매합계_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,대분류_구매합계_여성캐주얼,대분류_구매합계_여성캐쥬얼,대분류_구매합계_영라이브,대분류_구매합계_영어덜트캐쥬얼,대분류_구매합계_영캐릭터,대분류_구매합계_영플라자,대분류_구매합계_잡화,대분류_구매합계_잡화파트,대분류_구매합계_케주얼_구두_아동,대분류_구매합계_패션잡화
0,test_0,0,0,7,0,0,0,429579,0,0,...,0,100000,0,0,0,0,110000,0,0,0
1,test_1,1,0,0,3,62000,0,0,228600,0,...,62000,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,2265788,0,0,0,0,...,0,0,0,0,186500,215700,0,0,0,1074000
3,test_100,103,3,0,5,2363841,350000,0,112550,0,...,148700,0,0,0,79000,77600,0,320000,0,758000
4,test_1000,0,0,0,3,0,0,0,227000,0,...,0,0,0,0,0,0,0,0,0,73000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,1061128,0,0,0,0,...,145000,0,0,0,148000,145700,0,0,0,195000
12221,test_9996,0,0,44,0,0,0,1624893,0,0,...,0,229000,108000,254200,0,0,254000,0,0,0
12222,test_9997,0,0,0,21,0,0,0,1635600,0,...,404000,0,0,0,0,0,0,0,0,0
12223,test_9998,0,0,62,0,0,0,6730872,0,0,...,0,478000,582300,59000,0,0,1213800,0,0,0


### ID, 브랜드코드별

- 구매횟수

In [885]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="브랜드코드",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("브랜드코드_구매횟수_")
    .reset_index(),
    on="ID",
    how='left'    
)

train_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,브랜드코드_구매횟수_6984,브랜드코드_구매횟수_6985,브랜드코드_구매횟수_6987,브랜드코드_구매횟수_6988,브랜드코드_구매횟수_6989,브랜드코드_구매횟수_6990,브랜드코드_구매횟수_6992,브랜드코드_구매횟수_6994,브랜드코드_구매횟수_6995,브랜드코드_구매횟수_6996
0,train_0,8,6,0,0,1242800,2958400,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,train_1,31,2,0,9,4172100,50400,0,820525,0,...,0,0,0,0,0,0,0,0,0,0
2,train_10,0,118,10,0,0,10006771,1228099,0,0,...,0,0,0,0,0,0,0,0,0,0
3,train_100,5,9,0,6,919000,594320,0,362567,0,...,0,0,0,0,0,0,0,0,0,0
4,train_1000,0,2,0,13,0,155200,0,602300,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,36000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14936,train_9996,1,22,0,0,192000,4582410,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14937,train_9997,15,0,0,0,795700,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14938,train_9998,24,0,0,5,1782325,0,0,287000,0,...,0,0,0,0,0,0,0,0,0,0


In [886]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="브랜드코드",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("브랜드코드_구매횟수_")
    .reset_index(),
    on="ID",
    how='left'    
)

test_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,브랜드코드_구매횟수_6985,브랜드코드_구매횟수_6986,브랜드코드_구매횟수_6987,브랜드코드_구매횟수_6988,브랜드코드_구매횟수_6990,브랜드코드_구매횟수_6992,브랜드코드_구매횟수_6994,브랜드코드_구매횟수_6995,브랜드코드_구매횟수_6996,브랜드코드_구매횟수_6998
0,test_0,0,0,7,0,0,0,429579,0,0,...,0,0,0,0,0,0,0,0,0,0
1,test_1,1,0,0,3,62000,0,0,228600,0,...,0,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,2265788,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test_100,103,3,0,5,2363841,350000,0,112550,0,...,0,0,0,0,0,0,0,0,0,0
4,test_1000,0,0,0,3,0,0,0,227000,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,1061128,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12221,test_9996,0,0,44,0,0,0,1624893,0,0,...,0,0,0,0,0,0,0,0,0,0
12222,test_9997,0,0,0,21,0,0,0,1635600,0,...,0,0,0,0,0,0,0,0,0,0
12223,test_9998,0,0,62,0,0,0,6730872,0,0,...,0,0,0,0,0,0,0,0,0,0


- 구매금액

In [887]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="브랜드코드",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("브랜드코드_구매합계_")
    .reset_index(),
    on="ID",
    how='left'    
)

train_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,브랜드코드_구매합계_6984,브랜드코드_구매합계_6985,브랜드코드_구매합계_6987,브랜드코드_구매합계_6988,브랜드코드_구매합계_6989,브랜드코드_구매합계_6990,브랜드코드_구매합계_6992,브랜드코드_구매합계_6994,브랜드코드_구매합계_6995,브랜드코드_구매합계_6996
0,train_0,8,6,0,0,1242800,2958400,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,train_1,31,2,0,9,4172100,50400,0,820525,0,...,0,0,0,0,0,0,0,0,0,0
2,train_10,0,118,10,0,0,10006771,1228099,0,0,...,0,0,0,0,0,0,0,0,0,0
3,train_100,5,9,0,6,919000,594320,0,362567,0,...,0,0,0,0,0,0,0,0,0,0
4,train_1000,0,2,0,13,0,155200,0,602300,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,36000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14936,train_9996,1,22,0,0,192000,4582410,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14937,train_9997,15,0,0,0,795700,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14938,train_9998,24,0,0,5,1782325,0,0,287000,0,...,0,0,0,0,0,0,0,0,0,0


In [888]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="브랜드코드",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("브랜드코드_구매합계_")
    .reset_index(),
    on="ID",
    how='left'    
)

test_tmp

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,브랜드코드_구매합계_6985,브랜드코드_구매합계_6986,브랜드코드_구매합계_6987,브랜드코드_구매합계_6988,브랜드코드_구매합계_6990,브랜드코드_구매합계_6992,브랜드코드_구매합계_6994,브랜드코드_구매합계_6995,브랜드코드_구매합계_6996,브랜드코드_구매합계_6998
0,test_0,0,0,7,0,0,0,429579,0,0,...,0,0,0,0,0,0,0,0,0,0
1,test_1,1,0,0,3,62000,0,0,228600,0,...,0,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,2265788,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test_100,103,3,0,5,2363841,350000,0,112550,0,...,0,0,0,0,0,0,0,0,0,0
4,test_1000,0,0,0,3,0,0,0,227000,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,1061128,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12221,test_9996,0,0,44,0,0,0,1624893,0,0,...,0,0,0,0,0,0,0,0,0,0
12222,test_9997,0,0,0,21,0,0,0,1635600,0,...,0,0,0,0,0,0,0,0,0,0
12223,test_9998,0,0,62,0,0,0,6730872,0,0,...,0,0,0,0,0,0,0,0,0,0


- 요일별 구매금액 합계

In [889]:
weekday_purchase_sum = pd.pivot_table(
    train_tr_clean,
    index='ID',
    columns=train_tr_clean['구매날짜'].dt.day_name(),
    values='구매가격',
    aggfunc='sum',
    fill_value=0
)

weekday_purchase_sum.columns = [f"{day}_sum" for day in weekday_purchase_sum.columns]
train_tmp = train_tmp.merge(weekday_purchase_sum, how='left', on='ID')
train_tmp.head()

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,브랜드코드_구매합계_6994,브랜드코드_구매합계_6995,브랜드코드_구매합계_6996,Friday_sum,Monday_sum,Saturday_sum,Sunday_sum,Thursday_sum,Tuesday_sum,Wednesday_sum
0,train_0,8,6,0,0,1242800,2958400,0,0,0,...,0,0,0,43500,320000,1430000,340000,1507400,0,560300
1,train_1,31,2,0,9,4172100,50400,0,820525,0,...,0,0,0,643400,316000,0,30000,1822450,1193000,1038175
2,train_10,0,118,10,0,0,10006771,1228099,0,0,...,0,0,0,10912013,0,124688,0,0,129640,68529
3,train_100,5,9,0,6,919000,594320,0,362567,0,...,0,0,0,669520,73000,983800,45567,0,45000,59000
4,train_1000,0,2,0,13,0,155200,0,602300,0,...,0,0,0,0,29000,29000,292300,110000,62000,235200


In [890]:
weekday_purchase_sum = pd.pivot_table(
    test_tr_clean,
    index='ID',
    columns=test_tr_clean['구매날짜'].dt.day_name(),
    values='구매가격',
    aggfunc='sum',
    fill_value=0
)

weekday_purchase_sum.columns = [f"{day}_sum" for day in weekday_purchase_sum.columns]
test_tmp = test_tmp.merge(weekday_purchase_sum, how='left', on='ID')
test_tmp.head()

Unnamed: 0,ID,지점코드_구매횟수_A112000,지점코드_구매횟수_A144000,지점코드_구매횟수_A202000,지점코드_구매횟수_A373000,지점코드_구매합계_A112000,지점코드_구매합계_A144000,지점코드_구매합계_A202000,지점코드_구매합계_A373000,중분류_구매횟수_DC캐주얼,...,브랜드코드_구매합계_6995,브랜드코드_구매합계_6996,브랜드코드_구매합계_6998,Friday_sum,Monday_sum,Saturday_sum,Sunday_sum,Thursday_sum,Tuesday_sum,Wednesday_sum
0,test_0,0,0,7,0,0,0,429579,0,0,...,0,0,0,0,100000,136643,94400,0,98536,0
1,test_1,1,0,0,3,62000,0,0,228600,0,...,0,0,0,0,155600,62000,0,0,0,73000
2,test_10,28,0,0,0,2265788,0,0,0,0,...,0,0,0,493300,62588,120000,153000,146000,1157900,133000
3,test_100,103,3,0,5,2363841,350000,0,112550,0,...,0,0,0,718347,322549,160081,572907,531251,145493,375763
4,test_1000,0,0,0,3,0,0,0,227000,0,...,0,0,0,0,0,0,73000,0,0,154000


### 최종 pivot table merge

In [891]:
train_ft = train_ft.merge(train_tmp, on="ID", how="left")

for col in train_tmp.columns:
    if col not in test_tmp.columns:
        test_tmp[col] = 0

test_tmp = test_tmp[train_tmp.columns]
test_ft = test_ft.merge(test_tmp, how="left", on="ID")

  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0
  test_tmp[col] = 0


In [892]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

## 추가 피처 생성

In [893]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

In [894]:
# weekdays = ["월요일_구매비율", "화요일_구매비율", "수요일_구매비율", "목요일_구매비율", "금요일_구매비율"]

# train_ft["평일_구매비율"] = train_ft[weekdays].sum(axis=1)
# test_ft["평일_구매비율"] = test_ft[weekdays].sum(axis=1)

In [895]:
# weekend = ["토요일_구매비율", "일요일_구매비율"]

# train_ft["주말_구매비율"] = train_ft[weekend].sum(axis=1)
# test_ft["주말_구매비율"] = test_ft[weekend].sum(axis=1)

In [896]:
# train_ft["주말_방문_선호도"] = train_ft["주말_구매비율"] / (train_ft["평일_구매비율"] + 1)
train_ft["방문일수_대비_구매건수"] = train_ft["구매건수"] / train_ft["총방문일수"]
# train_ft["전체_성수기_구매비율"] = train_ft[["추석성수기_구매비율", "연말연시_구매비율", "설날성수기_구매비율"]].sum(axis=1)
# train_ft["전체_준성수기_구매비율"] = train_ft[["여름시즌_구매비율", "신학기_구매비율"]].sum(axis=1)
# train_ft["전체_중간기_구매비율"] = train_ft[["여름끝추석전_구매비율", "연말연시끝설날전_구매비율", "신학기후_구매비율"]].sum(axis=1)
# train_ft["전체_비수기_구매비율"] = train_ft[["초여름_구매비율", "추석후연말전_구매비율", "봄철비수기_구매비율"]].sum(axis=1)
# train_ft["성수기_구매집중도"] = train_ft["전체_성수기_구매비율"] / (train_ft["전체_비수기_구매비율"] + 1)
train_ft["계절_구매변동성"] = train_ft[["봄_구매비율", "여름_구매비율", "가을_구매비율", "겨울_구매비율"]].std(axis=1)

# test_ft["주말_방문_선호도"] = test_ft["주말_구매비율"] / (test_ft["평일_구매비율"] + 1)
test_ft["방문일수_대비_구매건수"] = test_ft["구매건수"] / test_ft["총방문일수"]
# test_ft["전체_성수기_구매비율"] = test_ft[["추석성수기_구매비율", "연말연시_구매비율", "설날성수기_구매비율"]].sum(axis=1)
# test_ft["전체_준성수기_구매비율"] = test_ft[["여름시즌_구매비율", "신학기_구매비율"]].sum(axis=1)
# test_ft["전체_중간기_구매비율"] = test_ft[["여름끝추석전_구매비율", "연말연시끝설날전_구매비율", "신학기후_구매비율"]].sum(axis=1)
# test_ft["전체_비수기_구매비율"] = test_ft[["초여름_구매비율", "추석후연말전_구매비율", "봄철비수기_구매비율"]].sum(axis=1)
# test_ft["성수기_구매집중도"] = test_ft["전체_성수기_구매비율"] / (test_ft["전체_비수기_구매비율"] + 1)
test_ft["계절_구매변동성"] = test_ft[["봄_구매비율", "여름_구매비율", "가을_구매비율", "겨울_구매비율"]].std(axis=1)

In [897]:
train_ft["구매주기_대비_구매금액"] = train_ft["총구매금액"] / train_ft["구매주기"]
train_ft["방문당_평균구매금액"] = train_ft["총구매금액"] / train_ft["총방문일수"]
train_ft["구매금액_대비_환불금액"] = train_ft["총환불금액"] / train_ft["총구매금액"]
train_ft["환불건수_대비_구매건수"] = train_ft["환불건수"] / train_ft["구매건수"]

test_ft["구매주기_대비_구매금액"] = test_ft["총구매금액"] / test_ft["구매주기"]
test_ft["방문당_평균구매금액"] = test_ft["총구매금액"] / test_ft["총방문일수"]
test_ft["구매금액_대비_환불금액"] = abs(test_ft["총환불금액"]) / test_ft["총구매금액"]
test_ft["환불건수_대비_구매건수"] = test_ft["환불건수"] / test_ft["구매건수"]

# 항상 확인하기
- 학습데이터와 테스트 데이터의 피처개수는 동일해야 함

In [898]:
train_ft.shape, test_ft.shape

((14940, 4409), (12225, 4409))

# 추출한 피처 저장하기

In [899]:
train_ft.to_csv(f"{DATA_PATH}train_common.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_common.csv",index=False)