# 컴피티션 링크
- https://www.kaggle.com/t/2e45abe9f1434b59a3358365432a48bb

# 고객별 백화점 구매기록 데이터셋
- 미혼(1) or 기혼(0) 고객을 예측
- 학습용 구매기록 데이터
    - https://drive.google.com/file/d/1tg41qXu02FK55bFa8P1Gx2URhwPUCoQQ/view?usp=sharing
- 학습용 정답 데이터
    - https://drive.google.com/file/d/1-9AcU9nAoO4SzSmqdCOYTvWNpA5Pdt3q/view?usp=sharing
- 테스트용 구매기록 데이터
    - https://drive.google.com/file/d/1-AwDfGlHm9rNtpnHIWOK96jBJYy3f2SZ/view?usp=sharing
- 제출 양식 데이터
    - https://drive.google.com/file/d/1-Qv7SlsY5Eu3bRR7Z0IUJyLkOO1Fl3y6/view?usp=sharing

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

- 데이터 경로 변수

In [2]:
# Directory prefix (including the trailing slash) that is prepended to every
# CSV file name read or written in this notebook.
DATA_PATH = "data/"
DATA_PATH

'data/'

- 데이터 불러오기

In [3]:
import pandas as pd
import numpy as np

# Load the four competition files from DATA_PATH in one pass.
train_tr, train_target, test_tr, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{DATA_PATH}store_train_transactions.csv",  # training purchase records
        f"{DATA_PATH}store_train.csv",  # training labels
        f"{DATA_PATH}store_test_transactions.csv",  # test purchase records
        f"{DATA_PATH}store_submission.csv",  # submission template
    )
)

train_tr.shape, train_target.shape, test_tr.shape, submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

# 특성 공학(Feature Engineering)

## 날짜 형식으로 변환

In [4]:
# Parse the purchase timestamp column of both transaction tables in place.
for transactions in (train_tr, test_tr):
    transactions["구매일시"] = pd.to_datetime(transactions["구매일시"])

## 날짜, 시간 컬럼 분할

In [5]:
# Split the purchase timestamp into a midnight-normalised date column
# (구매날짜, still datetime64) and a time-of-day column (구매시각, datetime.time).
# `dt.normalize()` yields the date directly; the previous `dt.date` followed by
# `pd.to_datetime` built one Python object per row only to parse it back.
for _transactions in (train_tr, test_tr):
    _transactions["구매날짜"] = _transactions["구매일시"].dt.normalize()
    _transactions["구매시각"] = _transactions["구매일시"].dt.time

train_tr.shape, test_tr.shape

((523105, 9), (441196, 9))

## 환불한 내역에 대한 구매내역 및 환불내역 삭제

In [6]:
# Rows with a positive price are purchases, rows with a negative price are
# refunds. reset_index() keeps the original row label in an "index" column.
purchase_train = train_tr.loc[train_tr["구매가격"].gt(0)].reset_index()
refund_train = train_tr.loc[train_tr["구매가격"].lt(0)].reset_index()

purchase_test = test_tr.loc[test_tr["구매가격"].gt(0)].reset_index()
refund_test = test_tr.loc[test_tr["구매가격"].lt(0)].reset_index()

In [7]:
# Pair every refund with the purchases of the same customer / store /
# categories / brand.
candidate_pairs_train = refund_train.merge(
    purchase_train,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"],
    suffixes=("_refund", "_purchase"),
)

# Keep pairs whose purchase price equals the absolute refund amount ...
amount_matches_train = candidate_pairs_train["구매가격_refund"].abs().eq(
    candidate_pairs_train["구매가격_purchase"]
)
# ... and whose purchase happened on a later day than the refund,
# i.e. the customer bought the same item again after refunding it.
bought_after_refund_train = candidate_pairs_train["구매날짜_purchase"].gt(
    candidate_pairs_train["구매날짜_refund"]
)
refund_pairs_train = candidate_pairs_train.loc[
    amount_matches_train & bought_after_refund_train
]

# Number of such re-purchase pairs per customer.
repurchase_count_train = (
    refund_pairs_train.groupby("ID")
    .size()
    .rename('환불후재구매브랜드_count')
    .reset_index()
)

repurchase_count_train

Unnamed: 0,ID,환불후재구매브랜드_count
0,train_10049,1
1,train_1006,1
2,train_10087,1
3,train_10106,1
4,train_10110,2
...,...,...
497,train_9943,1
498,train_9950,2
499,train_9956,1
500,train_998,1


In [8]:
# Test data: pair every refund with the purchases of the same customer /
# store / categories / brand.
candidate_pairs_test = refund_test.merge(
    purchase_test,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"],
    suffixes=("_refund", "_purchase"),
)

# Same price as the absolute refund amount, purchased on a later day than
# the refund.
amount_matches_test = candidate_pairs_test["구매가격_refund"].abs().eq(
    candidate_pairs_test["구매가격_purchase"]
)
bought_after_refund_test = candidate_pairs_test["구매날짜_purchase"].gt(
    candidate_pairs_test["구매날짜_refund"]
)
refund_pairs_test = candidate_pairs_test.loc[
    amount_matches_test & bought_after_refund_test
]

# Number of such re-purchase pairs per customer.
repurchase_count_test = (
    refund_pairs_test.groupby("ID")
    .size()
    .rename('환불후재구매브랜드_count')
    .reset_index()
)

repurchase_count_test

Unnamed: 0,ID,환불후재구매브랜드_count
0,test_10004,1
1,test_10019,2
2,test_10042,3
3,test_1005,1
4,test_10074,1
...,...,...
429,test_9932,1
430,test_9940,1
431,test_995,1
432,test_9974,2


In [9]:
# Columns that must agree for a refund row to be paired with a purchase row.
_REFUND_MATCH_KEYS = ["ID", "지점코드", "대분류", "중분류", "브랜드코드"]


def _pair_refunds_with_purchases(refunds, purchases):
    """Pair each refund row with the closest earlier purchase it cancels.

    A purchase is a candidate for a refund when customer, store, both category
    levels and brand match, the purchase price equals the absolute refund
    price, and the purchase timestamp is strictly earlier than the refund
    timestamp. When a refund has several candidates, only the one closest in
    time is kept.

    Both inputs must come from ``reset_index()`` so the original row labels
    are available as ``index_refund`` / ``index_purchase`` in the result.
    """
    pairs = pd.merge(
        refunds,
        purchases,
        on=_REFUND_MATCH_KEYS,
        suffixes=("_refund", "_purchase"),
    )

    same_amount = pairs["구매가격_refund"].abs() == pairs["구매가격_purchase"]
    # 구매날짜 and 구매시각 are both derived from 구매일시, so comparing the full
    # timestamps is the same as comparing (date, time) pairs, and it avoids
    # rebuilding timestamps from strings.
    refunded_later = pairs["구매일시_refund"] > pairs["구매일시_purchase"]
    pairs = pairs[same_amount & refunded_later].copy()

    # For every refund row keep only the purchase closest in time.
    pairs["time_diff"] = pairs["구매일시_refund"] - pairs["구매일시_purchase"]
    refund_cols = [
        col
        for col in pairs.columns
        if "_purchase" not in col and col != "time_diff"
    ]
    closest = pairs.groupby(refund_cols)["time_diff"].idxmin()
    return pairs.loc[closest].drop(columns="time_diff")


refund_pairs_train = _pair_refunds_with_purchases(refund_train, purchase_train)
refund_pairs_test = _pair_refunds_with_purchases(refund_test, purchase_test)

# Original row labels of the matched refund rows and the purchases they cancel.
index_refund_train = refund_pairs_train["index_refund"].values
index_purchase_train = refund_pairs_train["index_purchase"].values

index_refund_test = refund_pairs_test["index_refund"].values
index_purchase_test = refund_pairs_test["index_purchase"].values

# Remove both sides of every matched refund/purchase pair.
train_tr_clean = train_tr.drop(
    index=np.concatenate([index_refund_train, index_purchase_train])
)
test_tr_clean = test_tr.drop(
    index=np.concatenate([index_refund_test, index_purchase_test])
)

# Keep strictly positive prices only: this drops unmatched refunds and any
# zero-price rows. Copy so later column assignments act on an independent
# frame rather than on a slice.
train_tr_clean = train_tr_clean[train_tr_clean["구매가격"] > 0].copy()
test_tr_clean = test_tr_clean[test_tr_clean["구매가격"] > 0].copy()

train_tr_clean.shape, test_tr_clean.shape

((456484, 9), (384916, 9))

## 중분류 컬럼 값 중 공백 포함 값에 대해 공백제거

In [10]:
# Remove every space character from the 중분류 labels. `regex=False` makes the
# literal replacement explicit instead of relying on the pandas default, which
# differs between pandas versions.
for _clean in (train_tr_clean, test_tr_clean):
    _clean["중분류"] = _clean["중분류"].str.replace(" ", "", regex=False)

## 새로 만든 feature와 병합할 고객ID로만 이루어진 데이터프레임 생성

In [11]:
# Start each feature table from the ID column alone: training IDs come from
# the label file, test IDs from the submission template.
train_ft = train_target.loc[:, ["ID"]]
test_ft = submit.loc[:, ["ID"]]

train_ft.shape, test_ft.shape

((14940, 1), (12225, 1))

## 구매날짜를 이용한 특성생성

In [12]:
# train_ft = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
# test_ft = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

In [13]:
# Reference "today" for each data set: the day after its latest purchase date.
one_day = pd.Timedelta(days=1)
train_today = train_tr_clean["구매날짜"].max() + one_day
test_today = test_tr_clean["구매날짜"].max() + one_day
train_today, test_today

(Timestamp('2005-04-30 00:00:00'), Timestamp('2005-04-30 00:00:00'))

In [14]:
# Per-customer aggregations over the purchase-date column, given as
# (output column name, aggregator) pairs for `groupby(...).agg(...)`.
#
# Earlier experiments that were disabled as commented-out entries have been
# dropped: first/last purchase date, mean weekday, early/mid/late-month
# ratios, holiday and season-window ratios, and per-year ratios.

# Weekday labels in `dt.weekday` order (Monday == 0).
_WEEKDAY_LABELS = ("월요일", "화요일", "수요일", "목요일", "금요일", "토요일", "일요일")
# Season label -> calendar months it covers.
_SEASON_MONTHS = (
    ("봄", [3, 4, 5]),
    ("여름", [6, 7, 8]),
    ("가을", [9, 10, 11]),
    ("겨울", [1, 2, 12]),
)


def _usage_span_days(dates):
    """Inclusive number of days between the first and the last purchase."""
    return (dates.max() - dates.min()).days + 1


def _visit_day_count(dates):
    """Number of distinct calendar days with at least one purchase."""
    return dates.dt.date.nunique()


def _active_month_count(dates):
    """Number of distinct (year, month) pairs with at least one purchase."""
    return (dates.dt.year * 12 + dates.dt.month).nunique()


def _purchase_gap_std(dates):
    """Standard deviation of the day gaps between consecutive purchases.

    Purchases made on the same day contribute gaps of 0. A customer with a
    single purchase has no gaps and gets 0.
    """
    if len(dates) <= 1:
        return 0
    return np.std(np.diff([d.toordinal() for d in sorted(dates)]))


def _weekday_share(weekday):
    """Aggregator: fraction of purchases made on *weekday* (Monday == 0)."""
    return lambda x: np.mean(x.dt.weekday == weekday)


def _month_share(months):
    """Aggregator: fraction of purchases whose calendar month is in *months*."""
    months = list(months)
    return lambda x: np.mean(x.dt.month.isin(months))


def _quarter_share(quarter):
    """Aggregator: fraction of purchases made in calendar *quarter* (1-4)."""
    return lambda x: np.mean(x.dt.quarter == quarter)


agg_list = [
    # Day-level features
    ("총방문일수", "nunique"),
    ("백화점이용기간", _usage_span_days),
    ("구매주기", lambda x: int(_usage_span_days(x) / _visit_day_count(x))),
    ("주말방문일수", lambda x: x[x.dt.weekday > 4].nunique()),
    ("일별평균구매건수", lambda x: x.count() / _visit_day_count(x)),
    ("구매간격_표준편차", _purchase_gap_std),

    # Weekday features
    ("주구매요일", lambda x: x.dt.weekday.mode()[0]),
    *[
        (f"{label}_구매비율", _weekday_share(weekday))
        for weekday, label in enumerate(_WEEKDAY_LABELS)
    ],

    # Month features
    *[(f"{month}월_구매비율", _month_share([month])) for month in range(1, 13)],
    ("거래개월수", _active_month_count),
    # Divide by distinct (year, month) pairs so this agrees with 거래개월수;
    # counting calendar months alone would merge e.g. April 2004 and April 2005.
    ("월별평균구매건수", lambda x: x.count() / _active_month_count(x)),

    # Season features
    *[(f"{label}_구매비율", _month_share(months)) for label, months in _SEASON_MONTHS],

    # Quarter features
    *[(f"{quarter}분기_구매비율", _quarter_share(quarter)) for quarter in range(1, 5)],

    # Half-year features
    ("상반기_구매비율", _month_share(range(1, 7))),
    ("하반기_구매비율", _month_share(range(7, 13))),

    # Year features
    ("연도별평균구매건수", lambda x: x.count() / x.dt.year.nunique()),
]

# Add "days since last purchase" measured against the training reference day,
# aggregate the purchase dates per customer and left-join onto train_ft.
train_agg_list = [
    *agg_list,
    ("마지막구매후_경과일", lambda x: (pd.Timestamp(train_today) - x.max()).days),
]
tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(train_agg_list).reset_index()
train_ft = pd.merge(train_ft, tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,가을_구매비율,겨울_구매비율,1분기_구매비율,2분기_구매비율,3분기_구매비율,4분기_구매비율,상반기_구매비율,하반기_구매비율,연도별평균구매건수,마지막구매후_경과일
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,0.285714,0.285714,0.071429,0.285714,0.428571,0.214286,0.357143,0.642857,7.0,104
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,0.357143,0.119048,0.190476,0.238095,0.380952,0.190476,0.428571,0.571429,21.0,3
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,0.196078,0.235294,0.323529,0.392157,0.147059,0.137255,0.715686,0.284314,51.0,4
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,0.2,0.221622,0.27027,0.340541,0.167568,0.221622,0.610811,0.389189,92.5,10
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,0.305556,0.111111,0.166667,0.194444,0.527778,0.111111,0.361111,0.638889,18.0,44


In [15]:
# Same date features for the test set, with "days since last purchase"
# measured against the test reference day.
test_agg_list = [
    *agg_list,
    ("마지막구매후_경과일", lambda x: (pd.Timestamp(test_today) - x.max()).days),
]
tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(test_agg_list).reset_index()
test_ft = pd.merge(test_ft, tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,가을_구매비율,겨울_구매비율,1분기_구매비율,2분기_구매비율,3분기_구매비율,4분기_구매비율,상반기_구매비율,하반기_구매비율,연도별평균구매건수,마지막구매후_경과일
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.428571,0.0,0.0,0.428571,0.142857,0.428571,0.428571,0.571429,7.0,186
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.75,0.25,0.25,0.0,0.25,0.5,0.25,0.75,2.0,66
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,0.4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,5.0,240
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,0.375,0.083333,0.041667,0.083333,0.583333,0.291667,0.125,0.875,12.0,11
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,0.333333,0.133333,0.133333,0.266667,0.266667,0.333333,0.4,0.6,7.5,76


In [16]:
# Checkpoint: write the current train/test feature tables to the *_tmp.csv
# files under DATA_PATH (without the index column).
for _csv_path, _features in (
    (f"{DATA_PATH}train_tmp.csv", train_ft),
    (f"{DATA_PATH}test_tmp.csv", test_ft),
):
    _features.to_csv(_csv_path, index=False)

In [17]:
# Reload the feature tables from the *_tmp.csv checkpoint files.
train_ft, test_ft = (
    pd.read_csv(_csv_path)
    for _csv_path in (f"{DATA_PATH}train_tmp.csv", f"{DATA_PATH}test_tmp.csv")
)

In [18]:
def _sorted_day_gaps(dates):
    """Day differences between chronologically consecutive purchases.

    Purchases made on the same day produce gaps of 0.
    """
    ordinals = [stamp.toordinal() for stamp in sorted(dates)]
    return np.diff(ordinals)


def _gap_statistic(reducer):
    """Wrap *reducer* so it summarises one customer's purchase gaps.

    A customer with a single purchase has no gaps; the statistic is 0 then.
    """
    def statistic(dates):
        if len(dates) > 1:
            return reducer(_sorted_day_gaps(dates))
        return 0

    return statistic


# Median / mean / max / min of the day gaps between consecutive purchases.
agg_list = [
    ("구매간격_중앙값", _gap_statistic(np.median)),
    ("구매간격_평균", _gap_statistic(np.mean)),
    ("구매간격_최대값", _gap_statistic(np.max)),
    ("구매간격_최소값", _gap_statistic(np.min)),
]

# Aggregate the purchase-gap statistics per customer and left-join them onto
# the feature tables (train first, then test).
tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
train_ft = pd.merge(train_ft, tmp, how="left", on="ID")

tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
test_ft = pd.merge(test_ft, tmp, how="left", on="ID")

In [19]:
# Preview the first rows of the test feature table.
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,3분기_구매비율,4분기_구매비율,상반기_구매비율,하반기_구매비율,연도별평균구매건수,마지막구매후_경과일,구매간격_중앙값,구매간격_평균,구매간격_최대값,구매간격_최소값
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.142857,0.428571,0.428571,0.571429,7.0,186,18.5,27.166667,89,0
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.25,0.5,0.25,0.75,2.0,66,30.0,55.0,135,0
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,1.0,0.0,0.0,1.0,5.0,240,0.0,4.25,17,0
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,0.583333,0.291667,0.125,0.875,12.0,11,5.0,11.782609,62,0
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,0.266667,0.333333,0.4,0.6,7.5,76,0.0,18.857143,132,0


In [20]:
# Checkpoint: write the current train/test feature tables to the *_tmp.csv
# files under DATA_PATH (without the index column).
for _csv_path, _features in (
    (f"{DATA_PATH}train_tmp.csv", train_ft),
    (f"{DATA_PATH}test_tmp.csv", test_ft),
):
    _features.to_csv(_csv_path, index=False)

## 구매시각을 이용한 특성생성

In [21]:
# Reload the feature tables from the *_tmp.csv checkpoint files.
train_ft, test_ft = (
    pd.read_csv(_csv_path)
    for _csv_path in (f"{DATA_PATH}train_tmp.csv", f"{DATA_PATH}test_tmp.csv")
)

- 구매시간대 컬럼생성

In [22]:
# def shopping_time(df):
#     # 날짜별 쇼핑시간 계산
#     shopping_by_date = df.groupby(['ID', '구매날짜']).agg({
#         '구매시각': lambda x: (max(x).hour * 60 + max(x).minute - (min(x).hour * 60 + min(x).minute)) / 60
#     }).reset_index()
#     shopping_by_date.columns = ['ID', '구매날짜', '쇼핑시간']

#     # 쇼핑시간이 있는 경우만 통계 계산
#     shopping_stats = shopping_by_date[shopping_by_date['쇼핑시간'] > 0].groupby('ID').agg({
#         '쇼핑시간': ['mean', 'std', 'max', 'min', 'count']
#     }).reset_index()

#     shopping_stats.columns = [
#         'ID',
#         '평균쇼핑시간',
#         '쇼핑시간_표준편차',
#         '최대쇼핑시간',
#         '최소쇼핑시간',
#         '쇼핑일수'
#     ]

#     return shopping_stats

# # train/test 적용
# train_shopping = shopping_time(train_tr_clean)

# # 기존 feature에 병합
# train_ft = train_ft.merge(train_shopping, on='ID', how='left')

# # NaN값 처리
# shopping_cols = ['평균쇼핑시간', '쇼핑시간_표준편차', '최대쇼핑시간', '최소쇼핑시간', '쇼핑일수']
# train_ft[shopping_cols] = train_ft[shopping_cols].fillna(0)
# train_ft.head()

In [23]:
# test_shopping = shopping_time(test_tr_clean)
# test_ft = test_ft.merge(test_shopping, on='ID', how='left')
# test_ft[shopping_cols] = test_ft[shopping_cols].fillna(0)
# test_ft.head()

In [24]:
def _hours_of(times):
    """Hour-of-day of every purchase time, as an integer array."""
    return np.array([t.hour for t in times])


def _hour_share(predicate):
    """Aggregator: fraction of purchases whose hour satisfies *predicate*."""
    return lambda x: np.mean(predicate(_hours_of(x)))


# Per-customer time-of-day features computed from datetime.time values.
# NOTE(review): 야간구매비율 counts hour 6 (<= 6) while 새벽구매비율 stops
# before it (< 6) and 아침_구매비율 starts at it - confirm the boundary.
agg_list = [
    ("18시이전_구매비율", _hour_share(lambda h: h < 18)),
    ("18시이후_구매비율", _hour_share(lambda h: h >= 18)),
    ("오전_구매비율", _hour_share(lambda h: h < 12)),
    ("오후_구매비율", _hour_share(lambda h: h >= 12)),
    ("주구매시간대", lambda x: pd.Series(_hours_of(x)).mode()[0]),
    ("평균구매시간", lambda x: np.mean([t.hour + t.minute / 60 for t in x])),
    ("야간구매비율", _hour_share(lambda h: (h >= 18) | (h <= 6))),
    ("새벽구매비율", _hour_share(lambda h: (h >= 0) & (h < 6))),
    ("아침_구매비율", _hour_share(lambda h: (h >= 6) & (h < 11))),
    ("점심_구매비율", _hour_share(lambda h: (h >= 11) & (h < 14))),
    ("저녁_구매비율", _hour_share(lambda h: (h >= 17) & (h < 21))),
]

# Aggregate the time-of-day features per customer and left-join them onto the
# training feature table.
tmp = train_tr_clean.groupby("ID")["구매시각"].agg(agg_list).reset_index()
train_ft = pd.merge(train_ft, tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,18시이후_구매비율,오전_구매비율,오후_구매비율,주구매시간대,평균구매시간,야간구매비율,새벽구매비율,아침_구매비율,점심_구매비율,저녁_구매비율
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,0.357143,0.0,1.0,12,16.125,0.357143,0.0,0.0,0.357143,0.571429
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,0.309524,0.047619,0.952381,18,15.615873,0.309524,0.0,0.02381,0.380952,0.428571
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,0.147059,0.068627,0.931373,15,15.555392,0.147059,0.0,0.019608,0.196078,0.254902
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,0.221622,0.064865,0.935135,15,15.72,0.221622,0.0,0.010811,0.248649,0.372973
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,0.25,0.111111,0.888889,16,15.833333,0.25,0.0,0.027778,0.194444,0.361111


In [25]:
# Aggregate the time-of-day features per customer and left-join them onto the
# test feature table.
tmp = test_tr_clean.groupby("ID")["구매시각"].agg(agg_list).reset_index()
test_ft = pd.merge(test_ft, tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,18시이후_구매비율,오전_구매비율,오후_구매비율,주구매시간대,평균구매시간,야간구매비율,새벽구매비율,아침_구매비율,점심_구매비율,저녁_구매비율
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.0,0.0,1.0,14,14.078571,0.0,0.0,0.0,0.428571,0.0
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.0,0.25,0.75,11,15.095833,0.0,0.0,0.0,0.25,0.25
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,0.0,0.4,0.6,17,15.066667,0.0,0.0,0.0,0.4,0.6
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,0.541667,0.0,1.0,18,17.884722,0.541667,0.0,0.0,0.0,0.708333
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,0.466667,0.0,1.0,19,17.278889,0.466667,0.0,0.0,0.066667,0.6


In [26]:
def _is_weekday(weekday):
    """True for Monday through Friday (dt.weekday 0-4)."""
    return weekday < 5


def _is_weekend(weekday):
    """True for Saturday and Sunday (dt.weekday 5-6)."""
    return weekday >= 5


def _before_18(hour):
    """True for purchases made before 18:00."""
    return hour < 18


def _from_18(hour):
    """True for purchases made at 18:00 or later."""
    return hour >= 18


def _slot_share(day_test, hour_test):
    """Aggregator: fraction of purchases passing both the day and hour test."""
    return lambda x: np.mean(day_test(x.dt.weekday) & hour_test(x.dt.hour))


# Share of purchases in each (weekday/weekend) x (before/after 18:00) slot.
agg_list = [
    ("평일_18시이전_구매비율", _slot_share(_is_weekday, _before_18)),
    ("평일_18시이후_구매비율", _slot_share(_is_weekday, _from_18)),
    ("주말_18시이전_구매비율", _slot_share(_is_weekend, _before_18)),
    ("주말_18시이후_구매비율", _slot_share(_is_weekend, _from_18)),
]

# Aggregate the weekday x hour slot shares from the full purchase timestamp
# and left-join them onto the feature tables (train first, then test).
tmp = train_tr_clean.groupby('ID')['구매일시'].agg(agg_list).reset_index()
train_ft = pd.merge(train_ft, tmp, how='left', on='ID')

tmp = test_tr_clean.groupby('ID')['구매일시'].agg(agg_list).reset_index()
test_ft = pd.merge(test_ft, tmp, how='left', on='ID')

train_ft.shape, test_ft.shape

((14940, 60), (12225, 60))

In [27]:
# Checkpoint: write the current train/test feature tables to the *_tmp.csv
# files under DATA_PATH (without the index column).
for _csv_path, _features in (
    (f"{DATA_PATH}train_tmp.csv", train_ft),
    (f"{DATA_PATH}test_tmp.csv", test_ft),
):
    _features.to_csv(_csv_path, index=False)

## 지점을 이용한 특성생성

In [28]:
# Reload the feature tables from the *_tmp.csv checkpoint files.
train_ft, test_ft = (
    pd.read_csv(_csv_path)
    for _csv_path in (f"{DATA_PATH}train_tmp.csv", f"{DATA_PATH}test_tmp.csv")
)

In [29]:
def _main_store(stores):
    """Most frequent store code (first mode when several are tied)."""
    return stores.mode()[0]


def _main_store_share(stores):
    """Fraction of the customer's purchases made at the most frequent store."""
    at_main_store = stores == _main_store(stores)
    return at_main_store.sum() / stores.count()


# Per-customer store features: number of stores, main store and its share.
agg_list = [
    ("방문지점수", "nunique"),
    ("주구매지점", _main_store),
    ("주구매지점_이용비율", _main_store_share),
]

# Aggregate the store features per customer and left-join them onto the
# training feature table.
tmp = train_tr_clean.groupby("ID")["지점코드"].agg(agg_list).reset_index()
train_ft = pd.merge(train_ft, tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,아침_구매비율,점심_구매비율,저녁_구매비율,평일_18시이전_구매비율,평일_18시이후_구매비율,주말_18시이전_구매비율,주말_18시이후_구매비율,방문지점수,주구매지점,주구매지점_이용비율
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,0.0,0.357143,0.571429,0.5,0.285714,0.142857,0.071429,2,A112000,0.571429
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,0.02381,0.380952,0.428571,0.690476,0.285714,0.0,0.02381,3,A112000,0.738095
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,0.019608,0.196078,0.254902,0.666667,0.127451,0.186275,0.019608,2,A373000,0.901961
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,0.010811,0.248649,0.372973,0.648649,0.172973,0.12973,0.048649,3,A144000,0.810811
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,0.027778,0.194444,0.361111,0.5,0.138889,0.25,0.111111,2,A144000,0.888889


In [30]:
# Aggregate the store features per customer and left-join them onto the test
# feature table.
tmp = test_tr_clean.groupby('ID')["지점코드"].agg(agg_list).reset_index()
test_ft = pd.merge(test_ft, tmp, how='left', on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,아침_구매비율,점심_구매비율,저녁_구매비율,평일_18시이전_구매비율,평일_18시이후_구매비율,주말_18시이전_구매비율,주말_18시이후_구매비율,방문지점수,주구매지점,주구매지점_이용비율
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.0,0.428571,0.0,0.428571,0.0,0.571429,0.0,1,A202000,1.0
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.0,0.25,0.25,0.75,0.0,0.25,0.0,2,A373000,0.75
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,0.0,0.4,0.6,1.0,0.0,0.0,0.0,1,A144000,1.0
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,0.0,0.0,0.708333,0.458333,0.5,0.0,0.041667,1,A112000,1.0
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,0.0,0.066667,0.6,0.133333,0.4,0.4,0.066667,3,A112000,0.466667


In [31]:
# Checkpoint: write the current train/test feature tables to the *_tmp.csv
# files under DATA_PATH (without the index column).
for _csv_path, _features in (
    (f"{DATA_PATH}train_tmp.csv", train_ft),
    (f"{DATA_PATH}test_tmp.csv", test_ft),
):
    _features.to_csv(_csv_path, index=False)

## 브랜드코드를 이용한 특성생성

In [32]:
# Reload the feature tables from the *_tmp.csv checkpoint files.
train_ft, test_ft = (
    pd.read_csv(_csv_path)
    for _csv_path in (f"{DATA_PATH}train_tmp.csv", f"{DATA_PATH}test_tmp.csv")
)

In [33]:
def _favourite_brand(brands):
    """Most frequent brand code (first mode when several are tied)."""
    return brands.mode()[0]


def _favourite_brand_share(brands):
    """Fraction of the customer's purchases that are of the favourite brand."""
    is_favourite = brands == _favourite_brand(brands)
    return is_favourite.sum() / brands.count()


# Per-customer brand features: number of brands, favourite brand and its share.
agg_list = [
    ("브랜드코드_nunique", "nunique"),
    ("선호브랜드코드", _favourite_brand),
    ("선호브랜드코드_구매비율", _favourite_brand_share),
]

# Aggregate the brand features per customer and left-join them onto the
# training feature table.
tmp = train_tr_clean.groupby("ID")["브랜드코드"].agg(agg_list).reset_index()
train_ft = pd.merge(train_ft, tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,평일_18시이전_구매비율,평일_18시이후_구매비율,주말_18시이전_구매비율,주말_18시이후_구매비율,방문지점수,주구매지점,주구매지점_이용비율,브랜드코드_nunique,선호브랜드코드,선호브랜드코드_구매비율
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,0.5,0.285714,0.142857,0.071429,2,A112000,0.571429,13,5405,0.142857
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,0.690476,0.285714,0.0,0.02381,3,A112000,0.738095,26,5100,0.142857
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,0.666667,0.127451,0.186275,0.019608,2,A373000,0.901961,58,5159,0.117647
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,0.648649,0.172973,0.12973,0.048649,3,A144000,0.810811,99,5217,0.043243
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,0.5,0.138889,0.25,0.111111,2,A144000,0.888889,16,5100,0.472222


In [34]:
# Aggregate the brand features per customer and left-join them onto the test
# feature table.
tmp = test_tr_clean.groupby('ID')["브랜드코드"].agg(agg_list).reset_index()
test_ft = pd.merge(test_ft, tmp, how='left', on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,평일_18시이전_구매비율,평일_18시이후_구매비율,주말_18시이전_구매비율,주말_18시이후_구매비율,방문지점수,주구매지점,주구매지점_이용비율,브랜드코드_nunique,선호브랜드코드,선호브랜드코드_구매비율
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.428571,0.0,0.571429,0.0,1,A202000,1.0,5,5100,0.428571
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.75,0.0,0.25,0.0,2,A373000,0.75,4,5149,0.25
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,1.0,0.0,0.0,0.0,1,A144000,1.0,5,5111,0.2
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,0.458333,0.5,0.0,0.041667,1,A112000,1.0,18,5956,0.166667
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,0.133333,0.4,0.4,0.066667,3,A112000,0.466667,12,5100,0.133333


In [35]:
# Checkpoint: write the current train/test feature tables to the *_tmp.csv
# files under DATA_PATH (without the index column).
for _csv_path, _features in (
    (f"{DATA_PATH}train_tmp.csv", train_ft),
    (f"{DATA_PATH}test_tmp.csv", test_ft),
):
    _features.to_csv(_csv_path, index=False)

## 중분류를 이용한 특성생성

In [36]:
# Reload the feature tables from the *_tmp.csv checkpoint files.
train_ft, test_ft = (
    pd.read_csv(_csv_path)
    for _csv_path in (f"{DATA_PATH}train_tmp.csv", f"{DATA_PATH}test_tmp.csv")
)

In [37]:
# Sanity check: show the (rows, columns) of both feature tables.
train_ft.shape, test_ft.shape

((14940, 66), (12225, 66))

In [38]:
# List the distinct 중분류 values of the raw training transactions.
# NOTE(review): this inspects train_tr, not train_tr_clean, so it reflects the
# data before refund removal and whitespace stripping.
train_tr['중분류'].unique()

array(['차류', '화장잡화', '용기보증', '아동복', '전화기_카세트', '일용잡화', '수입종합화장품', '아동',
       '테이프', '수영복', '상품군미지정', '야채', '진케주얼', '단품', 'TV_VCR', '세탁기_냉장고',
       '트래디셔널', '완구(문화)', '캐릭터캐쥬얼', '주방용품', '문구_팬시', '수입부띠끄', '면류', '정육',
       '골프웨어', '유아용품', '패션시계', '골프(국내)', '진캐쥬얼', '색조화장품', '아웃도어', '양말',
       '영트랜드', '핸드백', '골프(LC)', '곡물', '행사_단품', '과자류', '니트웨어', '란제리', '완구',
       '스타킹', '향수', '수입향수', '레이디숍A', '명품', '문화', '단품_행사(트래디셔널)', '신생아',
       '팬시코너(문화)', '우산장갑', '셔츠', '영캐쥬얼', '행사', '칼라드래디셔널', '생선', '아동잡화',
       '스카프', '하이캐쥬얼', '타운웨어', '영캐주얼', '남성잡화', '캐릭터', '소형취사가전', '유아복',
       '헤어ACC', '로얄부틱2F', '타운단품', '캐리어캐쥬얼', '진캐주얼', '침구', '청과', '일반조리',
       '부띠끄', '모자', '디자이너', '캐릭터캐주얼', '캐릭터슈즈', '디자이너부띠끄', '슈즈', '냉장식품',
       '패션란제리', '수입액세서리', '손수건', '잡화', '스포츠웨어', '캐쥬얼구두', '즉석조리', '여성구두',
       '내의', '골프(수입)', '넥타이', '수입ACC', '잡화(문화)', '욕실용품', 'NB제화', 'L_B침구',
       '국산화장품', '디자이너숍', '썬그라스', '수입의류', '트레디셔널캐주얼', '준보석', '싸롱화',
       '트래디셔널캐쥬얼', '화장품', '수입캐주얼', '수입', '미씨캐릭터', '라이센스', '구두임

In [39]:
def _subcategory_share(pattern):
    """Aggregator: fraction of purchases whose 중분류 matches regex *pattern*."""
    return lambda x: x.str.contains(pattern, regex=True).mean()


# Per-customer 중분류 features: number of distinct values, the most frequent
# value, and the share of purchases falling into keyword-defined groups.
# NOTE(review): the bare "영" alternative in the first pattern also matches
# labels such as 수영복 - confirm whether that is intended.
agg_list = [
    ("중분류_nunique", "nunique"),
    ("주구매_중분류", lambda x: x.mode()[0]),

    ("주구매_중분류_영패션", _subcategory_share("영|캐쥬얼|캐주얼")),
    ('주구매_중분류_취미용품', _subcategory_share('아웃도어|골프|스포츠|취미|수예|레포츠')),
    ('주구매_중분류_아동용품', _subcategory_share("아동|유아|신생아|완구|팬시|주니어")),

    ("주구매_중분류_화장품", _subcategory_share("화장품|향수")),

    ('주구매_중분류_가전제품', _subcategory_share('TV|냉장고|취사')),
    ('주구매_중분류_가구', _subcategory_share('가구|식탁|쇼파|소파|침대|침구|홈')),
    # The pattern used to be '보석|모피,': the stray comma was part of the regex,
    # so the second alternative could never match a label such as "모피".
    ('주구매_중분류_사치품', _subcategory_share('보석|모피')),
    ("주구매_중분류_악세서리", _subcategory_share("악세사리|액세서리|보석|핸드백|장신구|시계|ACC")),
    ("주구매_중분류_명품", _subcategory_share("명품|부띠끄|로얄|부틱|엘레강스")),
    ('주구매_중분류_중장년타겟제품', _subcategory_share('건강식품|머플러|양말|도자기')),
    ("주구매_중분류_선물용품", _subcategory_share("차류|스카프|손수건|머플러")),
]

# Aggregate the 중분류 features per customer and left-join them onto the
# training feature table. The join key is spelled out so the merge cannot
# fall back to "every column name the two frames share".
tmp = train_tr_clean.groupby("ID")["중분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left', on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,주구매_중분류_취미용품,주구매_중분류_아동용품,주구매_중분류_화장품,주구매_중분류_가전제품,주구매_중분류_가구,주구매_중분류_사치품,주구매_중분류_악세서리,주구매_중분류_명품,주구매_중분류_중장년타겟제품,주구매_중분류_선물용품
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,0.214286,0.142857,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,0.071429,0.0,0.095238,0.0,0.0,0.0,0.0,0.047619,0.0,0.0
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,0.254902,0.0,0.098039,0.0,0.009804,0.0,0.029412,0.127451,0.019608,0.0
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,0.054054,0.016216,0.216216,0.010811,0.0,0.005405,0.064865,0.0,0.005405,0.0
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,0.0,0.027778,0.138889,0.0,0.0,0.0,0.0,0.166667,0.0,0.0


In [40]:
# Aggregate the 중분류 features per customer and left-join them onto the test
# feature table. The join key is spelled out so the merge cannot fall back to
# "every column name the two frames share".
tmp = test_tr_clean.groupby('ID')["중분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left', on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,주구매_중분류_취미용품,주구매_중분류_아동용품,주구매_중분류_화장품,주구매_중분류_가전제품,주구매_중분류_가구,주구매_중분류_사치품,주구매_중분류_악세서리,주구매_중분류_명품,주구매_중분류_중장년타겟제품,주구매_중분류_선물용품
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.0,0.285714,0.142857,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,0.125,0.0,0.083333,0.0,0.0,0.0,0.041667,0.0,0.041667,0.0
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Checkpoint: write the current train/test feature tables to the *_tmp.csv
# files under DATA_PATH (without the index column).
for _csv_path, _features in (
    (f"{DATA_PATH}train_tmp.csv", train_ft),
    (f"{DATA_PATH}test_tmp.csv", test_ft),
):
    _features.to_csv(_csv_path, index=False)

## 대분류를 이용한 특성생성

In [42]:
# List the distinct 대분류 values of the raw training transactions.
# NOTE(review): this inspects train_tr, not train_tr_clean, so refunded rows
# are still included.
train_tr['대분류'].unique()

array(['공산품파트', '잡화파트', '공산품', '아동_스포츠', '가정용품', '아동문화', '케주얼_구두_아동',
       '스포츠캐쥬얼', '아동', '패션잡화', '여성캐쥬얼', '생식품파트', '영어덜트캐쥬얼', '로얄부띠끄',
       '생식품', '남성정장스포츠', '명품잡화', '골프_유니캐쥬얼', '남성의류', '여성정장', '잡화',
       '여성의류파트', '영라이브', '여성캐주얼', '영플라자', '가정용품파트', '로얄부틱', '영캐릭터'],
      dtype=object)

In [43]:
# Reload the feature tables from the *_tmp.csv checkpoint files.
train_ft, test_ft = (
    pd.read_csv(_csv_path)
    for _csv_path in (f"{DATA_PATH}train_tmp.csv", f"{DATA_PATH}test_tmp.csv")
)

In [44]:
def _category_share(pattern):
    """Aggregator: fraction of purchases whose 대분류 matches regex *pattern*."""
    return lambda x: x.str.contains(pattern, regex=True).mean()


# Per-customer 대분류 features: number of distinct values, the most frequent
# value, and the share of purchases falling into keyword-defined groups.
agg_list = [
    ("대분류_nunique", "nunique"),
    ("주구매_대분류", lambda x: x.mode()[0]),

    ("주구매_대분류_아동용품", _category_share("아동")),
    ("주구매_대분류_잡화", _category_share("영|잡화")),
    ('주구매_대분류_스포츠', _category_share('골프|스포츠')),
    ("주구매_대분류_가정용품", _category_share("가정용품")),
    ("주구매_대분류_명품", _category_share("로얄|명품")),
    ("주구매_대분류_생식품", _category_share("생식품")),
]

# Aggregate the 대분류 features per customer and left-join them onto the
# training feature table. The join key is spelled out so the merge cannot
# fall back to "every column name the two frames share".
tmp = train_tr_clean.groupby("ID")["대분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left', on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,주구매_중분류_중장년타겟제품,주구매_중분류_선물용품,대분류_nunique,주구매_대분류,주구매_대분류_아동용품,주구매_대분류_잡화,주구매_대분류_스포츠,주구매_대분류_가정용품,주구매_대분류_명품,주구매_대분류_생식품
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,0.071429,0.0,9,패션잡화,0.142857,0.285714,0.214286,0.285714,0.0,0.071429
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,0.0,0.0,12,영플라자,0.02381,0.595238,0.071429,0.0,0.047619,0.02381
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,0.019608,0.0,14,명품잡화,0.156863,0.372549,0.411765,0.029412,0.264706,0.029412
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,0.005405,0.0,16,케주얼_구두_아동,0.286486,0.383784,0.108108,0.037838,0.021622,0.021622
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,0.0,0.0,8,공산품파트,0.111111,0.25,0.0,0.0,0.194444,0.194444


In [45]:
# Aggregate the 대분류 features per customer and left-join them onto the test
# feature table. The join key is spelled out so the merge cannot fall back to
# "every column name the two frames share".
tmp = test_tr_clean.groupby('ID')["대분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left', on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,주구매_중분류_중장년타겟제품,주구매_중분류_선물용품,대분류_nunique,주구매_대분류,주구매_대분류_아동용품,주구매_대분류_잡화,주구매_대분류_스포츠,주구매_대분류_가정용품,주구매_대분류_명품,주구매_대분류_생식품
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.0,0.0,5,공산품,0.285714,0.142857,0.0,0.0,0.0,0.142857
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.0,0.0,3,명품잡화,0.0,0.5,0.0,0.0,0.5,0.0
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,0.0,0.0,3,잡화파트,0.4,0.4,0.0,0.0,0.0,0.0
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,0.041667,0.0,4,영플라자,0.0,0.916667,0.041667,0.041667,0.0,0.0
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,0.0,0.0,6,여성캐주얼,0.2,0.133333,0.066667,0.0,0.0,0.0


In [46]:
# Checkpoint: write the current train/test feature tables to the *_tmp.csv
# files under DATA_PATH (without the index column).
for _csv_path, _features in (
    (f"{DATA_PATH}train_tmp.csv", train_ft),
    (f"{DATA_PATH}test_tmp.csv", test_ft),
):
    _features.to_csv(_csv_path, index=False)

## 구매가격을 이용한 특성생성

In [47]:
# Reload the feature tables from the CSV checkpoint.
train_ft, test_ft = (
    pd.read_csv(f"{DATA_PATH}train_tmp.csv"),
    pd.read_csv(f"{DATA_PATH}test_tmp.csv"),
)

- 구매가격 사분위수

In [48]:
# Upper (75%) and lower (25%) quartiles of the cleaned train purchase prices.
_purchase_price = train_tr_clean["구매가격"]
high_purchase = _purchase_price.quantile(0.75)
low_purchase = _purchase_price.quantile(0.25)

high_purchase, low_purchase

(120000.0, 26839.75)

- 환불가격 사분위수

In [49]:
# Refund rows carry negative prices, so the 0.75 quantile is the *smaller*
# refund in magnitude (low_refund) and the 0.25 quantile the larger one.
_refund_price = refund_train["구매가격"]
low_refund = _refund_price.quantile(0.75)
high_refund = _refund_price.quantile(0.25)
low_refund, high_refund

(-55000.0, -239000.0)

- 순수 구매 관련 컬럼

In [50]:
# Summary statistics of each customer's purchase prices, written as
# (output column name, aggregation) pairs.  The list is reused unchanged for
# the test transactions in the next cell.
agg_list = [
    ("총구매금액", "sum"),
    ("구매건수", "count"),
    ("평균구매금액", "mean"),
    ("구매금액_median", "median"),
    ("최대구매금액", "max"),
    ("최소구매금액", "min"),
    ("구매금액표준편차", "std"),
    ("구매금액_skew", "skew"),
    ("구매금액_kurtosis", lambda prices: prices.kurtosis()),
    # Disabled candidate features, kept for reference:
    # ("고가제품_구매비율", lambda x: np.mean(x > high_purchase)),
    # ("중가제품_구매비율", lambda x: np.mean((x >= low_purchase) & (x <= high_purchase))),
    # ("저가제품_구매비율", lambda x: np.mean(x < low_purchase)),
    # ("구매금액_변동성", lambda x: x.std() / x.mean() if x.mean() != 0 else 0),
]

_price_by_id = train_tr_clean.groupby("ID")["구매가격"]
tmp = _price_by_id.agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,주구매_대분류_생식품,총구매금액,구매건수,평균구매금액,구매금액_median,최대구매금액,최소구매금액,구매금액표준편차,구매금액_skew,구매금액_kurtosis
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,0.071429,4201200,14,300085.714286,187000.0,1000000,20000,313025.1891,1.670489,1.990929
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,0.02381,5043025,42,120072.02381,90450.0,698000,5600,126592.626144,2.651821,9.771083
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,0.029412,22485386,102,220444.960784,93500.0,2770000,4400,415371.770475,4.393399,21.956275
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,0.021622,35839762,185,193728.443243,120000.0,1492000,10000,239163.307349,2.64347,8.320466
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,0.194444,3988866,36,110801.833333,41210.5,900000,7640,179263.263965,3.01761,10.550421


In [51]:
# Purchase-price statistics for the test customers, using the same agg_list.
tmp = (
    test_tr_clean
    .groupby('ID')["구매가격"]
    .agg(agg_list)
    .reset_index()
)
test_ft = test_ft.merge(tmp, how='left', on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,주구매_대분류_생식품,총구매금액,구매건수,평균구매금액,구매금액_median,최대구매금액,최소구매금액,구매금액표준편차,구매금액_skew,구매금액_kurtosis
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.142857,429579,7,61368.428571,58536.0,110000,26643,32293.148313,0.702551,-1.103688
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.0,290600,4,72650.0,67500.0,118000,37600,33658.431336,0.856705,1.438447
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,0.0,607000,5,121400.0,95000.0,326000,38000,117233.954126,1.969409,4.075918
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,0.0,2305740,24,96072.5,55250.0,403000,7040,102440.792115,1.507265,2.087113
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,0.0,1568690,15,104579.333333,79000.0,376000,16200,99879.156875,1.839077,3.277987


- 환불된 데이터에 대한 컬럼

In [52]:
# Per-customer refund statistics as (output column name, aggregation) pairs.
# Refund prices are negative, hence "max" yields the smallest refund in
# magnitude (최소환불금액) and "min" the largest (최대환불금액).
agg_list = [
    ("환불건수", "count"),
    ("총환불금액", "sum"),
    ("평균환불금액", "mean"),
    ("최소환불금액", "max"),
    ("최대환불금액", "min"),
    ("환불금액표준편차", "std"),
    ("환불금액_skew", "skew"),
    ("환불금액_kurtosis", lambda refunds: refunds.kurtosis()),
    # Disabled candidate features, kept for reference:
    # ("고가제품환불비율", lambda x: np.mean(x < high_refund)),
    # ("중가제품환불비율", lambda x: np.mean((x >= high_refund) & (x <= low_refund))),
    # ("저가제품환불비율", lambda x: np.mean(x > low_refund)),
]

_refund_by_id = refund_train.groupby("ID")["구매가격"]
tmp = _refund_by_id.agg(agg_list).reset_index()
# fillna(0) is applied to the whole merged frame, not only the refund columns.
train_ft = pd.merge(train_ft, tmp, how="left", on="ID").fillna(0)
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,구매금액_skew,구매금액_kurtosis,환불건수,총환불금액,평균환불금액,최소환불금액,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,1.670489,1.990929,3.0,-2517000.0,-839000.0,-205000.0,-1236000.0,554857.6394,1.571433,0.0
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,2.651821,9.771083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,4.393399,21.956275,7.0,-1072500.0,-153214.285714,-4500.0,-288000.0,110664.604829,0.310886,-1.234108
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,2.64347,8.320466,14.0,-2867800.0,-204842.857143,-58000.0,-768000.0,194802.324715,-2.219921,5.204875
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,3.01761,10.550421,13.0,-6954400.0,-534953.846154,-49000.0,-1204000.0,379184.767221,-0.381901,-1.126703


In [53]:
# Refund statistics for the test customers; every NaN left in the merged
# frame is replaced by 0.
_refund_by_id = refund_test.groupby("ID")["구매가격"]
tmp = _refund_by_id.agg(agg_list).reset_index()
test_ft = pd.merge(test_ft, tmp, how="left", on="ID").fillna(0)
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,구매금액_skew,구매금액_kurtosis,환불건수,총환불금액,평균환불금액,최소환불금액,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,0.702551,-1.103688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,0.856705,1.438447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,1.969409,4.075918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,1.507265,2.087113,4.0,-1092000.0,-273000.0,-140000.0,-403000.0,137663.841779,0.014171,-5.642846
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,1.839077,3.277987,3.0,-372200.0,-124066.666667,-38000.0,-196200.0,80015.081912,0.759841,0.0


- 추가/환불 후 재구매

In [54]:
# Attach the repurchase-after-refund counts; any NaN in the merged frame becomes 0.
train_ft = pd.merge(train_ft, repurchase_count_train, how="left", on="ID")
train_ft = train_ft.fillna(0)
train_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,구매금액_kurtosis,환불건수,총환불금액,평균환불금액,최소환불금액,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,환불후재구매브랜드_count
0,train_0,9,255,28,2,1.555556,22.744789,2,0.071429,0.0,...,1.990929,3.0,-2517000.0,-839000.0,-205000.0,-1236000.0,554857.6394,1.571433,0.0,0.0
1,train_1,21,352,16,1,2.0,11.739134,3,0.047619,0.261905,...,9.771083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,train_2,55,356,6,11,1.854545,5.019987,0,0.254902,0.186275,...,21.956275,7.0,-1072500.0,-153214.285714,-4500.0,-288000.0,110664.604829,0.310886,-1.234108,0.0
3,train_3,90,345,3,19,2.055556,3.137146,4,0.075676,0.113514,...,8.320466,14.0,-2867800.0,-204842.857143,-58000.0,-768000.0,194802.324715,-2.219921,5.204875,1.0
4,train_4,24,313,13,6,1.5,11.962731,4,0.083333,0.055556,...,10.550421,13.0,-6954400.0,-534953.846154,-49000.0,-1204000.0,379184.767221,-0.381901,-1.126703,0.0


In [55]:
# Same join for the test customers; any NaN in the merged frame becomes 0.
test_ft = pd.merge(test_ft, repurchase_count_test, how="left", on="ID")
test_ft = test_ft.fillna(0)
test_ft.head()

Unnamed: 0,ID,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,주구매요일,월요일_구매비율,화요일_구매비율,...,구매금액_kurtosis,환불건수,총환불금액,평균환불금액,최소환불금액,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,환불후재구매브랜드_count
0,test_0,5,164,32,3,1.4,30.416096,1,0.142857,0.285714,...,-1.103688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_1,3,166,55,1,1.333333,57.879185,0,0.5,0.0,...,1.438447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_2,2,18,9,0,2.5,7.361216,0,0.6,0.0,...,4.075918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_3,14,272,19,1,1.714286,17.715031,3,0.083333,0.083333,...,2.087113,4.0,-1092000.0,-273000.0,-140000.0,-403000.0,137663.841779,0.014171,-5.642846,0.0
4,test_4,7,265,37,3,2.142857,36.517818,6,0.066667,0.2,...,3.277987,3.0,-372200.0,-124066.666667,-38000.0,-196200.0,80015.081912,0.759841,0.0,0.0


In [56]:
# Repurchase count divided by (refund count + repurchase count); rows where
# that denominator is 0 get a ratio of 0.
for _ft in (train_ft, test_ft):
    _repurchases = _ft["환불후재구매브랜드_count"]
    _events = _ft["환불건수"] + _repurchases
    _ft["환불후재구매비율"] = np.where(_events == 0, 0, _repurchases / _events)
train_ft["환불후재구매비율"].isnull().sum(), test_ft["환불후재구매비율"].isnull().sum()

(0, 0)

In [57]:
# Display (rows, columns) of both feature tables; the column counts should match.
train_ft.shape, test_ft.shape

((14940, 106), (12225, 106))

### 구매가격을 이용한 등급 특성 생성

In [58]:
# def categorize_customer(total_amount):
#     if total_amount >= 20_000_000:
#         return 'Diamond'
#     elif total_amount >= 15_000_000:
#         return 'Platinum'
#     elif total_amount >= 10_000_000:
#         return 'Gold'
#     elif total_amount >= 6_000_000:
#         return 'Black'
#     else:
#         return 'Normal'

# train_ft['고객등급'] = train_ft['총구매금액'].apply(categorize_customer)
# train_ft.head()

In [59]:
# test_ft['고객등급'] = test_ft['총구매금액'].apply(categorize_customer)
# test_ft.head()

In [60]:
# Checkpoint the feature tables after the price-based features.
for _frame, _csv_path in (
    (train_ft, f"{DATA_PATH}train_tmp.csv"),
    (test_ft, f"{DATA_PATH}test_tmp.csv"),
):
    _frame.to_csv(_csv_path, index=False)

## pivot_table을 이용한 특성 생성

In [61]:
# Reload the checkpointed feature tables before building the pivot features.
_train_csv = f"{DATA_PATH}train_tmp.csv"
_test_csv = f"{DATA_PATH}test_tmp.csv"
train_ft = pd.read_csv(_train_csv)
test_ft = pd.read_csv(_test_csv)

### ID, 지점코드별 구매횟수

In [62]:
# Purchase count per customer (rows) and 지점코드 (columns); missing
# combinations are 0.  The result is indexed by ID.
train_tmp = train_tr_clean.pivot_table(
    index="ID", columns="지점코드", values="구매가격", aggfunc="count", fill_value=0
)
train_tmp = train_tmp.add_prefix("pivot_지점코드_")

train_tmp

지점코드,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train_0,8,6,0,0
train_1,31,2,0,9
train_10,0,118,10,0
train_100,5,9,0,6
train_1000,0,2,0,13
...,...,...,...,...
train_9995,1,0,0,0
train_9996,1,22,0,0
train_9997,15,0,0,0
train_9998,24,0,0,5


In [63]:
# Purchase count per test customer and 지점코드; missing combinations are 0.
test_tmp = test_tr_clean.pivot_table(
    index="ID", columns="지점코드", values="구매가격", aggfunc="count", fill_value=0
)
test_tmp = test_tmp.add_prefix("pivot_지점코드_")

test_tmp

지점코드,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test_0,0,0,7,0
test_1,1,0,0,3
test_10,28,0,0,0
test_100,103,3,0,5
test_1000,0,0,0,3
...,...,...,...,...
test_9995,16,0,0,0
test_9996,0,0,44,0
test_9997,0,0,0,21
test_9998,0,0,62,0


### ID, 중분류별 구매횟수

In [64]:
# Purchase count per customer and 중분류, appended to the running pivot table.
_mid_counts = (
    train_tr_clean.pivot_table(
        index="ID", columns="중분류", values="구매가격", aggfunc="count", fill_value=0
    )
    .add_prefix("pivot_중분류_")
    .reset_index()
)
train_tmp = train_tmp.merge(_mid_counts, on="ID", how="left")

train_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_중분류_행사슈즈,pivot_중분류_행사핸드백,pivot_중분류_향수,pivot_중분류_헤어ACC,pivot_중분류_헤어악세사리,pivot_중분류_헤어액세사리,pivot_중분류_홈데코,pivot_중분류_화장잡화,pivot_중분류_화장품,pivot_중분류_훼미닌부틱
0,train_0,8,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,train_1,31,2,0,9,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,train_10,0,118,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,train_100,5,9,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,train_1000,0,2,0,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14936,train_9996,1,22,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14937,train_9997,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14938,train_9998,24,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# Purchase count per test customer and 중분류, appended to the running pivot table.
_mid_counts = (
    test_tr_clean.pivot_table(
        index="ID", columns="중분류", values="구매가격", aggfunc="count", fill_value=0
    )
    .add_prefix("pivot_중분류_")
    .reset_index()
)
test_tmp = test_tmp.merge(_mid_counts, on="ID", how="left")

test_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_중분류_행사슈즈,pivot_중분류_행사핸드백,pivot_중분류_향수,pivot_중분류_헤어ACC,pivot_중분류_헤어악세사리,pivot_중분류_헤어액세사리,pivot_중분류_홈데코,pivot_중분류_화장잡화,pivot_중분류_화장품,pivot_중분류_훼미닌부틱
0,test_0,0,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,test_1,1,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test_100,103,3,0,5,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,test_1000,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
12221,test_9996,0,0,44,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
12222,test_9997,0,0,0,21,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
12223,test_9998,0,0,62,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### ID, 중분류별 18시 이전 구매금액 (합계)

In [66]:
# Peek at the purchase-time column used for the before-18:00 filter in the following cell.
train_tr_clean['구매시각']

0         09:40:00
1         09:40:00
2         10:20:00
3         10:30:00
4         10:30:00
            ...   
523100    19:53:00
523101    19:54:00
523102    20:00:00
523103    20:00:00
523104    20:03:00
Name: 구매시각, Length: 456484, dtype: object

In [67]:
# Total purchase amount per customer (rows) and 중분류 (columns), restricted to
# purchases made before 18:00.  The hour is parsed for the whole column in a
# single vectorised call; calling pd.to_datetime once per row through .apply
# gives the same mask but is far slower on several hundred thousand rows.
_purchase_hour = pd.to_datetime(train_tr_clean['구매시각'], format='%H:%M:%S').dt.hour
before18 = train_tr_clean[_purchase_hour < 18].pivot_table(
    index='ID',
    columns='중분류',
    values='구매가격',
    aggfunc='sum',
    fill_value=0
).add_prefix("pivot_18시이전구매_")

In [68]:
# Rebuild the column labels as a plain list.  Joining the characters of a
# string returns the same string, so the labels themselves are unchanged;
# assigning a list replaces the named column Index produced by the pivot.
before18.columns = list(map("".join, before18.columns))
before18.head()

Unnamed: 0_level_0,pivot_18시이전구매_DC캐주얼,pivot_18시이전구매_GBR지원,pivot_18시이전구매_L_B침구,pivot_18시이전구매_NB제화,pivot_18시이전구매_NB핸드백,pivot_18시이전구매_N_B침구,pivot_18시이전구매_TOP디자이너,pivot_18시이전구매_TV.VTR,pivot_18시이전구매_TV_VCR,pivot_18시이전구매_TV_VTR,...,pivot_18시이전구매_행사슈즈,pivot_18시이전구매_행사핸드백,pivot_18시이전구매_향수,pivot_18시이전구매_헤어ACC,pivot_18시이전구매_헤어악세사리,pivot_18시이전구매_헤어액세사리,pivot_18시이전구매_홈데코,pivot_18시이전구매_화장잡화,pivot_18시이전구매_화장품,pivot_18시이전구매_훼미닌부틱
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
train_0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
train_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
train_10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
train_100,0,0,0,0,0,0,0,0,707000,0,...,0,0,0,0,0,0,0,0,0,0
train_1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Move ID from the index into a regular column.
before18 = before18.reset_index()
before18.head()

Unnamed: 0,ID,pivot_18시이전구매_DC캐주얼,pivot_18시이전구매_GBR지원,pivot_18시이전구매_L_B침구,pivot_18시이전구매_NB제화,pivot_18시이전구매_NB핸드백,pivot_18시이전구매_N_B침구,pivot_18시이전구매_TOP디자이너,pivot_18시이전구매_TV.VTR,pivot_18시이전구매_TV_VCR,...,pivot_18시이전구매_행사슈즈,pivot_18시이전구매_행사핸드백,pivot_18시이전구매_향수,pivot_18시이전구매_헤어ACC,pivot_18시이전구매_헤어악세사리,pivot_18시이전구매_헤어액세사리,pivot_18시이전구매_홈데코,pivot_18시이전구매_화장잡화,pivot_18시이전구매_화장품,pivot_18시이전구매_훼미닌부틱
0,train_0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,train_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,train_10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,train_100,0,0,0,0,0,0,0,0,707000,...,0,0,0,0,0,0,0,0,0,0
4,train_1000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
# Append the before-18:00 amounts.  Customers absent from before18 get NaN
# from the left join, which is then replaced by 0 across the whole frame.
train_tmp = pd.merge(train_tmp, before18, how="left", on="ID")
train_tmp = train_tmp.fillna(0)

train_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_18시이전구매_행사슈즈,pivot_18시이전구매_행사핸드백,pivot_18시이전구매_향수,pivot_18시이전구매_헤어ACC,pivot_18시이전구매_헤어악세사리,pivot_18시이전구매_헤어액세사리,pivot_18시이전구매_홈데코,pivot_18시이전구매_화장잡화,pivot_18시이전구매_화장품,pivot_18시이전구매_훼미닌부틱
0,train_0,8,6,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,train_1,31,2,0,9,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,train_10,0,118,10,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,train_100,5,9,0,6,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,train_1000,0,2,0,13,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14936,train_9996,1,22,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14937,train_9997,15,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14938,train_9998,24,0,0,5,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
# Test-side counterpart: total purchase amount per customer and 중분류 for
# purchases made before 18:00.  The hour is parsed for the whole column in a
# single vectorised call instead of one pd.to_datetime call per row.
_purchase_hour = pd.to_datetime(test_tr_clean['구매시각'], format='%H:%M:%S').dt.hour
before18 = test_tr_clean[_purchase_hour < 18].pivot_table(
    index='ID',
    columns='중분류',
    values='구매가격',
    aggfunc='sum',
    fill_value=0
).add_prefix("pivot_18시이전구매_")

# Rebuild the labels as a plain list of (unchanged) strings.
before18.columns = [ "".join(col) for col in before18.columns]

# Move ID from the index into a regular column.
before18 = before18.reset_index()
before18.head()

Unnamed: 0,ID,pivot_18시이전구매_DC캐주얼,pivot_18시이전구매_GBR지원,pivot_18시이전구매_L_B침구,pivot_18시이전구매_NB제화,pivot_18시이전구매_NB핸드백,pivot_18시이전구매_N_B침구,pivot_18시이전구매_TOP디자이너,pivot_18시이전구매_TV.VTR,pivot_18시이전구매_TV_VCR,...,pivot_18시이전구매_행사슈즈,pivot_18시이전구매_행사핸드백,pivot_18시이전구매_향수,pivot_18시이전구매_헤어ACC,pivot_18시이전구매_헤어악세사리,pivot_18시이전구매_헤어액세사리,pivot_18시이전구매_홈데코,pivot_18시이전구매_화장잡화,pivot_18시이전구매_화장품,pivot_18시이전구매_훼미닌부틱
0,test_0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,test_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,test_10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test_100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,test_1000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# Append the before-18:00 amounts to the test pivot table; NaN left by the
# left join (and anywhere else in the frame) becomes 0.
test_tmp = pd.merge(test_tmp, before18, how="left", on="ID")
test_tmp = test_tmp.fillna(0)

test_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_18시이전구매_행사슈즈,pivot_18시이전구매_행사핸드백,pivot_18시이전구매_향수,pivot_18시이전구매_헤어ACC,pivot_18시이전구매_헤어악세사리,pivot_18시이전구매_헤어액세사리,pivot_18시이전구매_홈데코,pivot_18시이전구매_화장잡화,pivot_18시이전구매_화장품,pivot_18시이전구매_훼미닌부틱
0,test_0,0,0,7,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_1,1,0,0,3,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_10,28,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_100,103,3,0,5,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,test_1000,0,0,0,3,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,145700.0,0.0
12221,test_9996,0,0,44,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,46000.0,0.0,0.0,0.0,0.0,0.0
12222,test_9997,0,0,0,21,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,30000.0,0.0,0.0,0.0,0.0
12223,test_9998,0,0,62,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### ID, 중분류별 18시 이후 구매금액 (합계)

In [73]:
# after18 = train_tr_clean[train_tr_clean['구매시각'].apply(lambda x: pd.to_datetime(x, format='%H:%M:%S').hour >= 18)].pivot_table(
#     index='ID',
#     columns='중분류',
#     values='구매가격',
#     aggfunc='sum',
#     fill_value=0
# ).add_prefix("pivot_18시이후구매_")

# after18.columns = [ "".join(col) for col in after18.columns]

# after18 = after18.reset_index()
# after18.head()

In [74]:
# train_tmp = train_tmp.merge(
#     after18,
#     on="ID",
#     how="left",
# ).fillna(0)

# train_tmp

In [75]:
# after18 = test_tr_clean[test_tr_clean['구매시각'].apply(lambda x: pd.to_datetime(x, format='%H:%M:%S').hour >= 18)].pivot_table(
#     index='ID',
#     columns='중분류',
#     values='구매가격',
#     aggfunc='sum',
#     fill_value=0
# ).add_prefix("pivot_18시이후구매_")

# after18.columns = [ "".join(col) for col in after18.columns]

# after18 = after18.reset_index()
# after18.head()

In [76]:
# test_tmp = test_tmp.merge(
#     after18,
#     on="ID",
#     how="left",
# ).fillna(0)

# test_tmp

### ID, 대분류별 구매횟수

In [77]:
# Purchase count per customer and 대분류, appended to the running pivot table.
_top_counts = (
    train_tr_clean.pivot_table(
        index="ID", columns="대분류", values="구매가격", aggfunc="count", fill_value=0
    )
    .add_prefix("pivot_대분류_")
    .reset_index()
)
train_tmp = train_tmp.merge(_top_counts, on="ID", how="left")

train_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_대분류_여성캐주얼,pivot_대분류_여성캐쥬얼,pivot_대분류_영라이브,pivot_대분류_영어덜트캐쥬얼,pivot_대분류_영캐릭터,pivot_대분류_영플라자,pivot_대분류_잡화,pivot_대분류_잡화파트,pivot_대분류_케주얼_구두_아동,pivot_대분류_패션잡화
0,train_0,8,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,3
1,train_1,31,2,0,9,0,0,0,0,0,...,6,0,0,0,4,11,0,0,1,8
2,train_10,0,118,10,0,0,0,0,0,0,...,0,0,0,0,0,0,2,19,5,0
3,train_100,5,9,0,6,0,0,0,0,0,...,0,0,0,0,0,1,0,3,4,1
4,train_1000,0,2,0,13,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14936,train_9996,1,22,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,2,12,0
14937,train_9997,15,0,0,0,0,0,0,0,0,...,2,0,0,0,2,7,0,0,0,1
14938,train_9998,24,0,0,5,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,9


In [78]:
# Purchase count per test customer and 대분류, appended to the running pivot table.
_top_counts = (
    test_tr_clean.pivot_table(
        index="ID", columns="대분류", values="구매가격", aggfunc="count", fill_value=0
    )
    .add_prefix("pivot_대분류_")
    .reset_index()
)
test_tmp = test_tmp.merge(_top_counts, on="ID", how="left")

test_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_대분류_여성캐주얼,pivot_대분류_여성캐쥬얼,pivot_대분류_영라이브,pivot_대분류_영어덜트캐쥬얼,pivot_대분류_영캐릭터,pivot_대분류_영플라자,pivot_대분류_잡화,pivot_대분류_잡화파트,pivot_대분류_케주얼_구두_아동,pivot_대분류_패션잡화
0,test_0,0,0,7,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,test_1,1,0,0,3,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,0,0,0,0,0,...,0,0,0,0,4,3,0,0,0,11
3,test_100,103,3,0,5,0,0,0,0,1,...,5,0,0,0,1,1,0,2,0,16
4,test_1000,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,0,0,0,0,0,...,2,0,0,0,3,3,0,0,0,2
12221,test_9996,0,0,44,0,0,0,0,0,0,...,0,2,4,4,0,0,4,0,0,0
12222,test_9997,0,0,0,21,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
12223,test_9998,0,0,62,0,0,0,0,0,0,...,0,6,10,1,0,0,8,0,0,0


### ID, 브랜드코드별 구매횟수
- 사용할거면 브랜드코드만 차원축소 진행

In [79]:
# train_tmp = train_tmp.merge(
#     pd.pivot_table(
#         train_tr_clean,
#         index="ID",
#         columns="브랜드코드",
#         values="구매가격",
#         aggfunc="count",
#         fill_value=0,
#     )
#     .add_prefix("pivot_브랜드코드_")
#     .reset_index(),
#     on="ID",
#     how='left'
# )

# train_tmp

In [80]:
# test_tmp = test_tmp.merge(
#     pd.pivot_table(
#         test_tr_clean,
#         index="ID",
#         columns="브랜드코드",
#         values="구매가격",
#         aggfunc="count",
#         fill_value=0,
#     )
#     .add_prefix("pivot_브랜드코드_")
#     .reset_index(),
#     on="ID",
#     how='left'
# )

# test_tmp

### ID, 구매 요일별 구매금액

In [81]:
# Total purchase amount per customer and weekday name (English names from
# day_name()), with "_sum" appended to every weekday column.
_day_names = train_tr_clean['구매날짜'].dt.day_name()
요일별_구매금액_sum = train_tr_clean.pivot_table(
    index='ID', columns=_day_names, values='구매가격', aggfunc='sum', fill_value=0
)

_renamed = [f"{day}_sum" for day in 요일별_구매금액_sum.columns]
요일별_구매금액_sum.columns = _renamed
train_tmp = pd.merge(train_tmp, 요일별_구매금액_sum, how='left', on='ID')
train_tmp.head()

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_대분류_잡화파트,pivot_대분류_케주얼_구두_아동,pivot_대분류_패션잡화,Friday_sum,Monday_sum,Saturday_sum,Sunday_sum,Thursday_sum,Tuesday_sum,Wednesday_sum
0,train_0,8,6,0,0,0,0,0,0,0,...,1,1,3,43500,320000,1430000,340000,1507400,0,560300
1,train_1,31,2,0,9,0,0,0,0,0,...,0,1,8,643400,316000,0,30000,1822450,1193000,1038175
2,train_10,0,118,10,0,0,0,0,0,0,...,19,5,0,10912013,0,124688,0,0,129640,68529
3,train_100,5,9,0,6,0,0,0,0,0,...,3,4,1,669520,73000,983800,45567,0,45000,59000
4,train_1000,0,2,0,13,0,0,0,0,0,...,1,0,0,0,29000,29000,292300,110000,62000,235200


In [82]:
# Test-side counterpart: total purchase amount per customer and weekday name.
_day_names = test_tr_clean['구매날짜'].dt.day_name()
요일별_구매금액_sum = test_tr_clean.pivot_table(
    index='ID', columns=_day_names, values='구매가격', aggfunc='sum', fill_value=0
)

_renamed = [f"{day}_sum" for day in 요일별_구매금액_sum.columns]
요일별_구매금액_sum.columns = _renamed
test_tmp = pd.merge(test_tmp, 요일별_구매금액_sum, how='left', on='ID')
test_tmp.head()

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_대분류_잡화파트,pivot_대분류_케주얼_구두_아동,pivot_대분류_패션잡화,Friday_sum,Monday_sum,Saturday_sum,Sunday_sum,Thursday_sum,Tuesday_sum,Wednesday_sum
0,test_0,0,0,7,0,0,0,0,0,0,...,0,0,0,0,100000,136643,94400,0,98536,0
1,test_1,1,0,0,3,0,0,0,0,0,...,0,0,0,0,155600,62000,0,0,0,73000
2,test_10,28,0,0,0,0,0,0,0,0,...,0,0,11,493300,62588,120000,153000,146000,1157900,133000
3,test_100,103,3,0,5,0,0,0,0,1,...,2,0,16,718347,322549,160081,572907,531251,145493,375763
4,test_1000,0,0,0,3,0,0,0,0,0,...,0,0,1,0,0,0,73000,0,0,154000


### 최종 pivot table merge

In [83]:
# Attach every pivot feature to the train feature table.
train_ft = train_ft.merge(train_tmp, on="ID", how="left")

# Give the test pivot table exactly the train columns, in the train order:
# columns that exist only in train are added filled with 0, and columns that
# exist only in test are dropped.  reindex performs this alignment in one step
# instead of inserting the missing columns one at a time in a Python loop.
test_tmp = test_tmp.reindex(columns=train_tmp.columns, fill_value=0)
test_ft = test_ft.merge(test_tmp, how="left", on="ID")

In [84]:
# Display (rows, columns) after merging the pivot features; the column counts should match.
train_ft.shape, test_ft.shape

((14940, 751), (12225, 751))

In [85]:
# Checkpoint the feature tables after the pivot features.
_train_csv = f"{DATA_PATH}train_tmp.csv"
_test_csv = f"{DATA_PATH}test_tmp.csv"
train_ft.to_csv(_train_csv, index=False)
test_ft.to_csv(_test_csv, index=False)

## 추가 피처 생성

In [86]:
# Reload the checkpointed feature tables before deriving the extra features.
train_ft, test_ft = (
    pd.read_csv(f"{DATA_PATH}train_tmp.csv"),
    pd.read_csv(f"{DATA_PATH}test_tmp.csv"),
)

In [87]:
# Display (rows, columns) of the reloaded feature tables.
train_ft.shape, test_ft.shape

((14940, 751), (12225, 751))

In [88]:
# Weekday purchase ratio = sum of the Monday..Friday ratio columns.
weekdays = [
    "월요일_구매비율",
    "화요일_구매비율",
    "수요일_구매비율",
    "목요일_구매비율",
    "금요일_구매비율",
]

for _ft in (train_ft, test_ft):
    _ft["평일_구매비율"] = _ft[weekdays].sum(axis=1)

In [89]:
# Weekend purchase ratio = sum of the Saturday and Sunday ratio columns.
weekend = [
    "토요일_구매비율",
    "일요일_구매비율",
]

for _ft in (train_ft, test_ft):
    _ft["주말_구매비율"] = _ft[weekend].sum(axis=1)

In [90]:
# Derived visit features for the train table.
# Weekend preference: weekend ratio over (weekday ratio + 1); the +1 keeps the denominator away from 0.
train_ft["주말_방문_선호도"] = train_ft["주말_구매비율"] / (train_ft["평일_구매비율"] + 1)  # +1 guards against a zero weekday ratio
# Purchases per visit day.
train_ft["방문일수_대비_구매건수"] = train_ft["구매건수"] / train_ft["총방문일수"]
# Disabled season-based candidate features, kept for reference:
# train_ft["전체_성수기_구매비율"] = train_ft[["추석성수기_구매비율", "연말연시_구매비율", "설날성수기_구매비율"]].sum(axis=1)
# train_ft["전체_준성수기_구매비율"] = train_ft[["여름시즌_구매비율", "신학기_구매비율"]].sum(axis=1)
# train_ft["전체_중간기_구매비율"] = train_ft[["여름끝추석전_구매비율", "연말연시끝설날전_구매비율", "신학기후_구매비율"]].sum(axis=1)
# train_ft["전체_비수기_구매비율"] = train_ft[["초여름_구매비율", "추석후연말전_구매비율", "봄철비수기_구매비율"]].sum(axis=1)
# train_ft["성수기_구매집중도"] = train_ft["전체_성수기_구매비율"] / (train_ft["전체_비수기_구매비율"] + 1)
# train_ft["계절_구매변동성"] = train_ft[["봄_구매비율", "여름_구매비율", "가을_구매비율", "겨울_구매비율"]].std(axis=1)

# The same two features for the test table.
test_ft["주말_방문_선호도"] = test_ft["주말_구매비율"] / (test_ft["평일_구매비율"] + 1)
test_ft["방문일수_대비_구매건수"] = test_ft["구매건수"] / test_ft["총방문일수"]
# Disabled season-based candidate features, kept for reference:
# test_ft["전체_성수기_구매비율"] = test_ft[["추석성수기_구매비율", "연말연시_구매비율", "설날성수기_구매비율"]].sum(axis=1)
# test_ft["전체_준성수기_구매비율"] = test_ft[["여름시즌_구매비율", "신학기_구매비율"]].sum(axis=1)
# test_ft["전체_중간기_구매비율"] = test_ft[["여름끝추석전_구매비율", "연말연시끝설날전_구매비율", "신학기후_구매비율"]].sum(axis=1)
# test_ft["전체_비수기_구매비율"] = test_ft[["초여름_구매비율", "추석후연말전_구매비율", "봄철비수기_구매비율"]].sum(axis=1)
# test_ft["성수기_구매집중도"] = test_ft["전체_성수기_구매비율"] / (test_ft["전체_비수기_구매비율"] + 1)
# test_ft["계절_구매변동성"] = test_ft[["봄_구매비율", "여름_구매비율", "가을_구매비율", "겨울_구매비율"]].std(axis=1)

In [91]:
# Ratio features built from the purchase/refund totals, computed identically
# for the train and test tables.
#
# A plain division writes inf (x / 0) or NaN (0 / 0) wherever the denominator
# is 0, and those values end up in the saved feature files.  Rows with a zero
# denominator are therefore set to 0, the same convention this notebook
# already uses for 환불후재구매비율.
for _ft in (train_ft, test_ft):
    _ratios = {
        # total purchase amount per day of purchase cycle
        "구매주기_대비_구매금액": (_ft["총구매금액"], _ft["구매주기"]),
        # total purchase amount per visit day
        "방문당_평균구매금액": (_ft["총구매금액"], _ft["총방문일수"]),
        # refunded amount (absolute value) relative to the purchased amount
        "구매금액_대비_환불금액": (_ft["총환불금액"].abs(), _ft["총구매금액"]),
        # number of refunds per purchase (환불건수 / 구매건수)
        "환불건수_대비_구매건수": (_ft["환불건수"], _ft["구매건수"]),
    }
    for _name, (_numerator, _denominator) in _ratios.items():
        _ft[_name] = np.where(_denominator == 0, 0, _numerator / _denominator)

# 항상 확인하기
- 학습데이터와 테스트 데이터의 피처개수는 동일해야 함

In [92]:
# Final check: train and test must have the same number of feature columns.
train_ft.shape, test_ft.shape

((14940, 759), (12225, 759))

# 추출한 피처 저장하기

In [93]:
# Persist the finished feature tables (row index not written).
_outputs = (
    (train_ft, f"{DATA_PATH}train_pvt1.csv"),
    (test_ft, f"{DATA_PATH}test_pvt1.csv"),
)
for _frame, _csv_path in _outputs:
    _frame.to_csv(_csv_path, index=False)

# 241111
- 브랜드코드 활용방법
- 앙상블 모델은 피처셀렉션 X
  - 기존 데이터 유실될 수 있음
- 앙상블 모델이 아닌 모델들에 대해서 피처셀렉션 진행
- 튜닝 없이 기본 파라미터에서 CV 점수가 높은 모델들로 보팅이나 스태킹 진행

- 모델 종류별로 나눠서 CV 점수 확인해보는것도 좋음
- catboost는 튜닝안해도됨

# 241113
- 브랜드코드를 군집화하고 해당 군집으로 피벗테이블을 만들어 보는 것도 방법
  - 어떻게 군집화?
    - merge의 조건을 브랜드코드로 해서 merge 해보기