In [2]:
DATA_PATH = "data/"
DATA_PATH

'data/'

In [3]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

## 날짜 형식 변환

In [4]:
train_tr["구매일시"] = pd.to_datetime(train_tr["구매일시"])
test_tr["구매일시"] = pd.to_datetime(test_tr["구매일시"])

## 날짜 시간 칼럼 분할

In [5]:
train_tr["구매날짜"] = train_tr["구매일시"].dt.date
test_tr["구매날짜"] = test_tr["구매일시"].dt.date

train_tr["구매시각"] = train_tr["구매일시"].dt.time
test_tr["구매시각"] = test_tr["구매일시"].dt.time

train_tr = train_tr.drop(columns=["구매일시"])
test_tr = test_tr.drop(columns=["구매일시"])

train_tr["구매날짜"] = pd.to_datetime(train_tr["구매날짜"], errors="coerce")
test_tr["구매날짜"] = pd.to_datetime(test_tr["구매날짜"], errors="coerce")

train_tr.shape, test_tr.shape

((523105, 8), (441196, 8))

## 실구매와 환불 분리하기

In [6]:
purchase_train = train_tr[train_tr["구매가격"] > 0].reset_index()
purchase_test = test_tr[test_tr["구매가격"] > 0].reset_index()

refund_train = train_tr[train_tr["구매가격"] < 0].reset_index()
refund_test = test_tr[test_tr["구매가격"] < 0].reset_index()

# 환불 금액의 절대값과 동일한 구매 내역 찾기
refund_pairs_train = pd.merge(
    refund_train,
    purchase_train,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"],
    suffixes=("_refund", "_purchase")
)

# 환불 금액이 일치하는 경우만 선택
refund_pairs_train = refund_pairs_train[
    refund_pairs_train["구매가격_refund"].abs() == refund_pairs_train["구매가격_purchase"]
].copy()

refund_train.head()

Unnamed: 0,index,ID,지점코드,대분류,중분류,브랜드코드,구매가격,구매날짜,구매시각
0,56,train_13653,A144000,케주얼_구두_아동,영트랜드,5184,-126000,2004-05-01,11:10:00
1,58,train_7200,A112000,패션잡화,수입종합화장품,5149,-38000,2004-05-01,11:10:00
2,59,train_2935,A202000,아동,아동복,5155,-39000,2004-05-01,11:10:00
3,109,train_10857,A112000,여성캐주얼,칼라드래디셔널,5248,-57500,2004-05-01,11:40:00
4,120,train_8192,A373000,여성캐주얼,영캐주얼,5263,-89000,2004-05-01,11:41:00


In [7]:
# 테스트 데이터
refund_pairs_test = pd.merge(
    refund_test,
    purchase_test,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"],
    suffixes=("_refund", "_purchase")

)

refund_pairs_test = refund_pairs_test[
    refund_pairs_test["구매가격_refund"].abs() == refund_pairs_test["구매가격_purchase"]
].copy()


In [8]:
refund_pairs_train = pd.merge(
    refund_train,
    purchase_train,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"], 
    suffixes=("_refund", "_purchase")
)

refund_pairs_test = pd.merge(
    refund_test,
    purchase_test,
    on=["ID", "지점코드", "대분류", "중분류", "브랜드코드"],
    suffixes=("_refund", "_purchase")
)

# 구매금액과 환불 금액이 일치하는 경우만 선택
refund_pairs_train = refund_pairs_train[
    refund_pairs_train["구매가격_refund"].abs() == refund_pairs_train["구매가격_purchase"]
].copy()

refund_pairs_test = refund_pairs_test[
    refund_pairs_test["구매가격_refund"].abs() == refund_pairs_test["구매가격_purchase"]
].copy()

# 환불 날짜와 시간이 구매 날짜와 시간보다 이후인 경우만 선택
refund_pairs_train = refund_pairs_train[
    (refund_pairs_train["구매날짜_refund"] > refund_pairs_train["구매날짜_purchase"]) |
    ((refund_pairs_train["구매날짜_refund"] == refund_pairs_train["구매날짜_purchase"]) & 
     (refund_pairs_train["구매시각_refund"] > refund_pairs_train["구매시각_purchase"]))
]

refund_pairs_test = refund_pairs_test[
    (refund_pairs_test["구매날짜_refund"] > refund_pairs_test["구매날짜_purchase"]) |
    ((refund_pairs_test["구매날짜_refund"] == refund_pairs_test["구매날짜_purchase"]) & 
     (refund_pairs_test["구매시각_refund"] > refund_pairs_test["구매시각_purchase"]))
]

# 환불 날짜/시각과 구매 날짜/시각의 차이를 계산
refund_pairs_train['time_diff'] = (
    pd.to_datetime(refund_pairs_train['구매날짜_refund'].astype(str) + ' ' + refund_pairs_train['구매시각_refund'].astype(str)) - 
    pd.to_datetime(refund_pairs_train['구매날짜_purchase'].astype(str) + ' ' + refund_pairs_train['구매시각_purchase'].astype(str))
)

refund_pairs_test['time_diff'] = (
    pd.to_datetime(refund_pairs_test['구매날짜_refund'].astype(str) + ' ' + refund_pairs_test['구매시각_refund'].astype(str)) - 
    pd.to_datetime(refund_pairs_test['구매날짜_purchase'].astype(str) + ' ' + refund_pairs_test['구매시각_purchase'].astype(str))
)

# _purchase 컬럼을 제외한 컬럼으로 그룹화하고 time_diff가 가장 작은 행만 선택
cols = [col for col in refund_pairs_train.columns if '_purchase' not in col and col != 'time_diff']
refund_pairs_train = refund_pairs_train.loc[refund_pairs_train.groupby(cols)['time_diff'].idxmin()]

cols = [col for col in refund_pairs_test.columns if '_purchase' not in col and col != 'time_diff']
refund_pairs_test = refund_pairs_test.loc[refund_pairs_test.groupby(cols)['time_diff'].idxmin()]

# time_diff 컬럼 제거
refund_pairs_train = refund_pairs_train.drop('time_diff', axis=1)
refund_pairs_test = refund_pairs_test.drop('time_diff', axis=1)

# 환불과 구매 인덱스 추출
index_refund_train = refund_pairs_train['index_refund'].values
index_purchase_train = refund_pairs_train['index_purchase'].values

index_refund_test = refund_pairs_test['index_refund'].values 
index_purchase_test = refund_pairs_test['index_purchase'].values

# train_tr, test_tr에서 해당 인덱스 제거
train_tr_clean = train_tr.drop(index=np.concatenate([index_refund_train, index_purchase_train]))
test_tr_clean = test_tr.drop(index=np.concatenate([index_refund_test, index_purchase_test]))

# 구매가격이 0보다 작은 행 제거
train_tr_clean = train_tr_clean[train_tr_clean['구매가격'] > 0]
test_tr_clean = test_tr_clean[test_tr_clean['구매가격'] > 0]

train_tr_clean.shape, test_tr_clean.shape

((456484, 8), (384916, 8))

In [9]:
train_ft = train_target[["ID"]]
test_ft = submit[["ID"]]

train_ft.shape, test_ft.shape

((14940, 1), (12225, 1))

## 구매 날짜를 이용한 특성 생성

In [10]:
agg_list = [
    # 일
    ("총방문일수", "nunique"),
    ("첫구매날짜", 'min'),
    ("마지막구매날짜", 'max'),
    ("백화점이용기간", lambda x: (x.max() - x.min()).days + 1),
    ("구매주기", lambda x: int(((x.max() - x.min()).days + 1) / x.dt.date.nunique())),
    ("주말방문일수", lambda x: x[x.dt.weekday > 4].nunique()),
    ("일별평균구매건수", lambda x: x.count() / x.dt.date.nunique()),
    ("구매간격_표준편차", lambda x: np.std(np.diff([d.toordinal() for d in sorted(x)])) if len(x) > 1 else 0),
    ("마지막구매후_경과일", lambda x: (pd.Timestamp('2005-04-30') - x.max()).days),
    
    # 요일
    ('평균구매요일', lambda x: x.dt.weekday.mean()),
    ("주구매요일", lambda x: x.dt.weekday.mode()[0]),
    ("월요일_구매비율", lambda x: np.mean(x.dt.weekday == 0)),
    ("화요일_구매비율", lambda x: np.mean(x.dt.weekday == 1)),
    ("수요일_구매비율", lambda x: np.mean(x.dt.weekday == 2)),
    ("목요일_구매비율", lambda x: np.mean(x.dt.weekday == 3)),
    ("금요일_구매비율", lambda x: np.mean(x.dt.weekday == 4)),
    ("토요일_구매비율", lambda x: np.mean(x.dt.weekday == 5)),
    ("일요일_구매비율", lambda x: np.mean(x.dt.weekday == 6)),
    
    # 월
    ("1월_구매비율", lambda x: np.mean(x.dt.month == 1)),
    ("2월_구매비율", lambda x: np.mean(x.dt.month == 2)),
    ("3월_구매비율", lambda x: np.mean(x.dt.month == 3)),
    ("4월_구매비율", lambda x: np.mean(x.dt.month == 4)),
    ("5월_구매비율", lambda x: np.mean(x.dt.month == 5)),
    ("6월_구매비율", lambda x: np.mean(x.dt.month == 6)),
    ("7월_구매비율", lambda x: np.mean(x.dt.month == 7)),
    ("8월_구매비율", lambda x: np.mean(x.dt.month == 8)),
    ("9월_구매비율", lambda x: np.mean(x.dt.month == 9)),
    ("10월_구매비율", lambda x: np.mean(x.dt.month == 10)),
    ("11월_구매비율", lambda x: np.mean(x.dt.month == 11)),
    ("12월_구매비율", lambda x: np.mean(x.dt.month == 12)),
    ("거래개월수", lambda x: x.dt.date.astype(str).str[:-3].nunique()),
    ("월별평균구매건수", lambda x: x.count() / x.dt.month.nunique()),
    ("월초구매비율", lambda x: np.mean(x.dt.day <= 10)),
    ("월중순구매비율", lambda x: np.mean((x.dt.day > 10) & (x.dt.day <= 20))),
    ("월말구매비율", lambda x: np.mean(x.dt.day >= 21)),
    
    ("구매횟수_상반기", lambda x: np.mean((x.dt.month >= 1) & (x.dt.month <= 6))),
    ("구매횟수_하반기", lambda x: np.mean((x.dt.month >= 7) & (x.dt.month <= 12))),
    
    # 성수기
    ("추석성수기_구매비율", lambda x: np.mean((x.dt.month == 9) & (x.dt.day >= 18) | (x.dt.month == 10) & (x.dt.day <= 3))),
    ("연말연시_구매비율", lambda x: np.mean((x.dt.month == 12) | (x.dt.month == 1) & (x.dt.day <= 10))),
    ("설날성수기_구매비율", lambda x: np.mean((x.dt.month == 2) & (x.dt.day >= 1) & (x.dt.day <= 15))),
    
    # 준성수기
    ("여름시즌_구매비율", lambda x: np.mean((x.dt.month == 6) & (x.dt.day >= 15) | (x.dt.month == 7) | (x.dt.month == 8) & (x.dt.day <= 15))),
    ("신학기_구매비율", lambda x: np.mean((x.dt.month == 3) & (x.dt.day >= 1) & (x.dt.day <= 15))),
    
    # 중간기
    ("여름끝추석전_구매비율", lambda x: np.mean((x.dt.month == 8) & (x.dt.day >= 16) | (x.dt.month == 9) & (x.dt.day <= 17))),
    ("연말연시끝설날전_구매비율", lambda x: np.mean((x.dt.month == 1) & (x.dt.day >= 11) & (x.dt.day <= 31))),
    ("신학기후_구매비율", lambda x: np.mean((x.dt.month == 3) & (x.dt.day >= 16) | (x.dt.month == 4) & (x.dt.day <= 10))),
    
    # 비수기
    ("초여름_구매비율", lambda x: np.mean((x.dt.month == 5) & (x.dt.day >= 17) | (x.dt.month == 6) & (x.dt.day <= 14))),
    ("추석후연말전_구매비율", lambda x: np.mean((x.dt.month == 10) & (x.dt.day >= 4) | (x.dt.month == 11))),
    ("봄철비수기_구매비율", lambda x: np.mean((x.dt.month == 4) & (x.dt.day >= 11) & (x.dt.day <= 29))),
    
    # 계절
    ('봄_구매비율', lambda x: np.mean(x.dt.month.isin([3,4,5]))),
    ('여름_구매비율', lambda x: np.mean(x.dt.month.isin([6,7,8]))),
    ('가을_구매비율', lambda x: np.mean(x.dt.month.isin([9,10,11]))),
    ('겨울_구매비율', lambda x: np.mean(x.dt.month.isin([1,2,12]))),
    
    #분기
    ("1분기_구매비율", lambda x: np.mean(x.dt.quarter == 1)),
    ("2분기_구매비율", lambda x: np.mean(x.dt.quarter == 2)),
    ("3분기_구매비율", lambda x: np.mean(x.dt.quarter == 3)),
    ("4분기_구매비율", lambda x: np.mean(x.dt.quarter == 4)),
]

tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,추석후연말전_구매비율,봄철비수기_구매비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,1분기_구매비율,2분기_구매비율,3분기_구매비율,4분기_구매비율
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0.0,0.0,0.071429,0.357143,0.285714,0.285714,0.071429,0.285714,0.428571,0.214286
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.119048,0.047619,0.357143,0.166667,0.357143,0.119048,0.190476,0.238095,0.380952,0.190476
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,0.107843,0.107843,0.45098,0.117647,0.196078,0.235294,0.323529,0.392157,0.147059,0.137255
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,0.140541,0.064865,0.383784,0.194595,0.2,0.221622,0.27027,0.340541,0.167568,0.221622
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,0.055556,0.0,0.111111,0.472222,0.305556,0.111111,0.166667,0.194444,0.527778,0.111111


In [11]:
tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,추석후연말전_구매비율,봄철비수기_구매비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,1분기_구매비율,2분기_구매비율,3분기_구매비율,4분기_구매비율
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,0.428571,0.0,0.285714,0.285714,0.428571,0.0,0.0,0.428571,0.142857,0.428571
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,0.5,0.0,0.0,0.0,0.75,0.25,0.25,0.0,0.25,0.5
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,0.0,0.0,0.0,0.6,0.4,0.0,0.0,0.0,1.0,0.0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,0.25,0.041667,0.083333,0.458333,0.375,0.083333,0.041667,0.083333,0.583333,0.291667
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,0.0,0.0,0.266667,0.266667,0.333333,0.133333,0.133333,0.266667,0.266667,0.333333


In [12]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## 공휴일 피처 추가

In [13]:
train_tr_clean["구매날짜"] = pd.to_datetime(train_tr_clean["구매날짜"])
test_tr_clean["구매날짜"] = pd.to_datetime(test_tr_clean["구매날짜"])

train_tr_clean["구매날짜"]

0        2004-05-01
1        2004-05-01
2        2004-05-01
3        2004-05-01
4        2004-05-01
            ...    
523100   2005-04-29
523101   2005-04-29
523102   2005-04-29
523103   2005-04-29
523104   2005-04-29
Name: 구매날짜, Length: 456484, dtype: datetime64[ns]

In [14]:
holiday = pd.to_datetime(["2004-05-05", "2004-05-26", "2004-06-06", "2004-07-17", "2004-08-15", "2004-09-27", "2004-09-28", "2004-09-29", "2004-10-03", "2004-12-25", "2005-01-01", "2005-02-08", "2005-02-09", "2005-02-10", "2005-03-01", "2005-04-05"])

In [15]:
# 새로 추가한 피처
    # 공휴일 구매비율 추가
agg_list = [
    ("공휴일_구매횟수", lambda x: np.count_nonzero(x.dt.date.isin(holiday.date))),
    ("공휴일_구매비율", lambda x: np.mean(x.dt.date.isin(holiday.date)))
]

tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,1분기_구매비율,2분기_구매비율,3분기_구매비율,4분기_구매비율,공휴일_구매횟수,공휴일_구매비율
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0.071429,0.357143,0.285714,0.285714,0.071429,0.285714,0.428571,0.214286,0,0.0
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.357143,0.166667,0.357143,0.119048,0.190476,0.238095,0.380952,0.190476,3,0.071429
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,0.45098,0.117647,0.196078,0.235294,0.323529,0.392157,0.147059,0.137255,7,0.068627
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,0.383784,0.194595,0.2,0.221622,0.27027,0.340541,0.167568,0.221622,5,0.027027
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,0.111111,0.472222,0.305556,0.111111,0.166667,0.194444,0.527778,0.111111,9,0.25


In [16]:
tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,1분기_구매비율,2분기_구매비율,3분기_구매비율,4분기_구매비율,공휴일_구매횟수,공휴일_구매비율
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,0.285714,0.285714,0.428571,0.0,0.0,0.428571,0.142857,0.428571,0,0.0
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,0.0,0.0,0.75,0.25,0.25,0.0,0.25,0.5,0,0.0
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,0.0,0.6,0.4,0.0,0.0,0.0,1.0,0.0,0,0.0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,0.083333,0.458333,0.375,0.083333,0.041667,0.083333,0.583333,0.291667,0,0.0
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,0.266667,0.266667,0.333333,0.133333,0.133333,0.266667,0.266667,0.333333,5,0.333333


In [17]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

In [18]:
train_ft.to_csv(f"{DATA_PATH}train_tmp_1113.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp_1113.csv",index=False)

## 구매 시각을 이용한 특성

In [19]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp_1113.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp_1113.csv")

train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,1분기_구매비율,2분기_구매비율,3분기_구매비율,4분기_구매비율,공휴일_구매횟수,공휴일_구매비율
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0.071429,0.357143,0.285714,0.285714,0.071429,0.285714,0.428571,0.214286,0,0.0
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.357143,0.166667,0.357143,0.119048,0.190476,0.238095,0.380952,0.190476,3,0.071429
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,0.45098,0.117647,0.196078,0.235294,0.323529,0.392157,0.147059,0.137255,7,0.068627
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,0.383784,0.194595,0.2,0.221622,0.27027,0.340541,0.167568,0.221622,5,0.027027
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,0.111111,0.472222,0.305556,0.111111,0.166667,0.194444,0.527778,0.111111,9,0.25


In [20]:
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,1분기_구매비율,2분기_구매비율,3분기_구매비율,4분기_구매비율,공휴일_구매횟수,공휴일_구매비율
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0.071429,0.357143,0.285714,0.285714,0.071429,0.285714,0.428571,0.214286,0,0.0
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.357143,0.166667,0.357143,0.119048,0.190476,0.238095,0.380952,0.190476,3,0.071429
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,0.45098,0.117647,0.196078,0.235294,0.323529,0.392157,0.147059,0.137255,7,0.068627
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,0.383784,0.194595,0.2,0.221622,0.27027,0.340541,0.167568,0.221622,5,0.027027
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,0.111111,0.472222,0.305556,0.111111,0.166667,0.194444,0.527778,0.111111,9,0.25


In [21]:
agg_list = [
    ("18시이전_구매비율", lambda x: np.mean([t.hour < 18 for t in x])),
    ("18시이후_구매비율", lambda x: np.mean([t.hour >= 18 for t in x])),
    ("오전_구매비율", lambda x: np.mean([t.hour < 12 for t in x])),
    ("오후_구매비율", lambda x: np.mean([t.hour >= 12 for t in x])),
    ("주구매시간대", lambda x: pd.Series([t.hour for t in x]).mode()[0]),
    ("평균구매시간", lambda x: int(np.mean([t.hour + t.minute/60 for t in x]))), # 범주형으로 쪼갤거임
    ("야간구매비율", lambda x: np.mean([(t.hour >= 18) | (t.hour <= 6) for t in x])),
    ("새벽구매비율", lambda x: np.mean([(t.hour >= 0) & (t.hour < 6) for t in x])),
    ("아침_구매비율", lambda x: np.mean([6 <= t.hour < 11 for t in x])),
    ("점심_구매비율", lambda x: np.mean([11 <= t.hour < 14 for t in x])),
    ("저녁_구매비율", lambda x: np.mean([17 <= t.hour < 21 for t in x])),
]

tmp = train_tr_clean.groupby("ID")["구매시각"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,18시이후_구매비율,오전_구매비율,오후_구매비율,주구매시간대,평균구매시간,야간구매비율,새벽구매비율,아침_구매비율,점심_구매비율,저녁_구매비율
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0.357143,0.0,1.0,12,16,0.357143,0.0,0.0,0.357143,0.571429
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.309524,0.047619,0.952381,18,15,0.309524,0.0,0.02381,0.380952,0.428571
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,0.147059,0.068627,0.931373,15,15,0.147059,0.0,0.019608,0.196078,0.254902
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,0.221622,0.064865,0.935135,15,15,0.221622,0.0,0.010811,0.248649,0.372973
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,0.25,0.111111,0.888889,16,15,0.25,0.0,0.027778,0.194444,0.361111


In [22]:
tmp = test_tr_clean.groupby("ID")["구매시각"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,18시이후_구매비율,오전_구매비율,오후_구매비율,주구매시간대,평균구매시간,야간구매비율,새벽구매비율,아침_구매비율,점심_구매비율,저녁_구매비율
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,0.0,0.0,1.0,14,14,0.0,0.0,0.0,0.428571,0.0
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,0.0,0.25,0.75,11,15,0.0,0.0,0.0,0.25,0.25
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,0.0,0.4,0.6,17,15,0.0,0.0,0.0,0.4,0.6
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,0.541667,0.0,1.0,18,17,0.541667,0.0,0.0,0.0,0.708333
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,0.466667,0.0,1.0,19,17,0.466667,0.0,0.0,0.066667,0.6


In [23]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## 지점을 이용한 특성 생성

### 지점 다양성 추가

In [24]:
agg_list = [
    ("방문지점수", "nunique"),
    ("주구매지점", lambda x: x.mode()[0]),
    ("주구매지점_이용비율", lambda x: x[x == x.mode()[0]].count() / x.count()),
    ("지점별_구매다양성", lambda x: x.nunique() / x.count()) # 추가
]

tmp = train_tr_clean.groupby("ID")["지점코드"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,평균구매시간,야간구매비율,새벽구매비율,아침_구매비율,점심_구매비율,저녁_구매비율,방문지점수,주구매지점,주구매지점_이용비율,지점별_구매다양성
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,16,0.357143,0.0,0.0,0.357143,0.571429,2,A112000,0.571429,0.142857
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,15,0.309524,0.0,0.02381,0.380952,0.428571,3,A112000,0.738095,0.071429
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,15,0.147059,0.0,0.019608,0.196078,0.254902,2,A373000,0.901961,0.019608
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,15,0.221622,0.0,0.010811,0.248649,0.372973,3,A144000,0.810811,0.016216
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,15,0.25,0.0,0.027778,0.194444,0.361111,2,A144000,0.888889,0.055556


In [25]:
tmp = test_tr_clean.groupby('ID')["브랜드코드"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left',on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,평균구매시간,야간구매비율,새벽구매비율,아침_구매비율,점심_구매비율,저녁_구매비율,방문지점수,주구매지점,주구매지점_이용비율,지점별_구매다양성
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,14,0.0,0.0,0.0,0.428571,0.0,5,5100,0.428571,0.714286
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,15,0.0,0.0,0.0,0.25,0.25,4,5149,0.25,1.0
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,15,0.0,0.0,0.0,0.4,0.6,5,5111,0.2,1.0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,17,0.541667,0.0,0.0,0.0,0.708333,18,5956,0.166667,0.75
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,17,0.466667,0.0,0.0,0.066667,0.6,12,5100,0.133333,0.8


In [26]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## 브랜드 코드 피처 생성

In [27]:
agg_list = [
    ("브랜드코드_nunique", "nunique"),
    ("선호브랜드코드", lambda x: x.mode()[0]),
    ("선호브랜드코드_구매비율", lambda x: x[x == x.mode()[0]].count() / x.count()),
    ("브랜드별_구매다양성", lambda x: x.nunique() / x.count())
]

tmp = train_tr_clean.groupby("ID")["브랜드코드"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,점심_구매비율,저녁_구매비율,방문지점수,주구매지점,주구매지점_이용비율,지점별_구매다양성,브랜드코드_nunique,선호브랜드코드,선호브랜드코드_구매비율,브랜드별_구매다양성
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0.357143,0.571429,2,A112000,0.571429,0.142857,13,5405,0.142857,0.928571
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.380952,0.428571,3,A112000,0.738095,0.071429,26,5100,0.142857,0.619048
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,0.196078,0.254902,2,A373000,0.901961,0.019608,58,5159,0.117647,0.568627
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,0.248649,0.372973,3,A144000,0.810811,0.016216,99,5217,0.043243,0.535135
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,0.194444,0.361111,2,A144000,0.888889,0.055556,16,5100,0.472222,0.444444


In [28]:
tmp = test_tr_clean.groupby('ID')["브랜드코드"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left',on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,점심_구매비율,저녁_구매비율,방문지점수,주구매지점,주구매지점_이용비율,지점별_구매다양성,브랜드코드_nunique,선호브랜드코드,선호브랜드코드_구매비율,브랜드별_구매다양성
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,0.428571,0.0,5,5100,0.428571,0.714286,5,5100,0.428571,0.714286
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,0.25,0.25,4,5149,0.25,1.0,4,5149,0.25,1.0
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,0.4,0.6,5,5111,0.2,1.0,5,5111,0.2,1.0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,0.0,0.708333,18,5956,0.166667,0.75,18,5956,0.166667,0.75
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,0.066667,0.6,12,5100,0.133333,0.8,12,5100,0.133333,0.8


In [29]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## 주구매 중분류 특성 생성

In [30]:
agg_list = [
    ("중분류_nunique", "nunique"),
    ("주구매_중분류", lambda x: x.mode()[0]),
    
    ('주구매_중분류_아동용품', lambda x: (x.str.contains("아동|유아|신생아|완구|팬시|주니어", regex=True)).mean()),
    ('주구매_중분류_취미용품', lambda x: (x.str.contains('아웃도어|골프|스포츠|취미|수예|레포츠', regex=True)).mean()),
    ('주구매_중분류_가전제품', lambda x: (x.str.contains('TV|냉장고|취사|가전', regex=True)).mean()),
    ('주구매_중분류_가구', lambda x: (x.str.contains('가구|식탁|쇼파|소파|침대|침구|홈|사무', regex=True)).mean()),
    ('주구매_중분류_사치품', lambda x: (x.str.contains('보석|모피,', regex=True)).mean()),
    ("주구매_중분류_악세서리", lambda x: (x.str.contains("악세사리|액세서리|보석|핸드백|장신구|시계|ACC", regex=True)).mean()),
    ("주구매_중분류_명품", lambda x: (x.str.contains("명품|부띠끄|로얄|부틱|엘레강스", regex=True)).mean()),
    ("주구매_중분류_화장품", lambda x: (x.str.contains("화장품|향수", regex=True)).mean()),
    ("주구매_중분류_영패션", lambda x: (x.str.contains("영|캐쥬얼", regex=True)).mean()),
    ('주구매_중분류_중장년타겟제품', lambda x: (x.str.contains('건강식품|머플러|양말|도자기', regex=True)).mean()),
    
    # 추가된 부분
    ('주구매_중분류_주방용품', lambda x: (x.str.contains('용기보증|주방|식기', regex=True)).mean()),
    ('주구매_중분류_미씨패션', lambda x: (x.str.contains('미씨|어덜트', regex=True)).mean())
]

tmp = train_tr_clean.groupby("ID")["중분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left')
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,주구매_중분류_가전제품,주구매_중분류_가구,주구매_중분류_사치품,주구매_중분류_악세서리,주구매_중분류_명품,주구매_중분류_화장품,주구매_중분류_영패션,주구매_중분류_중장년타겟제품,주구매_중분류_주방용품,주구매_중분류_미씨패션
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.071429,0.0,0.0
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.0,0.0,0.0,0.0,0.047619,0.095238,0.190476,0.0,0.02381,0.0
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,0.0,0.009804,0.0,0.029412,0.127451,0.098039,0.098039,0.019608,0.039216,0.0
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,0.010811,0.0,0.005405,0.064865,0.0,0.216216,0.210811,0.005405,0.021622,0.005405
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,0.0,0.0,0.0,0.0,0.166667,0.138889,0.027778,0.0,0.194444,0.0


In [31]:
tmp = test_tr_clean.groupby('ID')["중분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left')
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,주구매_중분류_가전제품,주구매_중분류_가구,주구매_중분류_사치품,주구매_중분류_악세서리,주구매_중분류_명품,주구매_중분류_화장품,주구매_중분류_영패션,주구매_중분류_중장년타겟제품,주구매_중분류_주방용품,주구매_중분류_미씨패션
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.428571,0.0
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,0.0,0.0,0.0,0.0,0.0,0.4,0.2,0.0,0.0,0.0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,0.0,0.0,0.0,0.041667,0.0,0.083333,0.25,0.041667,0.0,0.0
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,0.0,0.0,0.0,0.0,0.0,0.066667,0.266667,0.0,0.066667,0.0


In [32]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

In [33]:
agg_list = [
    ("중분류별_구매다양성", lambda x: x.nunique() / x.count())
]

tmp = train_tr_clean.groupby("ID")["중분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left')
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,주구매_중분류_가구,주구매_중분류_사치품,주구매_중분류_악세서리,주구매_중분류_명품,주구매_중분류_화장품,주구매_중분류_영패션,주구매_중분류_중장년타겟제품,주구매_중분류_주방용품,주구매_중분류_미씨패션,중분류별_구매다양성
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0.0,0.0,0.0,0.0,0.071429,0.0,0.071429,0.0,0.0,0.785714
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.0,0.0,0.0,0.047619,0.095238,0.190476,0.0,0.02381,0.0,0.571429
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,0.009804,0.0,0.029412,0.127451,0.098039,0.098039,0.019608,0.039216,0.0,0.343137
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,0.0,0.005405,0.064865,0.0,0.216216,0.210811,0.005405,0.021622,0.005405,0.324324
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,0.0,0.0,0.0,0.166667,0.138889,0.027778,0.0,0.194444,0.0,0.527778


In [34]:
tmp = test_tr_clean.groupby('ID')["중분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left')
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,주구매_중분류_가구,주구매_중분류_사치품,주구매_중분류_악세서리,주구매_중분류_명품,주구매_중분류_화장품,주구매_중분류_영패션,주구매_중분류_중장년타겟제품,주구매_중분류_주방용품,주구매_중분류_미씨패션,중분류별_구매다양성
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.428571,0.0,0.571429
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,1.0
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,0.0,0.0,0.0,0.0,0.4,0.2,0.0,0.0,0.0,0.8
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,0.0,0.0,0.041667,0.0,0.083333,0.25,0.041667,0.0,0.0,0.458333
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,0.0,0.0,0.0,0.0,0.066667,0.266667,0.0,0.066667,0.0,0.733333


In [35]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## 대분류를 이용한 특성 생성

In [36]:
agg_list = [
    ("대분류_nunique", "nunique"),
    ("주구매_대분류", lambda x: x.mode()[0]),
    
    ("주구매_대분류_아동용품", lambda x: (x.str.contains("아동")).mean()),
    ("주구매_대분류_잡화", lambda x: (x.str.contains("잡화", regex=True)).mean()),
    ('주구매_대분류_스포츠', lambda x: (x.str.contains('골프|스포츠', regex=True)).mean()),
    ("주구매_대분류_가정용품", lambda x: (x.str.contains("가정용품")).mean()),
    ("주구매_대분류_명품", lambda x: (x.str.contains("로얄|명품", regex=True)).mean()),
    ("주구매_대분류_생식품", lambda x: (x.str.contains("생식품", regex=True)).mean()),
    ("주구매_대분류_정장", lambda x: (x.str.contains("정장", regex=True)).mean()),

    # 영만 따로 분류
    ("주구매_대분류_영", lambda x: (x.str.contains("영", regex=True)).mean()),
]

tmp = train_tr_clean.groupby("ID")["대분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left')
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,대분류_nunique,주구매_대분류,주구매_대분류_아동용품,주구매_대분류_잡화,주구매_대분류_스포츠,주구매_대분류_가정용품,주구매_대분류_명품,주구매_대분류_생식품,주구매_대분류_정장,주구매_대분류_영
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,9,패션잡화,0.142857,0.285714,0.214286,0.285714,0.0,0.071429,0.142857,0.0
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,12,영플라자,0.02381,0.238095,0.071429,0.0,0.047619,0.02381,0.02381,0.357143
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,14,명품잡화,0.156863,0.372549,0.411765,0.029412,0.264706,0.029412,0.04902,0.0
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,16,케주얼_구두_아동,0.286486,0.335135,0.108108,0.037838,0.021622,0.021622,0.081081,0.048649
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,8,공산품파트,0.111111,0.25,0.0,0.0,0.194444,0.194444,0.0,0.0


In [37]:
tmp = test_tr_clean.groupby('ID')["대분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left')
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,대분류_nunique,주구매_대분류,주구매_대분류_아동용품,주구매_대분류_잡화,주구매_대분류_스포츠,주구매_대분류_가정용품,주구매_대분류_명품,주구매_대분류_생식품,주구매_대분류_정장,주구매_대분류_영
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,5,공산품,0.285714,0.142857,0.0,0.0,0.0,0.142857,0.0,0.0
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,3,명품잡화,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,3,잡화파트,0.4,0.4,0.0,0.0,0.0,0.0,0.0,0.0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,4,영플라자,0.0,0.291667,0.041667,0.041667,0.0,0.0,0.0,0.625
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,6,여성캐주얼,0.2,0.066667,0.066667,0.0,0.0,0.0,0.0,0.066667


In [38]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## 구매 가격에 대한 특성 생성

In [39]:
high_purchase_tr = train_tr_clean["구매가격"].quantile(0.75)
low_purchase_tr = train_tr_clean["구매가격"].quantile(0.25)
low_refund_tr, high_refund_tr = refund_train["구매가격"].quantile(0.75), refund_train["구매가격"].quantile(0.25)


In [40]:
# 중가제품 구매비율, 구매금액 변동성 추가
# 고가제품 구매건수 삭제

agg_list = [
    ("총구매금액", "sum"),
    ("구매건수", "count"),
    ("평균구매금액", "mean"),
    ("구매금액_median", 'median'),
    ("최대구매금액", 'max'),
    ("최소구매금액", 'min'),
    ("구매금액표준편차", 'std'),
    ("구매금액_skew", 'skew'),
    ("구매금액_kurtosis", lambda x: x.kurtosis()),

    ("고가제품_구매비율", lambda x: np.mean(x > high_purchase_tr)),
    ("중가제품_구매비율", lambda x: np.mean((x >= low_purchase_tr) & (x <= high_purchase_tr))),
    ("저가제품_구매비율", lambda x: np.mean(x < low_purchase_tr)),

    ("구매금액_변동성", lambda x: x.std() / x.mean() if x.mean() != 0 else 0), 

    # 저중고가제품 구매횟수 추가
    ("고가제품_구매횟수", lambda x: np.sum(x > high_purchase_tr)),
    ("중가제품_구매횟수", lambda x: np.sum((x >= low_purchase_tr) & (x <= high_purchase_tr))),
    ("저가제품_구매횟수", lambda x: np.sum(x < low_purchase_tr)),
]

tmp = train_tr_clean.groupby("ID")["구매가격"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, "left", "ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,구매금액표준편차,구매금액_skew,구매금액_kurtosis,고가제품_구매비율,중가제품_구매비율,저가제품_구매비율,구매금액_변동성,고가제품_구매횟수,중가제품_구매횟수,저가제품_구매횟수
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,313025.1891,1.670489,1.990929,0.714286,0.214286,0.071429,1.043119,10,3,1
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,126592.626144,2.651821,9.771083,0.333333,0.547619,0.119048,1.054306,14,23,5
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,415371.770475,4.393399,21.956275,0.441176,0.421569,0.137255,1.884243,45,43,14
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,239163.307349,2.64347,8.320466,0.491892,0.427027,0.081081,1.234529,91,79,15
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,179263.263965,3.01761,10.550421,0.194444,0.472222,0.333333,1.617873,7,17,12


In [41]:
high_purchase_te = test_tr_clean["구매가격"].quantile(0.75)
low_purchase_te = test_tr_clean["구매가격"].quantile(0.25)
low_refund_te, high_refund_te = refund_test["구매가격"].quantile(0.75), refund_test["구매가격"].quantile(0.25)


In [42]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(3030, 0)

In [43]:
mask = train_ft.isnull().sum() > 0
train_ft.isnull().sum()[mask]

구매금액표준편차          405
구매금액_skew         880
구매금액_kurtosis    1340
구매금액_변동성          405
dtype: int64

In [44]:
train_ft["구매금액_변동성"].isnull().sum()

405

In [45]:
# 중가제품 구매비율, 구매금액 변동성 추가
# 고가제품 구매건수 삭제

agg_list = [
    ("총구매금액", "sum"),
    ("구매건수", "count"),
    ("평균구매금액", "mean"),
    ("구매금액_median", 'median'),
    ("최대구매금액", 'max'),
    ("최소구매금액", 'min'),
    ("구매금액표준편차", 'std'),
    ("구매금액_skew", 'skew'),
    ("구매금액_kurtosis", lambda x: x.kurtosis()),

    ("고가제품_구매비율", lambda x: np.mean(x > high_purchase_te)),
    ("중가제품_구매비율", lambda x: np.mean((x >= low_purchase_te) & (x <= high_purchase_te))),
    ("저가제품_구매비율", lambda x: np.mean(x < low_purchase_te)),

    ("구매금액_변동성", lambda x: x.std() / x.mean() if x.mean() != 0 else 0), 

    # 저중고가제품 구매횟수 추가
    ("고가제품_구매횟수", lambda x: np.sum(x > high_purchase_te)),
    ("중가제품_구매횟수", lambda x: np.sum((x >= low_purchase_te) & (x <= high_purchase_te))),
    ("저가제품_구매횟수", lambda x: np.sum(x < low_purchase_te)),
]

tmp = test_tr_clean.groupby('ID')["구매가격"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, 'left', "ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,구매금액표준편차,구매금액_skew,구매금액_kurtosis,고가제품_구매비율,중가제품_구매비율,저가제품_구매비율,구매금액_변동성,고가제품_구매횟수,중가제품_구매횟수,저가제품_구매횟수
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,32293.148313,0.702551,-1.103688,0.0,1.0,0.0,0.526218,0,7,0
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,33658.431336,0.856705,1.438447,0.0,1.0,0.0,0.463296,0,4,0
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,117233.954126,1.969409,4.075918,0.2,0.8,0.0,0.965683,1,4,0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,102440.792115,1.507265,2.087113,0.291667,0.291667,0.416667,1.066286,7,7,10
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,99879.156875,1.839077,3.277987,0.266667,0.6,0.133333,0.955056,4,9,2


In [46]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(3030, 2161)

## 환불건수에 대한 특성 추가

In [47]:
# 중가제품환불비율 추가
# 총환불금액, 평균환불금액 추가

agg_list = [
    ("환불건수", "count"),
    ("총환불금액", "sum"),
    ("평균환불금액", "mean"),
    ("최소환불금액", "max"),
    ("최대환불금액", "min"),
    ("환불금액표준편차", "std"),
    ("환불금액_skew", "skew"), 
    ("환불금액_kurtosis", lambda x: x.kurtosis()),
    ("고가제품환불비율", lambda x: np.mean(x < high_refund_tr)),
    ("중가제품환불비율", lambda x: np.mean((x >= high_refund_tr) & (x <= low_refund_tr))),
    ("저가제품환불비율", lambda x: np.mean(x > low_refund_tr)),

    # 환불횟수 추가
    ("고가제품환불횟수", lambda x: np.sum(x < high_refund_tr)),
    ("중가제품환불횟수", lambda x: np.sum((x >= high_refund_tr) & (x <= low_refund_tr))),
    ("저가제품환불횟수", lambda x: np.sum(x > low_refund_tr)),
]

tmp = refund_train.groupby("ID")["구매가격"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, "left", "ID").fillna(0)
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율,고가제품환불횟수,중가제품환불횟수,저가제품환불횟수
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,-1236000.0,554857.6394,1.571433,0.0,0.666667,0.333333,0.0,2.0,1.0,0.0
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,-288000.0,110664.604829,0.310886,-1.234108,0.285714,0.428571,0.285714,2.0,3.0,2.0
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,-768000.0,194802.324715,-2.219921,5.204875,0.214286,0.785714,0.0,3.0,11.0,0.0
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,-1204000.0,379184.767221,-0.381901,-1.126703,0.692308,0.230769,0.076923,9.0,3.0,1.0


In [48]:
agg_list = [
    ("환불건수", "count"),
    ("총환불금액", "sum"),
    ("평균환불금액", "mean"),
    ("최소환불금액", "max"),
    ("최대환불금액", "min"),
    ("환불금액표준편차", "std"),
    ("환불금액_skew", "skew"), 
    ("환불금액_kurtosis", lambda x: x.kurtosis()),
    ("고가제품환불비율", lambda x: np.mean(x < high_refund_te)),
    ("중가제품환불비율", lambda x: np.mean((x >= high_refund_te) & (x <= low_refund_te))),
    ("저가제품환불비율", lambda x: np.mean(x > low_refund_te)),

    # 환불횟수 추가
    ("고가제품환불횟수", lambda x: np.sum(x < high_refund_te)),
    ("중가제품환불횟수", lambda x: np.sum((x >= high_refund_te) & (x <= low_refund_te))),
    ("저가제품환불횟수", lambda x: np.sum(x > low_refund_te)),
]

tmp = test_tr_clean.groupby('ID')["구매가격"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, 'left', "ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,최대환불금액,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율,고가제품환불횟수,중가제품환불횟수,저가제품환불횟수
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,26643,32293.148313,0.702551,-1.103688,0.0,0.0,1.0,0,0,7
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,37600,33658.431336,0.856705,1.438447,0.0,0.0,1.0,0,0,4
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,38000,117233.954126,1.969409,4.075918,0.0,0.0,1.0,0,0,5
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,7040,102440.792115,1.507265,2.087113,0.0,0.0,1.0,0,0,24
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,16200,99879.156875,1.839077,3.277987,0.0,0.0,1.0,0,0,15


In [49]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 4060)

In [50]:
mask = test_ft.isnull().sum() > 0
test_ft.isnull().sum()[mask]

구매금액표준편차          262
구매금액_skew         631
구매금액_kurtosis    1006
구매금액_변동성          262
환불금액표준편차          262
환불금액_skew         631
환불금액_kurtosis    1006
dtype: int64

- 고객등급 특성 생성

In [51]:
def categorize_customer(total_amount):
    if total_amount >= 20_000_000:
        return 'Diamond'
    elif total_amount >= 15_000_000:
        return 'Platinum'
    elif total_amount >= 10_000_000:
        return 'Gold'
    elif total_amount >= 6_000_000:
        return 'Black'
    else:
        return 'Normal'
    
train_ft['고객등급'] = train_ft['총구매금액'].apply(categorize_customer)
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율,고가제품환불횟수,중가제품환불횟수,저가제품환불횟수,고객등급
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,554857.6394,1.571433,0.0,0.666667,0.333333,0.0,2.0,1.0,0.0,Normal
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,110664.604829,0.310886,-1.234108,0.285714,0.428571,0.285714,2.0,3.0,2.0,Diamond
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,194802.324715,-2.219921,5.204875,0.214286,0.785714,0.0,3.0,11.0,0.0,Diamond
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,379184.767221,-0.381901,-1.126703,0.692308,0.230769,0.076923,9.0,3.0,1.0,Normal


In [52]:
test_ft['고객등급'] = test_ft['총구매금액'].apply(categorize_customer)
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,환불금액표준편차,환불금액_skew,환불금액_kurtosis,고가제품환불비율,중가제품환불비율,저가제품환불비율,고가제품환불횟수,중가제품환불횟수,저가제품환불횟수,고객등급
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,32293.148313,0.702551,-1.103688,0.0,0.0,1.0,0,0,7,Normal
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,33658.431336,0.856705,1.438447,0.0,0.0,1.0,0,0,4,Normal
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,117233.954126,1.969409,4.075918,0.0,0.0,1.0,0,0,5,Normal
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,102440.792115,1.507265,2.087113,0.0,0.0,1.0,0,0,24,Normal
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,99879.156875,1.839077,3.277987,0.0,0.0,1.0,0,0,15,Normal


In [53]:
train_ft.to_csv(f"{DATA_PATH}train_tmp_1113.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp_1113.csv",index=False)

## pivot table

In [54]:
train_ft = pd.read_csv(f"{DATA_PATH}train_tmp_1113.csv")
test_ft = pd.read_csv(f"{DATA_PATH}test_tmp_1113.csv")

train_ft.shape, test_ft.shape

((14940, 134), (12225, 134))

In [55]:
train_tmp = pd.pivot_table(
    train_tr_clean,
    index="ID",
    columns="지점코드",
    values="구매가격",
    aggfunc="count",
    fill_value=0,
).add_prefix("pivot_지점코드_")

train_tmp

지점코드,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train_0,8,6,0,0
train_1,31,2,0,9
train_10,0,118,10,0
train_100,5,9,0,6
train_1000,0,2,0,13
...,...,...,...,...
train_9995,1,0,0,0
train_9996,1,22,0,0
train_9997,15,0,0,0
train_9998,24,0,0,5


In [56]:
test_tmp = pd.pivot_table(
    test_tr_clean,
    index="ID",
    columns="지점코드",
    values="구매가격",
    aggfunc="count",
    fill_value=0,
).add_prefix("pivot_지점코드_")

test_tmp

지점코드,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test_0,0,0,7,0
test_1,1,0,0,3
test_10,28,0,0,0
test_100,103,3,0,5
test_1000,0,0,0,3
...,...,...,...,...
test_9995,16,0,0,0
test_9996,0,0,44,0
test_9997,0,0,0,21
test_9998,0,0,62,0


In [57]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="중분류",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("pivot_중분류_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR 지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_중분류_행사슈즈,pivot_중분류_행사핸드백,pivot_중분류_향수,pivot_중분류_헤어ACC,pivot_중분류_헤어악세사리,pivot_중분류_헤어액세사리,pivot_중분류_홈데코,pivot_중분류_화장잡화,pivot_중분류_화장품,pivot_중분류_훼미닌부틱
0,train_0,8,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,train_1,31,2,0,9,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,train_10,0,118,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,train_100,5,9,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,train_1000,0,2,0,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14936,train_9996,1,22,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14937,train_9997,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14938,train_9998,24,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="중분류",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("pivot_중분류_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼,pivot_중분류_GBR 지원,pivot_중분류_L_B침구,pivot_중분류_NB제화,pivot_중분류_NB핸드백,...,pivot_중분류_행사슈즈,pivot_중분류_행사핸드백,pivot_중분류_향수,pivot_중분류_헤어ACC,pivot_중분류_헤어악세사리,pivot_중분류_헤어액세사리,pivot_중분류_홈데코,pivot_중분류_화장잡화,pivot_중분류_화장품,pivot_중분류_훼미닌부틱
0,test_0,0,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,test_1,1,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test_100,103,3,0,5,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,test_1000,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
12221,test_9996,0,0,44,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
12222,test_9997,0,0,0,21,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
12223,test_9998,0,0,62,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="중분류",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("pivot_중분류_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_중분류_행사슈즈_y,pivot_중분류_행사핸드백_y,pivot_중분류_향수_y,pivot_중분류_헤어ACC_y,pivot_중분류_헤어악세사리_y,pivot_중분류_헤어액세사리_y,pivot_중분류_홈데코_y,pivot_중분류_화장잡화_y,pivot_중분류_화장품_y,pivot_중분류_훼미닌부틱_y
0,train_0,8,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,train_1,31,2,0,9,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,train_10,0,118,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,train_100,5,9,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,train_1000,0,2,0,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14936,train_9996,1,22,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14937,train_9997,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14938,train_9998,24,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="중분류",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("pivot_중분류_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_중분류_행사슈즈_y,pivot_중분류_행사핸드백_y,pivot_중분류_향수_y,pivot_중분류_헤어ACC_y,pivot_중분류_헤어악세사리_y,pivot_중분류_헤어액세사리_y,pivot_중분류_홈데코_y,pivot_중분류_화장잡화_y,pivot_중분류_화장품_y,pivot_중분류_훼미닌부틱_y
0,test_0,0,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,test_1,1,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,test_100,103,3,0,5,0,0,0,0,1,...,0,0,22000,0,0,0,0,0,0,0
4,test_1000,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,145700,0
12221,test_9996,0,0,44,0,0,0,0,0,0,...,0,0,0,0,46000,0,0,0,0,0
12222,test_9997,0,0,0,21,0,0,0,0,0,...,0,0,0,0,0,30000,0,0,0,0
12223,test_9998,0,0,62,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="대분류",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("pivot_대분류_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_대분류_여성캐주얼,pivot_대분류_여성캐쥬얼,pivot_대분류_영라이브,pivot_대분류_영어덜트캐쥬얼,pivot_대분류_영캐릭터,pivot_대분류_영플라자,pivot_대분류_잡화,pivot_대분류_잡화파트,pivot_대분류_케주얼_구두_아동,pivot_대분류_패션잡화
0,train_0,8,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,3
1,train_1,31,2,0,9,0,0,0,0,0,...,6,0,0,0,4,11,0,0,1,8
2,train_10,0,118,10,0,0,0,0,0,0,...,0,0,0,0,0,0,2,19,5,0
3,train_100,5,9,0,6,0,0,0,0,0,...,0,0,0,0,0,1,0,3,4,1
4,train_1000,0,2,0,13,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14936,train_9996,1,22,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,2,12,0
14937,train_9997,15,0,0,0,0,0,0,0,0,...,2,0,0,0,2,7,0,0,0,1
14938,train_9998,24,0,0,5,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,9


In [62]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="대분류",
        values="구매가격",
        aggfunc="count",
        fill_value=0,
    )
    .add_prefix("pivot_대분류_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_대분류_여성캐주얼,pivot_대분류_여성캐쥬얼,pivot_대분류_영라이브,pivot_대분류_영어덜트캐쥬얼,pivot_대분류_영캐릭터,pivot_대분류_영플라자,pivot_대분류_잡화,pivot_대분류_잡화파트,pivot_대분류_케주얼_구두_아동,pivot_대분류_패션잡화
0,test_0,0,0,7,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,test_1,1,0,0,3,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,0,0,0,0,0,...,0,0,0,0,4,3,0,0,0,11
3,test_100,103,3,0,5,0,0,0,0,1,...,5,0,0,0,1,1,0,2,0,16
4,test_1000,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,0,0,0,0,0,...,2,0,0,0,3,3,0,0,0,2
12221,test_9996,0,0,44,0,0,0,0,0,0,...,0,2,4,4,0,0,4,0,0,0
12222,test_9997,0,0,0,21,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
12223,test_9998,0,0,62,0,0,0,0,0,0,...,0,6,10,1,0,0,8,0,0,0


In [63]:
set(train_tmp.columns) - set(test_tmp.columns)

{'pivot_중분류_특판_x', 'pivot_중분류_특판_y'}

In [64]:
cols = ['pivot_중분류_특판_x', 'pivot_중분류_특판_y']

train_tmp = train_tmp.drop(columns=cols)

set(train_tmp.columns) - set(test_tmp.columns)

set()

In [65]:
train_tmp = train_tmp.merge(
    pd.pivot_table(
        train_tr_clean,
        index="ID",
        columns="대분류",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("pivot_대분류_")
    .reset_index(),
    on="ID",
    how="left",
)

train_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_대분류_여성캐주얼_y,pivot_대분류_여성캐쥬얼_y,pivot_대분류_영라이브_y,pivot_대분류_영어덜트캐쥬얼_y,pivot_대분류_영캐릭터_y,pivot_대분류_영플라자_y,pivot_대분류_잡화_y,pivot_대분류_잡화파트_y,pivot_대분류_케주얼_구두_아동_y,pivot_대분류_패션잡화_y
0,train_0,8,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,320000,35000,377500
1,train_1,31,2,0,9,0,0,0,0,0,...,1360500,0,0,0,689000,1376700,0,0,39000,710000
2,train_10,0,118,10,0,0,0,0,0,0,...,0,0,0,0,0,0,166000,4154600,371300,0
3,train_100,5,9,0,6,0,0,0,0,0,...,0,0,0,0,0,45000,0,163000,302800,59000
4,train_1000,0,2,0,13,0,0,0,0,0,...,83300,0,0,0,0,0,0,130000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14935,train_9995,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36000
14936,train_9996,1,22,0,0,0,0,0,0,0,...,0,0,0,0,0,192000,0,133000,1379800,0
14937,train_9997,15,0,0,0,0,0,0,0,0,...,16000,0,0,0,217000,361500,0,0,0,52000
14938,train_9998,24,0,0,5,0,0,0,0,0,...,24000,0,0,0,0,0,0,0,0,465000


In [66]:
test_tmp = test_tmp.merge(
    pd.pivot_table(
        test_tr_clean,
        index="ID",
        columns="대분류",
        values="구매가격",
        aggfunc="sum",
        fill_value=0,
    )
    .add_prefix("pivot_대분류_")
    .reset_index(),
    on="ID",
    how="left",
)

test_tmp

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_대분류_여성캐주얼_y,pivot_대분류_여성캐쥬얼_y,pivot_대분류_영라이브_y,pivot_대분류_영어덜트캐쥬얼_y,pivot_대분류_영캐릭터_y,pivot_대분류_영플라자_y,pivot_대분류_잡화_y,pivot_대분류_잡화파트_y,pivot_대분류_케주얼_구두_아동_y,pivot_대분류_패션잡화_y
0,test_0,0,0,7,0,0,0,0,0,0,...,0,100000,0,0,0,0,110000,0,0,0
1,test_1,1,0,0,3,0,0,0,0,0,...,62000,0,0,0,0,0,0,0,0,0
2,test_10,28,0,0,0,0,0,0,0,0,...,0,0,0,0,186500,215700,0,0,0,1074000
3,test_100,103,3,0,5,0,0,0,0,1,...,148700,0,0,0,79000,77600,0,320000,0,758000
4,test_1000,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,73000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12220,test_9995,16,0,0,0,0,0,0,0,0,...,145000,0,0,0,148000,145700,0,0,0,195000
12221,test_9996,0,0,44,0,0,0,0,0,0,...,0,229000,108000,254200,0,0,254000,0,0,0
12222,test_9997,0,0,0,21,0,0,0,0,0,...,404000,0,0,0,0,0,0,0,0,0
12223,test_9998,0,0,62,0,0,0,0,0,0,...,0,478000,582300,59000,0,0,1213800,0,0,0


In [67]:
요일별_구매금액_sum = pd.pivot_table(
    train_tr_clean,
    index='ID',
    columns=train_tr_clean['구매날짜'].dt.day_name(),
    values='구매가격',
    aggfunc='sum',
    fill_value=0
)

요일별_구매금액_sum.columns = [f"{day}_sum" for day in 요일별_구매금액_sum.columns]
train_tmp = train_tmp.merge(요일별_구매금액_sum, how='left', on='ID')
train_tmp.head()

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_대분류_잡화파트_y,pivot_대분류_케주얼_구두_아동_y,pivot_대분류_패션잡화_y,Friday_sum,Monday_sum,Saturday_sum,Sunday_sum,Thursday_sum,Tuesday_sum,Wednesday_sum
0,train_0,8,6,0,0,0,0,0,0,0,...,320000,35000,377500,43500,320000,1430000,340000,1507400,0,560300
1,train_1,31,2,0,9,0,0,0,0,0,...,0,39000,710000,643400,316000,0,30000,1822450,1193000,1038175
2,train_10,0,118,10,0,0,0,0,0,0,...,4154600,371300,0,10912013,0,124688,0,0,129640,68529
3,train_100,5,9,0,6,0,0,0,0,0,...,163000,302800,59000,669520,73000,983800,45567,0,45000,59000
4,train_1000,0,2,0,13,0,0,0,0,0,...,130000,0,0,0,29000,29000,292300,110000,62000,235200


In [68]:
요일별_구매금액_sum = pd.pivot_table(
    test_tr_clean,
    index='ID',
    columns=test_tr_clean['구매날짜'].dt.day_name(),
    values='구매가격',
    aggfunc='sum',
    fill_value=0
)

요일별_구매금액_sum.columns = [f"{day}_sum" for day in 요일별_구매금액_sum.columns]
test_tmp = test_tmp.merge(요일별_구매금액_sum, how='left', on='ID')
test_tmp.head()

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_대분류_잡화파트_y,pivot_대분류_케주얼_구두_아동_y,pivot_대분류_패션잡화_y,Friday_sum,Monday_sum,Saturday_sum,Sunday_sum,Thursday_sum,Tuesday_sum,Wednesday_sum
0,test_0,0,0,7,0,0,0,0,0,0,...,0,0,0,0,100000,136643,94400,0,98536,0
1,test_1,1,0,0,3,0,0,0,0,0,...,0,0,0,0,155600,62000,0,0,0,73000
2,test_10,28,0,0,0,0,0,0,0,0,...,0,0,1074000,493300,62588,120000,153000,146000,1157900,133000
3,test_100,103,3,0,5,0,0,0,0,1,...,320000,0,758000,718347,322549,160081,572907,531251,145493,375763
4,test_1000,0,0,0,3,0,0,0,0,0,...,0,0,73000,0,0,0,73000,0,0,154000


In [69]:
train_tr_clean['평일_주말'] = train_tr_clean['구매날짜'].dt.dayofweek  
train_tr_clean['평일_주말'] = train_tr_clean['평일_주말'].apply(lambda x: '주말' if x >= 5 else '평일')


평일_주말_구매금액_sum = pd.pivot_table(
    train_tr_clean,
    index='ID',
    columns='평일_주말',
    values='구매가격',
    aggfunc='sum',
    fill_value=0
)

평일_주말_구매금액_sum.columns = ['주말_구매금액', '평일_구매금액']

train_tmp = train_tmp.merge(평일_주말_구매금액_sum, how='left', on='ID')
train_tmp.head()

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_대분류_패션잡화_y,Friday_sum,Monday_sum,Saturday_sum,Sunday_sum,Thursday_sum,Tuesday_sum,Wednesday_sum,주말_구매금액,평일_구매금액
0,train_0,8,6,0,0,0,0,0,0,0,...,377500,43500,320000,1430000,340000,1507400,0,560300,1770000,2431200
1,train_1,31,2,0,9,0,0,0,0,0,...,710000,643400,316000,0,30000,1822450,1193000,1038175,30000,5013025
2,train_10,0,118,10,0,0,0,0,0,0,...,0,10912013,0,124688,0,0,129640,68529,124688,11110182
3,train_100,5,9,0,6,0,0,0,0,0,...,59000,669520,73000,983800,45567,0,45000,59000,1029367,846520
4,train_1000,0,2,0,13,0,0,0,0,0,...,0,0,29000,29000,292300,110000,62000,235200,321300,436200


In [70]:
test_tr_clean['평일_주말'] = test_tr_clean['구매날짜'].dt.dayofweek  # 0=월, 1=화, ..., 5=토, 6=일
test_tr_clean['평일_주말'] = test_tr_clean['평일_주말'].apply(lambda x: '주말' if x >= 5 else '평일')


평일_주말_구매금액_sum = pd.pivot_table(
    test_tr_clean,
    index='ID',
    columns='평일_주말',
    values='구매가격',
    aggfunc='sum',
    fill_value=0
)

평일_주말_구매금액_sum.columns = ['주말_구매금액', '평일_구매금액']

test_tmp = test_tmp.merge(평일_주말_구매금액_sum, how='left', on='ID')
test_tmp.head()

Unnamed: 0,ID,pivot_지점코드_A112000,pivot_지점코드_A144000,pivot_지점코드_A202000,pivot_지점코드_A373000,pivot_중분류_DC캐주얼_x,pivot_중분류_GBR 지원_x,pivot_중분류_L_B침구_x,pivot_중분류_NB제화_x,pivot_중분류_NB핸드백_x,...,pivot_대분류_패션잡화_y,Friday_sum,Monday_sum,Saturday_sum,Sunday_sum,Thursday_sum,Tuesday_sum,Wednesday_sum,주말_구매금액,평일_구매금액
0,test_0,0,0,7,0,0,0,0,0,0,...,0,0,100000,136643,94400,0,98536,0,231043,198536
1,test_1,1,0,0,3,0,0,0,0,0,...,0,0,155600,62000,0,0,0,73000,62000,228600
2,test_10,28,0,0,0,0,0,0,0,0,...,1074000,493300,62588,120000,153000,146000,1157900,133000,273000,1992788
3,test_100,103,3,0,5,0,0,0,0,1,...,758000,718347,322549,160081,572907,531251,145493,375763,732988,2093403
4,test_1000,0,0,0,3,0,0,0,0,0,...,73000,0,0,0,73000,0,0,154000,73000,154000


In [71]:
train_ft = train_ft.merge(train_tmp, on="ID", how="left")
test_ft = test_ft.merge(test_tmp, how="left", on="ID")

In [72]:
weekdays_count = ["월요일_구매건수", "화요일_구매건수", "수요일_구매건수", "목요일_구매건수", "금요일_구매건수"]
weekend_count = ["토요일_구매건수", "일요일_구매건수"]
weekdays_per = ["월요일_구매비율", "화요일_구매비율", "수요일_구매비율", "목요일_구매비율", "금요일_구매비율"]
weekend_count = ["토요일_구매비율", "일요일_구매비율"]

agg_list = [    
    ("월요일_구매건수", lambda x: np.sum(x.dt.weekday == 0)),
    ("화요일_구매건수", lambda x: np.sum(x.dt.weekday == 1)),
    ("수요일_구매건수", lambda x: np.sum(x.dt.weekday == 2)),
    ("목요일_구매건수", lambda x: np.sum(x.dt.weekday == 3)),
    ("금요일_구매건수", lambda x: np.sum(x.dt.weekday == 4)),
    ("토요일_구매건수", lambda x: np.sum(x.dt.weekday == 5)),
    ("일요일_구매건수", lambda x: np.sum(x.dt.weekday == 6)),
]

tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,Wednesday_sum,주말_구매금액,평일_구매금액,월요일_구매건수,화요일_구매건수,수요일_구매건수,목요일_구매건수,금요일_구매건수,토요일_구매건수,일요일_구매건수
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,560300,1770000,2431200,1,0,5,4,1,2,1
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,1038175,30000,5013025,2,11,9,14,5,0,1
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,1700356,4889800,17595586,26,19,13,15,8,10,11
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,3091720,4260900,31578862,14,21,32,41,44,12,21
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,490200,1122404,2866462,3,2,3,3,12,4,9


In [73]:
tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,Wednesday_sum,주말_구매금액,평일_구매금액,월요일_구매건수,화요일_구매건수,수요일_구매건수,목요일_구매건수,금요일_구매건수,토요일_구매건수,일요일_구매건수
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,0,231043,198536,1,2,0,0,0,2,2
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,73000,62000,228600,2,0,1,0,0,1,0
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,0,0,607000,3,0,0,2,0,0,0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,148000,10000,2295740,2,2,1,10,8,0,1
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,0,980890,587800,1,3,0,0,4,1,6


In [74]:
a = [0,1,2,3,4]
b = [5,6]

agg_list = [    
    ("평일_구매건수", lambda x: np.sum(x.dt.weekday.isin(a))),
    ("주말_구매건수", lambda x: np.sum(x.dt.weekday.isin(b)))
]

tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,평일_구매금액,월요일_구매건수,화요일_구매건수,수요일_구매건수,목요일_구매건수,금요일_구매건수,토요일_구매건수,일요일_구매건수,평일_구매건수,주말_구매건수
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,2431200,1,0,5,4,1,2,1,11,3
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,5013025,2,11,9,14,5,0,1,41,1
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,17595586,26,19,13,15,8,10,11,81,21
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,31578862,14,21,32,41,44,12,21,152,33
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,2866462,3,2,3,3,12,4,9,23,13


In [75]:
tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,평일_구매금액,월요일_구매건수,화요일_구매건수,수요일_구매건수,목요일_구매건수,금요일_구매건수,토요일_구매건수,일요일_구매건수,평일_구매건수,주말_구매건수
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,198536,1,2,0,0,0,2,2,3,4
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,228600,2,0,1,0,0,1,0,3,1
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,607000,3,0,0,2,0,0,0,5,0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,2295740,2,2,1,10,8,0,1,23,1
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,587800,1,3,0,0,4,1,6,8,7


In [76]:
a = [0,1,2,3,4]
b = [5,6]

agg_list = [    
    ("평일_구매비율", lambda x: np.mean(x.dt.weekday.isin(a))),
    ("주말_구매비율", lambda x: np.mean(x.dt.weekday.isin(b)))
]

tmp = train_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how="left", on="ID")
train_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,화요일_구매건수,수요일_구매건수,목요일_구매건수,금요일_구매건수,토요일_구매건수,일요일_구매건수,평일_구매건수,주말_구매건수,평일_구매비율,주말_구매비율
0,train_0,9,2004-05-07,2005-01-16,255,28,2,1.555556,22.744789,104,...,0,5,4,1,2,1,11,3,0.785714,0.214286
1,train_1,21,2004-05-11,2005-04-27,352,16,1,2.0,11.739134,3,...,11,9,14,5,0,1,41,1,0.97619,0.02381
2,train_2,55,2004-05-06,2005-04-26,356,6,11,1.854545,5.019987,4,...,19,13,15,8,10,11,81,21,0.794118,0.205882
3,train_3,90,2004-05-11,2005-04-20,345,3,19,2.055556,3.137146,10,...,21,32,41,44,12,21,152,33,0.821622,0.178378
4,train_4,24,2004-05-09,2005-03-17,313,13,6,1.5,11.962731,44,...,2,3,3,12,4,9,23,13,0.638889,0.361111


In [77]:
tmp = test_tr_clean.groupby("ID")["구매날짜"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how="left", on="ID")
test_ft.head()

Unnamed: 0,ID,총방문일수,첫구매날짜,마지막구매날짜,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,...,화요일_구매건수,수요일_구매건수,목요일_구매건수,금요일_구매건수,토요일_구매건수,일요일_구매건수,평일_구매건수,주말_구매건수,평일_구매비율,주말_구매비율
0,test_0,5,2004-05-16,2004-10-26,164,32,3,1.4,30.416096,186,...,2,0,0,0,2,2,3,4,0.428571,0.571429
1,test_1,3,2004-09-11,2005-02-23,166,55,1,1.333333,57.879185,66,...,0,1,0,0,1,0,3,1,0.75,0.25
2,test_2,2,2004-08-16,2004-09-02,18,9,0,2.5,7.361216,240,...,0,0,2,0,0,0,5,0,1.0,0.0
3,test_3,14,2004-07-22,2005-04-19,272,19,1,1.714286,17.715031,11,...,2,1,10,8,0,1,23,1,0.958333,0.041667
4,test_4,7,2004-05-25,2005-02-13,265,37,3,2.142857,36.517818,76,...,3,0,0,4,1,6,8,7,0.533333,0.466667


In [78]:
#추가

train_ft["주말_방문_선호도"] = train_ft["주말_구매비율"] / (train_ft["평일_구매비율"] + 1)
train_ft["방문일수_대비_구매건수"] = train_ft["구매건수"] / train_ft["총방문일수"]
train_ft["전체_성수기_구매비율"] = train_ft[["추석성수기_구매비율", "연말연시_구매비율", "설날성수기_구매비율"]].sum(axis=1)
train_ft["전체_준성수기_구매비율"] = train_ft[["여름시즌_구매비율", "신학기_구매비율"]].sum(axis=1)
train_ft["전체_중간기_구매비율"] = train_ft[["여름끝추석전_구매비율", "연말연시끝설날전_구매비율", "신학기후_구매비율"]].sum(axis=1)
train_ft["전체_비수기_구매비율"] = train_ft[["초여름_구매비율", "추석후연말전_구매비율", "봄철비수기_구매비율"]].sum(axis=1)
train_ft["성수기_구매집중도"] = train_ft["전체_성수기_구매비율"] / (train_ft["전체_비수기_구매비율"] + 1)
train_ft["계절_구매변동성"] = train_ft[["봄_구매비율", "여름_구매비율", "가을_구매비율", "겨울_구매비율"]].std(axis=1)

test_ft["주말_방문_선호도"] = test_ft["주말_구매비율"] / (test_ft["평일_구매비율"] + 1)
test_ft["방문일수_대비_구매건수"] = test_ft["구매건수"] / test_ft["총방문일수"]
test_ft["전체_성수기_구매비율"] = test_ft[["추석성수기_구매비율", "연말연시_구매비율", "설날성수기_구매비율"]].sum(axis=1)
test_ft["전체_준성수기_구매비율"] = test_ft[["여름시즌_구매비율", "신학기_구매비율"]].sum(axis=1)
test_ft["전체_중간기_구매비율"] = test_ft[["여름끝추석전_구매비율", "연말연시끝설날전_구매비율", "신학기후_구매비율"]].sum(axis=1)
test_ft["전체_비수기_구매비율"] = test_ft[["초여름_구매비율", "추석후연말전_구매비율", "봄철비수기_구매비율"]].sum(axis=1)
test_ft["성수기_구매집중도"] = test_ft["전체_성수기_구매비율"] / (test_ft["전체_비수기_구매비율"] + 1)
test_ft["계절_구매변동성"] = test_ft[["봄_구매비율", "여름_구매비율", "가을_구매비율", "겨울_구매비율"]].std(axis=1)


In [79]:
cols = [col for col in train_ft.columns if col.startswith("pivot_중분류_")]

train_ft["중분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["중분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["중분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["중분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["중분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["중분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

In [80]:
cols = [col for col in train_ft.columns if col.startswith("pivot_지점코드_")]

train_ft["지점코드별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["지점코드별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["지점코드별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["지점코드별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["지점코드별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["지점코드별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

In [81]:
cols = [col for col in train_ft.columns if col.startswith("pivot_대분류_")]

train_ft["대분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["대분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["대분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["대분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["대분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["대분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

In [82]:
cols = [col for col in train_ft.columns if col.endswith("요일_구매비율")]

train_ft["요일별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["요일별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["요일별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["요일별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["요일별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["요일별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

In [83]:
cols = [col for col in train_ft.columns if col.endswith("월_구매비율")]

train_ft["월별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["월별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["월별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["월별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["월별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["월별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 841), (12225, 841))

In [84]:
cols = ["봄_구매비율", "여름_구매비율", "가을_구매비율", "겨울_구매비율"]

train_ft["계절별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["계절별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["계절별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["계절별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["계절별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["계절별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 844), (12225, 844))

In [85]:
cols = [ col for col in train_ft.columns if col.endswith("분기_구매비율") ]

train_ft["분기별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["분기별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["분기별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["분기별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["분기별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["분기별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 847), (12225, 847))

## 인코딩

In [86]:
train_ft.to_csv(f"{DATA_PATH}train_tmp_1113.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp_1113.csv",index=False)

In [87]:
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [88]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

ID         14940
첫구매날짜        337
마지막구매날짜      334
주구매지점          4
주구매_중분류      232
주구매_대분류       28
고객등급           5
dtype: int64

In [89]:
cols = ["ID", "첫구매날짜", "마지막구매날짜", "주구매지점", "주구매_중분류", "주구매_대분류"]

train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)

- 고객등급은 onehotencoding

In [90]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft["고객등급"])
train_ft = pd.concat([train_ft, tmp], axis=1)
train_ft.head()

Unnamed: 0,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,평균구매요일,주구매요일,월요일_구매비율,...,계절별_구매횟수_skew,계절별_구매횟수_kurt,분기별_구매횟수_std,분기별_구매횟수_skew,분기별_구매횟수_kurt,고객등급_1,고객등급_2,고객등급_3,고객등급_4,고객등급_5
0,9,255,28,2,1.555556,22.744789,104,3.0,2,0.071429,...,-1.539601,2.888889,0.14869,0.0,0.390533,1,0,0,0,0
1,21,352,16,1,2.0,11.739134,3,2.309524,3,0.047619,...,-0.123691,-5.290173,0.090141,1.658524,2.615468,1,0,0,0,0
2,55,356,6,11,1.854545,5.019987,4,2.333333,0,0.254902,...,1.295669,2.179478,0.127702,0.238913,-4.582219,0,1,0,0,0
3,90,345,3,19,2.055556,3.137146,10,3.081081,4,0.075676,...,1.902132,3.642104,0.073505,0.281261,-0.460929,0,1,0,0,0
4,24,313,13,6,1.5,11.962731,44,3.861111,4,0.083333,...,0.729678,-1.947285,0.188398,1.794946,3.390359,1,0,0,0,0


In [91]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(test_ft["고객등급"])
test_ft = pd.concat([test_ft, tmp], axis=1)
test_ft.head()

Unnamed: 0,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,평균구매요일,주구매요일,월요일_구매비율,...,계절별_구매횟수_skew,계절별_구매횟수_kurt,분기별_구매횟수_std,분기별_구매횟수_skew,분기별_구매횟수_kurt,고객등급_1,고객등급_2,고객등급_3,고객등급_4,고객등급_5
0,5,164,32,3,1.4,30.416096,186,3.428571,1,0.142857,...,-1.129338,2.227147,0.214286,-0.37037,-3.901235,1,0,0,0,0
1,3,166,55,1,1.333333,57.879185,66,1.75,0,0.5,...,1.414214,1.5,0.204124,0.0,1.5,1,0,0,0,0
2,2,18,9,0,2.5,7.361216,240,1.2,0,0.6,...,0.37037,-3.901235,0.5,2.0,4.0,1,0,0,0,0
3,14,272,19,1,1.714286,17.715031,11,3.0,3,0.083333,...,0.155055,-5.11157,0.247674,1.028435,-0.208793,1,0,0,0,0
4,7,265,37,3,2.142857,36.517818,76,4.0,6,0.066667,...,-1.129338,2.227147,0.083887,-1.129338,2.227147,1,0,0,0,0


- 주구매시간대, 평균구매시간, 주구매요일, 평균구매요일은 onehotencoding


In [92]:
train_ft["평균구매요일"] = train_ft["평균구매요일"].astype(int)
test_ft["평균구매요일"] = test_ft["평균구매요일"].astype(int)

train_ft["평균구매시간"] = train_ft["평균구매시간"].astype(int)
test_ft["평균구매시간"] = test_ft["평균구매시간"].astype(int)

In [93]:
cols = ["주구매시간대", "평균구매시간", "주구매요일", "평균구매요일"]

enc = OneHotEncoder(handle_unknown="ignore")
tmp = pd.concat([train_ft[cols], test_ft[cols]])
enc.fit(tmp)

train_ft[enc.get_feature_names_out()] = enc.transform(train_ft[cols]).toarray()
test_ft[enc.get_feature_names_out()] = enc.transform(test_ft[cols]).toarray()

train_ft.shape, test_ft.shape

((14940, 883), (12225, 883))

In [94]:
cols = ["고객등급", "주구매시간대", "평균구매시간", "주구매요일", "평균구매요일"]

train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)

In [95]:
train_ft.to_csv(f"{DATA_PATH}train_encoding_1113.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_encoding_1113.csv",index=False)

## 스케일링

In [96]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, RobustScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=300, random_state=42)

In [98]:
scaler_cls_list = [StandardScaler, MinMaxScaler,RobustScaler, Normalizer]
def check_scaling_score(scaler_cls_list, x, y, model, cv, scoring):
    for scaler_cls in scaler_cls_list:
        scaler = scaler_cls()
        x_train = scaler.fit_transform(x)
        scores = cross_val_score(model, x_train, y, scoring=scoring, cv=cv, n_jobs=-1)
        print(scaler_cls.__name__ , scores.mean())

# MinMaxScaler 점수가 가장 높으므로 사용

In [100]:
scaler = MinMaxScaler()

train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)

train_ft.head()

Unnamed: 0,총방문일수,백화점이용기간,구매주기,주말방문일수,일별평균구매건수,구매간격_표준편차,마지막구매후_경과일,월요일_구매비율,화요일_구매비율,수요일_구매비율,...,주구매요일_4,주구매요일_5,주구매요일_6,평균구매요일_0,평균구매요일_1,평균구매요일_2,평균구매요일_3,평균구매요일_4,평균구매요일_5,평균구매요일_6
0,0.035398,0.699725,0.153409,0.037037,0.079365,0.135386,0.283747,0.071429,0.0,0.357143,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.088496,0.966942,0.085227,0.018519,0.142857,0.069876,0.00551,0.047619,0.261905,0.214286,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.238938,0.977961,0.028409,0.203704,0.122078,0.029881,0.008264,0.254902,0.186275,0.127451,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.393805,0.947658,0.011364,0.351852,0.150794,0.018673,0.024793,0.075676,0.113514,0.172973,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.10177,0.859504,0.068182,0.111111,0.071429,0.071207,0.118457,0.083333,0.055556,0.083333,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [101]:
train_ft.to_csv(f"{DATA_PATH}train_scaling_1113.csv")
test_ft.to_csv(f"{DATA_PATH}test_scaling_1113.csv")