In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
import datetime as dt

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()
''
import warnings

warnings.filterwarnings('ignore')

  from pandas.core import datetools


In [2]:
# 테이블 현황 파악을 위한 함수 
def summary_table(table):
    df = pd.DataFrame()
    for i in table.columns:
        name = i
        dtype = table[i].dtype.name
        null = table[i].isnull().sum()
        act = table.shape[0] - null
        unique = len(table[i].unique())
        data = {'name': name, 'dtype': dtype, 'null': null, 'act': act, 'unique': unique}
        df = df.append(data, ignore_index=True)
    return df

### 데이터 불러오기: 일본어는 영어로 번역, 지역에 Prefecture가 붙은 지명은 Prefecture 제외

In [3]:
# train 
detail_train = pd.read_csv('coupon_data_project2/coupon_detail_train_translated_en.csv', parse_dates=['I_DATE'])
visit_train = pd.read_csv('coupon_data_project2/coupon_visit_train.csv', parse_dates=['I_DATE'])

area_train = pd.read_csv('coupon_data_project2/coupon_area_train_translated_en.csv')
coupon_list_train = pd.read_csv('coupon_data_project2/coupon_list_train_translated_en.csv', parse_dates=['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND'])

# base data
location = pd.read_csv('coupon_data_project2/train_location.csv')
user_list = pd.read_csv('coupon_data_project2/user_list_translated_en.csv', parse_dates=['WITHDRAW_DATE', 'REG_DATE'])

# test data
area_test = pd.read_csv('coupon_data_project2/test_location.csv')
coupon_list_test = pd.read_csv('coupon_data_project2/coupon_list_test_translated_en.csv', parse_dates=['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND'])

# submisiion
submission = pd.read_csv('coupon_data_project2/sample_submission.csv')

--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------
## A. Preprocessing
--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------


### A-1. detail_train
--------------------------------------------------------------------------------------------------------

#### 1) 신규 columns 생성

1-1) merge 후 구매 구분을 위한 PURCHASE_FLG

In [4]:
detail_train['PURCHASE_FLG'] = 1

#### 2) column명 변경

2-1) I_DATE -> purchase_date: 구매 일자 구분

2-2) SMALL_AREA_NAME: coupon list의 지역(판매 spot)과 구분하기 위함

In [5]:
detail_train.rename(columns = {'I_DATE': 'purchase_date'}, inplace=True)
detail_train.rename(columns = {'SMALL_AREA_NAME': 'resid_small'}, inplace=True)

#### 3) drop: ITEM_COUNT는 활요여부 판단후 처리

In [6]:
detail_train.drop(labels = ['ITEM_COUNT'], axis=1, inplace=True)
detail_train.drop(labels = ['PURCHASEID_hash'], axis=1, inplace=True)
detail_train.drop(labels = ['resid_small'], axis=1, inplace=True)
detail_train.drop(labels = ['purchase_date'], axis=1, inplace=True)

### A-2. visit_train
--------------------------------------------------------------------------------------------------------

#### 1) 신규 column 생성

In [7]:
visit_train['VIEW'] = 1

#### 2) column명 변경

In [8]:
visit_train.rename(columns = {'I_DATE': 'VIEW_DATE'}, inplace=True)
visit_train.rename(columns = {'VIEW_COUPON_ID_hash': 'COUPON_ID_hash'}, inplace=True)

#### 3) drop

In [9]:
visit_train.drop(labels = ['PAGE_SERIAL'], axis=1, inplace=True)
visit_train.drop(labels = ['REFERRER_hash'], axis=1, inplace=True)
visit_train.drop(labels = ['SESSION_ID_hash'], axis=1, inplace=True)
visit_train.drop(labels = ['PURCHASEID_hash'], axis=1, inplace=True)

In [10]:
visit_train[:2]

Unnamed: 0,PURCHASE_FLG,VIEW_DATE,COUPON_ID_hash,USER_ID_hash,VIEW
0,0,2012-03-28 14:15:00,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,1
1,0,2012-03-28 14:17:28,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,1


### A-3. Coupon_list
--------------------------------------------------------------------------------------------------------

#### 1) 전처리 일관성을 유지를 위한 coupon_list merge(311~ train임)

In [11]:
coupon_list = pd.merge(coupon_list_test, coupon_list_train, how='outer')

In [12]:
coupon_list_test.shape, coupon_list_train.shape, coupon_list.shape

((310, 24), (19413, 24), (19723, 24))

#### 2) 신규 columns 생성

In [13]:
# 캡슐과 장르 통합 및 명칭 변경
coupon_list['Case'] = coupon_list['CAPSULE_TEXT'] + coupon_list['GENRE_NAME']
coupon_list['Case'] = coupon_list['Case'].apply(lambda x: "HOTEL" if x == 'Guest houseHotel and Japanese hotel' 
                          or x == 'HotelHotel and Japanese hotel'
                          or x == 'Japanese hotelHotel and Japanese hotel'
                          or x == 'Japanse guest houseHotel and Japanese hotel'
                          or x == 'LodgeHotel and Japanese hotel'
                          or x == 'Public hotelHotel and Japanese hotel'
                          or x == 'Resort innHotel and Japanese hotel'
                          or x == 'Vacation rentalHotel and Japanese hotel'
                          else "NAIL" if x == 'Nail and eye salonNail and eye salon'
                          else "HAIR" if x == 'Hair salonHair salon'
                          else "FOOD" if x == 'FoodFood'
                          else "SPA" if x == 'SpaSpa'
                          else "BEAUTY" if x == 'BeautyBeauty'
                          else "CLASS" if x == 'ClassLesson'
                          else "CORRESPONDENCE" if x == 'Correspondence courseLessonClassLesson'
                          else "DELIVERY" if x == 'Delivery serviceDelivery service'
                          else "EVENT" if x == 'EventOther coupon'
                          else "GIFT" if x == 'Gift cardGift card'
                          else "HEALTH" if x == 'Health and medicalHealth and medical'
                          else "LEISURE" if x == 'LeisureLeisure'
                          else "LESSON" if x == 'LessonLesson'
                          else "OTHER" if x == 'OtherOther coupon'
                          else "RELAXATION" if x == 'RelaxationRelaxation'
                          else "WEB" if x == 'Web serviceOther coupon'
                          else 'OTHER'
                          )

In [14]:
# 실판매가 게산
coupon_list['Price'] = coupon_list['CATALOG_PRICE'] + coupon_list['DISCOUNT_PRICE']

In [15]:
# 실판매가 정규화
coupon_list["lnDPRICE"] = np.log1p(coupon_list["Price"])
coupon_list["mDPRICE"] = coupon_list.groupby("Case")["lnDPRICE"].transform(np.mean)
coupon_list["sDPRICE"] = coupon_list.groupby("Case")["lnDPRICE"].transform(np.std)
coupon_list["zprice"] = (coupon_list["lnDPRICE"] - coupon_list["mDPRICE"]) / coupon_list["sDPRICE"]

#### 3) column명 변경

In [16]:
# 지역명 변경
coupon_list.rename(columns = {"LARGE_AREA_NAME": "spot_large", 
                              "ken_name": "spot_pref", 
                              "SMALL_AREA_NAME": "spot_small"}, inplace=True)

#### 4) Null 값 및 오류 처리

In [17]:
# usable: nan -> 1, 2 -> 0
coupon_list['USABLE_DATE_MON'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_TUE'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_WED'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_THU'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_FRI'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_SAT'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_SUN'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_HOLIDAY'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_BEFORE_HOLIDAY'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)

#### 5) drop

In [18]:
coupon_list.drop(labels = ['CAPSULE_TEXT'], axis=1, inplace=True )
coupon_list.drop(labels = ['GENRE_NAME'], axis=1, inplace=True )
coupon_list.drop(labels = ['CATALOG_PRICE'], axis=1, inplace=True )
coupon_list.drop(labels = ['DISCOUNT_PRICE'], axis=1, inplace=True )
coupon_list.drop(labels = ['DISPFROM'], axis=1, inplace=True )
coupon_list.drop(labels = ['DISPEND'], axis=1, inplace=True )
coupon_list.drop(labels = ['VALIDFROM'], axis=1, inplace=True )
coupon_list.drop(labels = ['VALIDEND'], axis=1, inplace=True )
coupon_list.drop(labels = ['lnDPRICE'], axis=1, inplace=True )
coupon_list.drop(labels = ['mDPRICE'], axis=1, inplace=True )
# coupon_list.drop(labels = ['Price'], axis=1, inplace=True )
coupon_list.drop(labels = ['spot_pref'], axis=1, inplace=True )  # 판단이슈 
coupon_list.drop(labels = ['spot_small'], axis=1, inplace=True ) # 판단이슈 
coupon_list.drop(labels = ['spot_large'], axis=1, inplace=True ) # 판단이슈 

#### 6) train & test set 분리

In [19]:
# train set과 test set을 다시 분리
coupon_list_train = coupon_list[311:]
coupon_list_test = coupon_list[:310]

### A4. User_list
--------------------------------------------------------------------------------------------------------

#### 1) column명 변경

In [20]:
user_list.rename(columns = {'PREF_NAME': 'user_pref'}, inplace=True)

#### 2) SEX_ID 0,1 로 변경(f: 0, m: 1)

In [21]:
# SEX_ID: f는 0으로 M은 1로
user_list['SEX_ID'] = user_list['SEX_ID'].apply(lambda x: 0 if x == 'f' else 1) 

#### 3) drop

In [22]:
# REG_DATE , WITHDRAW_DATE 삭제
user_list.drop(labels = ['REG_DATE'], axis=1, inplace=True)
user_list.drop(labels = ['WITHDRAW_DATE'], axis=1, inplace=True)
user_list.drop(labels = ['user_pref'], axis=1, inplace=True)  # 판단 이슈


In [23]:
user_list[:2]

Unnamed: 0,SEX_ID,AGE,USER_ID_hash
0,0,25,d9dca3cb44bab12ba313eaa681f663eb
1,0,34,560574a339f1b25e57b0221e486907ed


### A5. train set 구성
--------------------------------------------------------------------------------------------------------

#### 1) visit_train & detail_train -> train

In [24]:
train= pd.merge(visit_train, detail_train, how='outer')

In [25]:
train.shape

(2913043, 5)

#### 2) train & coupon_list

In [26]:
train = pd.merge(train, coupon_list, how='left', on='COUPON_ID_hash')

In [27]:
train.shape # 315,029 개의 sell은 기초정보에 존재 하지 않던 쿠폰리스트임

(2913043, 21)

#### 3) train & user_list

In [28]:
train = pd.merge(train, user_list, how='left', on='USER_ID_hash')

In [29]:
train.shape

(2913043, 23)

#### 4) train & location

In [30]:
train['key'] = train['COUPON_ID_hash']+ train['USER_ID_hash']

In [31]:
location['key'] = location['COUPON_ID_hash'] + location['USER_ID_hash'] 
location.drop_duplicates(['key'], inplace=True)

In [32]:
train.insert(2, 'distance', train['key'].map(location.set_index('key')['distance']))
train.insert(2, 'PREF_in', train['key'].map(location.set_index('key')['PREF_in']))

In [33]:
train.drop(labels=['key'], axis=1, inplace=True)

In [34]:
train[:2]

Unnamed: 0,PURCHASE_FLG,VIEW_DATE,PREF_in,distance,COUPON_ID_hash,USER_ID_hash,VIEW,PRICE_RATE,DISPPERIOD,VALIDPERIOD,...,USABLE_DATE_SAT,USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,Case,Price,sDPRICE,zprice,SEX_ID,AGE
0,0,2012-03-28 14:15:00,0,,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,1.0,78.0,4.0,,...,1.0,1.0,1.0,1.0,DELIVERY,8775.0,0.649353,0.427954,0,25
1,0,2012-03-28 14:17:28,0,,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,1.0,78.0,4.0,,...,1.0,1.0,1.0,1.0,DELIVERY,8775.0,0.649353,0.427954,0,25


#### 4) train 현황 점검 및 NaN값 처리

4-1) null 이 315,301개인 것들은 기초정보(coupon_list(test 포함)에 없는 것들이므로 제외 -> zprice 기준으로 처리

4-2) VALIDPERIOD(null: 773,492)은 무제한이라는 의미에서 10,000으로 처리

4-3) user_pref(null: 488,972) 을 NN 으로 처리

In [35]:
# null 이 315,301개인 것들은 기초정보(coupon_list(test 포함)에 없는 것들이므로 제외 -> zprice 기준으로 처리
train = train[train['Price'] >= 0]

In [36]:
train['VALIDPERIOD'] = train['VALIDPERIOD'].fillna(10000)
# train['user_pref'] = train['user_pref'].fillna('NN')
train['VIEW'] = train['VIEW'].fillna(0)

In [37]:
train['distance'] = train['distance'].fillna(train['distance'].mean())

In [38]:
summary_table(train).sort_values(by='unique', ascending = True)

Unnamed: 0,act,dtype,name,null,unique
0,2597810.0,int64,PURCHASE_FLG,0.0,2.0
18,2597810.0,float64,USABLE_DATE_BEFORE_HOLIDAY,0.0,2.0
17,2597810.0,float64,USABLE_DATE_HOLIDAY,0.0,2.0
16,2597810.0,float64,USABLE_DATE_SUN,0.0,2.0
15,2597810.0,float64,USABLE_DATE_SAT,0.0,2.0
14,2597810.0,float64,USABLE_DATE_FRI,0.0,2.0
13,2597810.0,float64,USABLE_DATE_THU,0.0,2.0
23,2597810.0,int64,SEX_ID,0.0,2.0
11,2597810.0,float64,USABLE_DATE_TUE,0.0,2.0
10,2597810.0,float64,USABLE_DATE_MON,0.0,2.0


--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------
## B. 모델링
--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------

### B1. 데이터 생성
--------------------------------------------------------------------------------------------------------

#### 1) dummy list 생성(train & test의 일관성을 위함)

In [39]:
ls_dummy = ['Case']
# ls_dummy = ['user_pref', 'spot_small', 'spot_pref', 'spot_large', 'Case']

#### 2) train data dummy 처리

In [40]:
train = pd.get_dummies(train, columns = ls_dummy)

#### 3) test data 생성 -> 아래 로케이션 부분은 슬랙에서설명한 부분 참조

In [41]:
# test 데이터 생성
coupon_list_test['A'] = 1
user_list['A'] = 1
test = pd.merge(coupon_list_test, user_list, how='outer')

In [42]:
test[:2]

Unnamed: 0,PRICE_RATE,DISPPERIOD,VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,...,USABLE_DATE_BEFORE_HOLIDAY,COUPON_ID_hash,Case,Price,sDPRICE,zprice,A,SEX_ID,AGE,USER_ID_hash
0,52,4,118.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,c76ea297ebd3a5a4d3bf9f75269f66fa,FOOD,8349,0.604073,0.487562,1,0,25,d9dca3cb44bab12ba313eaa681f663eb
1,52,4,118.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,c76ea297ebd3a5a4d3bf9f75269f66fa,FOOD,8349,0.604073,0.487562,1,0,34,560574a339f1b25e57b0221e486907ed


In [43]:
# area_test 삽입
test['key'] = test['COUPON_ID_hash'] + test['USER_ID_hash'] 
area_test['key'] = area_test['COUPON_ID_hash'] + area_test['USER_ID_hash'] 
test.insert(2, 'distance', test['key'].map(area_test.set_index('key')['distance']))
test.insert(2, 'PREF_in', test['key'].map(area_test.set_index('key')['PREF_in']))
test.drop(labels = ['key'], axis=1, inplace=True)

#### 4) test data dummy처리

In [45]:
test = pd.get_dummies(test, columns = ls_dummy)
test.drop(labels = ['A'], axis=1, inplace=True)

#### 5)  test & train set columns 비교 -> 지역이 문제임. 지역은 개인 판단하에 위에 drop부분에서 삭제해주시길

5-1) PURCHASE_FLG: train의 y값으로 활용될 것임

5-2) VIEW_DATE: 향후 활용 가능성이 있음. 우선은 mod_ls에서 걸러짐.

5-3) VIEW: 향후 활용 가능성 있음(가중치 넣는 식). 우선은 mod_ls에서 걸러짐

In [46]:
compare_not_test = [i for i in train.columns if i not in test.columns]
compare_not_train = [i for i in test.columns if i not in train.columns]
print('only_train: {}  \n'.format(compare_not_test))
print('only_test: {}'.format(compare_not_train))

only_train: ['PURCHASE_FLG', 'VIEW_DATE', 'VIEW', 'Case_BEAUTY', 'Case_EVENT', 'Case_LESSON']  

only_test: []


### B2. train data set
--------------------------------------------------------------------------------------------------------

In [47]:
to_be_removed_train = {'PURCHASE_FLG', 'USER_ID_hash','COUPON_ID_hash', "VIEW"}
ls_train = [i for i in list(train.columns) if i not in to_be_removed_train]
X_train = train.filter(ls_train)
y_train = train.PURCHASE_FLG

In [48]:
X_train[:2]

Unnamed: 0,VIEW_DATE,PREF_in,distance,PRICE_RATE,DISPPERIOD,VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,...,Case_HAIR,Case_HEALTH,Case_HOTEL,Case_LEISURE,Case_LESSON,Case_NAIL,Case_OTHER,Case_RELAXATION,Case_SPA,Case_WEB
0,2012-03-28 14:15:00,0,237.740409,78.0,4.0,10000.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,2012-03-28 14:17:28,0,237.740409,78.0,4.0,10000.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0


### B3. 모델링: xgboost
--------------------------------------------------------------------------------------------------------

In [None]:
# from sklearn import clone
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
import xgboost

#### 1) train 에 활용할 컬럼 선정(test 컬럼과 일치시킴)

In [None]:
mod_ls = [i for i in X_train.columns if i in test.columns]
X_train = train.filter(mod_ls)
y_train = train.PURCHASE_FLG

#### 2) parameter 지정(parameter는 우수사례 벤치마킹, 논리 및 개선여부 검토해봐야함)

In [None]:
model_xgb = xgboost.XGBClassifier(n_estimators=300, max_depth=3,
                                 objective = 'reg:logistic',
                                 subsample= 0.85,
                                 colsample_bytree=0.8,
                                 random_state=12345,
                                 min_child_weight=1,
                                 learning_rate=0.05)

In [None]:
model_xgb = model_xgb.fit(X_train, y_train)

### B5. Predict

#### 1) predict 후 sum을통해 1이 몇개인지 확인 -> 할때마다 0이 나옴 .. 아래 확률로 접근해야 함

In [None]:
# predict
y_pred_xgb = model_xgb.predict(test.filter(mod_ls))
y_pred_xgb.sum()

#### 2) proba를 생성해서 test set과 merge

In [None]:
y_hat_proba = model_xgb.predict_proba(test.filter(mod_ls))
df_y_hat_proba= pd.DataFrame(y_hat_proba, columns=['n','y'])

In [None]:
test_xgb = test.filter(['USER_ID_hash', 'COUPON_ID_hash'])

In [None]:
result_proba_df = pd.concat([test_xgb, pd.DataFrame(df_y_hat_proba)],1)

#### 3) 기준을 잡기 위해 확률의 평균을 확인

In [None]:
result_proba_df.y.mean()

#### 4) 모델 개선 및 현황 파악을 위한 feature importance 점검

In [None]:
importances = model_xgb.feature_importances_
df_imp = pd.DataFrame()
for i, j in zip(mod_ls, list(importances)):
    data = {'columns': i, 'importance': np.round(j*100,1)} 
    df_imp = df_imp.append(data, ignore_index=True)

df_imp.sort_values(by='importance', ascending=False)

#### 5) 확률을 선정 -> 3번의 기준으로 어림잡아 선정 -> 최종 제출시에는 각 유저별 상위 10개로 지정하는게 좋겠음

In [None]:
result_proba_df2 = result_proba_df[result_proba_df['y']>0.08].sort_values(by='y', ascending=False)

#### 6) 제출양식에 맞춰 lookup_table을 형성

In [None]:
lookup_table = result_proba_df2.groupby('USER_ID_hash').apply(lambda x: list(x.COUPON_ID_hash))
lookup_table = pd.DataFrame(lookup_table, columns = ['COUPON_ID_hash']).reset_index()
lookup_table

lookup_table.rename(columns={'COUPON_ID_hash':'PURCHASED_COUPONS'},inplace=True)

#### 7) 매칭 및 양식에 맞춘 마무리 작업

In [None]:
submission.insert(2, 'COUPON', submission['USER_ID_hash'].map(lookup_table.set_index('USER_ID_hash')['PURCHASED_COUPONS']))

In [None]:
submission.drop('PURCHASED_COUPONS', axis=1, inplace=True)
submission.rename(columns={'COUPON':'PURCHASED_COUPONS'}, inplace=True)

submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].astype('str')
submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].apply(lambda x: x.replace('[',''))
submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].apply(lambda x: x.replace(']',''))
submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].apply(lambda x: x.replace("'",''))
submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].apply(lambda x: x.replace(',',''))
# submission.drop(labels=['index'], axis=1, inplace=True)

### B7. 검증(그래프 같은것들??)

### B8. submission

In [None]:
submission.to_csv('test_submission_xgb.csv', index=False)