In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from xgboost import XGBRegressor
from statsmodels.tsa.seasonal import seasonal_decompose
from datetime import datetime, timedelta

import os 
from tqdm import tqdm
import warnings 
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None

# 한글 폰트 설정
from statsmodels import robust
from matplotlib import font_manager, rc
%matplotlib inline

import platform
your_os = platform.system()
if your_os == 'Linux':
    rc('font', family='NanumGothic')
elif your_os == 'Windows':
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os == 'Darwin':
    rc('font', family='AppleGothic')
rc('axes', unicode_minus=False)

In [2]:
# 데이터 불러오기
train = pd.read_csv('data/trainset.csv')
test = pd.read_csv('data/testset.csv')

# susbmssion
submission = pd.read_csv('data/sample_submission.csv')

In [3]:
train.shape, test.shape, submission.shape

((23009, 33), (5754, 32), (5754, 2))

## EDA

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23009 entries, 0 to 23008
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEQ         23009 non-null  object 
 1   접수일         23009 non-null  object 
 2   접수시각        3113 non-null   object 
 3   장기서비스여부     23009 non-null  int64  
 4   최초서비스일      23009 non-null  object 
 5   전체회차        23009 non-null  int64  
 6   현재회차        23009 non-null  int64  
 7   서비스일자       23009 non-null  object 
 8   서비스시작시간     23009 non-null  object 
 9   서비스종료시간     23009 non-null  object 
 10  기존고객여부      23009 non-null  int64  
 11  결재형태        23009 non-null  object 
 12  서비스주소       23009 non-null  object 
 13  주거형태        23009 non-null  object 
 14  평수          13856 non-null  object 
 15  고객가입일       23009 non-null  object 
 16  반려동물        20234 non-null  object 
 17  부재중여부       13856 non-null  float64
 18  우선청소        10979 non-null  object 
 19  쿠폰사용여부      23009 non-nul

In [5]:
train.columns

Index(['SEQ', '접수일', '접수시각', '장기서비스여부', '최초서비스일', '전체회차', '현재회차', '서비스일자',
       '서비스시작시간', '서비스종료시간', '기존고객여부', '결재형태', '서비스주소', '주거형태', '평수', '고객ID',
       '고객가입일', '반려동물', '부재중여부', '우선청소', '쿠폰사용여부', '매니저생년월일', '매니저ID',
       '매니저최초가입일', '매니저최초서비스일', '매니저성별', '매니저사용휴대폰', '매니저주소', '매니저이동방법',
       '근무가능지역', 'CS교육이수여부', '청소교육이수여부', '부재중서비스가능여부', '추천인여부', '매칭성공여부'],
      dtype='object')

In [6]:
# 데이터 불균형 심함 -> 오버샘플링(SMOTE) or 언더샘플링 고려
train['매칭성공여부'].value_counts() / train['매칭성공여부'].count()

0    0.904472
1    0.095528
Name: 매칭성공여부, dtype: float64

In [7]:
# 결제횟수
# 64% -> 4회 결제(이벤트가 있나?)
# 10% -> 1회 결제(단순호기심?)
train['전체회차'].value_counts() / train['전체회차'].count()

4     0.638315
1     0.099222
10    0.063280
8     0.060715
5     0.035899
9     0.033813
6     0.025773
7     0.024599
2     0.010865
3     0.006910
30    0.000608
Name: 전체회차, dtype: float64

In [8]:
# 주어진 기간동안 결제한 만큼을 다 사용한 고객
train[train['전체회차'] == train['현재회차']]['고객ID'].nunique()

265

In [9]:
# train과 test의 기간이 같음 -> train-valid 느낌
print('train 기간 :', train['서비스일자'].min(), '~', train['서비스일자'].max())
print('test 기간 :', test['서비스일자'].min(), '~', test['서비스일자'].max())

train 기간 : 2019-04-09 ~ 2021-10-06
test 기간 : 2019-04-09 ~ 2021-10-06


In [10]:
# 기존고객이 사용하는 경우가 대부분
train['기존고객여부'].value_counts()

1    20084
0     2925
Name: 기존고객여부, dtype: int64

In [11]:
train['결재형태'].value_counts()

신용카드     15428
무통장입금     5961
가상계좌      1609
미수          11
Name: 결재형태, dtype: int64

In [12]:
train['서비스주소'].value_counts().head()

충남 천안시    12554
충남 아산시     3420
강원 원주시     2491
서울 용산구      389
광주 서구       338
Name: 서비스주소, dtype: int64

In [13]:
train['주거형태'].value_counts()

일반주택       19655
오피스텔/원룸     3354
Name: 주거형태, dtype: int64

In [14]:
train['평수'].value_counts()

40평대이상    5831
30평대      5357
20평대      1772
10평대       896
Name: 평수, dtype: int64

In [15]:
feature = pd.concat([train, test])

In [16]:
feature['매칭성공여부'].value_counts()

0.0    20811
1.0     2198
Name: 매칭성공여부, dtype: int64

In [17]:
feature['기존고객여부'].value_counts()

1    25130
0     3633
Name: 기존고객여부, dtype: int64

In [18]:
feature['결재형태'].value_counts()

신용카드     19322
무통장입금     7426
가상계좌      2002
미수          13
Name: 결재형태, dtype: int64

In [19]:
feature['서비스주소'].value_counts().head()

충남 천안시    15628
충남 아산시     4366
강원 원주시     3075
서울 용산구      491
광주 서구       424
Name: 서비스주소, dtype: int64

In [20]:
# train과 test의 구성이 매우 유사하다고 볼 수 있음

## 기본 전처리 및 feature 생성

In [21]:
# 기본 처리

In [21]:
# 날짜 처리
train['접수일'] = pd.to_datetime(train['접수일'])
train['최초서비스일'] = pd.to_datetime(train['최초서비스일'])
train['서비스일자'] = pd.to_datetime(train['서비스일자'])

# 시간 처리
train['서비스시작시간'] = pd.to_datetime(train['서비스시작시간'], format='%H:%M:%S')
train['서비스종료시간'] = pd.to_datetime(train['서비스종료시간'], format='%H:%M:%S')

train['서비스소요시간'] = train['서비스종료시간'] - train['서비스시작시간']
train['접수후걸린시일'] = train['최초서비스일'] - train['접수일']
train['회차사용비율'] = train['현재회차'] / train['전체회차']

In [22]:
# 날짜 처리
test['접수일'] = pd.to_datetime(test['접수일'])
test['최초서비스일'] = pd.to_datetime(test['최초서비스일'])
test['서비스일자'] = pd.to_datetime(test['서비스일자'])

# 시간 처리
test['서비스시작시간'] = pd.to_datetime(test['서비스시작시간'], format='%H:%M:%S')
test['서비스종료시간'] = pd.to_datetime(test['서비스종료시간'], format='%H:%M:%S')

test['서비스소요시간'] = test['서비스종료시간'] - test['서비스시작시간']
test['접수후걸린시일'] = test['최초서비스일'] - test['접수일']
test['회차사용비율'] = test['현재회차'] / test['전체회차']

In [23]:
train[['서비스소요시간', '접수후걸린시일', '회차사용비율']].describe()

Unnamed: 0,서비스소요시간,접수후걸린시일,회차사용비율
count,23009,23009,23009.0
mean,0 days 03:56:28.308922595,7 days 17:20:23.990612369,0.540951
std,0 days 00:43:59.172904291,6 days 08:25:44.391509399,0.284242
min,0 days 02:00:00,-4 days +00:00:00,0.033333
25%,0 days 04:00:00,4 days 00:00:00,0.25
50%,0 days 04:00:00,7 days 00:00:00,0.5
75%,0 days 04:00:00,8 days 00:00:00,0.75
max,0 days 09:00:00,70 days 00:00:00,1.0


In [24]:
# 좀 너무한데..?
train[train['서비스소요시간'] == '0 days 09:00:00']['매칭성공여부'].value_counts()

0    204
1     24
Name: 매칭성공여부, dtype: int64

### mean_encoding

In [25]:
train.shape

(23009, 38)

In [26]:
# 전체회차 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('전체회차')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['전체회차_mean'] = train['전체회차'].map(smooth)
test['전체회차_mean'] = test['전체회차'].map(smooth)

In [27]:
# 결제형태 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('결재형태')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['결제형태_mean'] = train['결재형태'].map(smooth)
test['결제형태_mean'] = test['결재형태'].map(smooth)

In [28]:
# 서비스주소 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('서비스주소')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['서비스주소_mean'] = train['서비스주소'].map(smooth)
test['서비스주소_mean'] = test['서비스주소'].map(smooth)

In [29]:
# 주거형태 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('주거형태')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['주거형태_mean'] = train['주거형태'].map(smooth)
test['주거형태_mean'] = test['주거형태'].map(smooth)

In [30]:
# 매니저사용휴대폰 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('매니저사용휴대폰')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['매니저사용휴대폰_mean'] = train['매니저사용휴대폰'].map(smooth)
test['매니저사용휴대폰_mean'] = test['매니저사용휴대폰'].map(smooth)

In [31]:
# 매니저주소 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('매니저주소')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['매니저주소_mean'] = train['매니저주소'].map(smooth)
test['매니저주소_mean'] = test['매니저주소'].map(smooth)

In [32]:
# 매니저이동방법 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('매니저이동방법')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['매니저이동방법_mean'] = train['매니저이동방법'].map(smooth)
test['매니저이동방법_mean'] = test['매니저이동방법'].map(smooth)

In [33]:
# 근무가능지역 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('근무가능지역')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['근무가능지역_mean'] = train['근무가능지역'].map(smooth)
test['근무가능지역_mean'] = test['근무가능지역'].map(smooth)

In [34]:
# 서비스소요시간 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('서비스소요시간')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['서비스소요시간_mean'] = train['서비스소요시간'].map(smooth)
test['서비스소요시간_mean'] = test['서비스소요시간'].map(smooth)

In [35]:
# 접수후걸린시일 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('접수후걸린시일')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['접수후걸린시일_mean'] = train['접수후걸린시일'].map(smooth)
test['접수후걸린시일_mean'] = test['접수후걸린시일'].map(smooth)

In [36]:
# 반려동물 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('반려동물')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['반려동물_mean'] = train['반려동물'].map(smooth)
test['반려동물_mean'] = test['반려동물'].map(smooth)

In [37]:
# 우선청소 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('우선청소')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['우선청소_mean'] = train['우선청소'].map(smooth)
test['우선청소_mean'] = test['우선청소'].map(smooth)

In [38]:
# 부재중여부 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('부재중여부')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['부재중여부_mean'] = train['부재중여부'].map(smooth)
test['부재중여부_mean'] = test['부재중여부'].map(smooth)

In [39]:
# 쿠폰사용여부 mean_encoding
mean = train['매칭성공여부'].mean()
agg = train.groupby('쿠폰사용여부')['매칭성공여부'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean) / (counts+weight)
train['쿠폰사용여부_mean'] = train['쿠폰사용여부'].map(smooth)
test['쿠폰사용여부_mean'] = test['쿠폰사용여부'].map(smooth)

In [40]:
# CS교육이수여부 ME
mean = train['매칭성공여부'].mean()
agg = train.groupby('CS교육이수여부')['매칭성공여부'].agg(['count','mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean)/(counts+weight)
train['CS교육이수여부_mean'] = train['CS교육이수여부'].map(smooth)
test['CS교육이수여부_mean'] = test['CS교육이수여부'].map(smooth)

In [41]:
# 청소교육이수여부 ME
mean = train['매칭성공여부'].mean()
agg = train.groupby('청소교육이수여부')['매칭성공여부'].agg(['count','mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean)/(counts+weight)
train['청소교육이수여부_mean'] = train['청소교육이수여부'].map(smooth)
test['청소교육이수여부_mean'] = test['청소교육이수여부'].map(smooth)

In [42]:
# 부재중서비스가능여부 ME
mean = train['매칭성공여부'].mean()
agg = train.groupby('부재중서비스가능여부')['매칭성공여부'].agg(['count','mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean)/(counts+weight)
train['부재중서비스가능여부_mean'] = train['부재중서비스가능여부'].map(smooth)
test['부재중서비스가능여부_mean'] = test['부재중서비스가능여부'].map(smooth)

In [43]:
# 추천인여부 ME
mean = train['매칭성공여부'].mean()
agg = train.groupby('추천인여부')['매칭성공여부'].agg(['count','mean'])
counts = agg['count']
means = agg['mean']
weight = 80
smooth = (counts*means+weight*mean)/(counts+weight)
train['추천인여부_mean'] = train['추천인여부'].map(smooth)
test['추천인여부_mean'] = test['추천인여부'].map(smooth)

In [44]:
train.shape

(23009, 56)

### 특이사항

In [45]:
# 음수값은 대체 뭐냐....
agg.head()

Unnamed: 0_level_0,count,mean
추천인여부,Unnamed: 1_level_1,Unnamed: 2_level_1
0,22848,0.095632
1,161,0.080745


In [46]:
train.loc[train['접수후걸린시일'] < '0 days']['고객ID'].unique()

array(['CLEAN_MY_ID_576', 'CLEAN_MY_ID_8552', 'CLEAN_MY_ID_1463'],
      dtype=object)

In [47]:
train.loc[train['접수후걸린시일'] < '0 days']['매칭성공여부'].mean()

0.07936507936507936

In [48]:
feature = pd.concat([train, test])

In [49]:
feature.loc[feature['접수후걸린시일'] < '0 days']['고객ID'].unique()

array(['CLEAN_MY_ID_576', 'CLEAN_MY_ID_8552', 'CLEAN_MY_ID_1463'],
      dtype=object)

In [50]:
test.loc[test['접수후걸린시일'] < '0 days']['고객ID'].unique()

array(['CLEAN_MY_ID_8552', 'CLEAN_MY_ID_576', 'CLEAN_MY_ID_1463'],
      dtype=object)

In [51]:
test.loc[test['접수후걸린시일'] < '0 days'].shape

(14, 55)

### ont-hot encoding - 굳이 할 필요 있을까 싶음

In [52]:
oh1 = pd.get_dummies(train['결재형태'])
oh2 = pd.get_dummies(train['서비스주소'])
oh3 = pd.get_dummies(train['주거형태'])

train = pd.concat([train, oh1, oh2, oh3], axis=1)

In [53]:
oh1 = pd.get_dummies(test['결재형태'])
oh2 = pd.get_dummies(test['서비스주소'])
oh3 = pd.get_dummies(test['주거형태'])

test = pd.concat([test, oh1, oh2, oh3], axis=1)

In [54]:
train.shape, test.shape

((23009, 104), (5754, 103))

In [55]:
train.head(1)

Unnamed: 0,SEQ,접수일,접수시각,장기서비스여부,최초서비스일,전체회차,현재회차,서비스일자,서비스시작시간,서비스종료시간,기존고객여부,결재형태,서비스주소,주거형태,평수,고객ID,고객가입일,반려동물,부재중여부,우선청소,쿠폰사용여부,매니저생년월일,매니저ID,매니저최초가입일,매니저최초서비스일,매니저성별,매니저사용휴대폰,매니저주소,매니저이동방법,근무가능지역,CS교육이수여부,청소교육이수여부,부재중서비스가능여부,추천인여부,매칭성공여부,서비스소요시간,접수후걸린시일,회차사용비율,전체회차_mean,결제형태_mean,서비스주소_mean,주거형태_mean,매니저사용휴대폰_mean,매니저주소_mean,매니저이동방법_mean,근무가능지역_mean,서비스소요시간_mean,접수후걸린시일_mean,반려동물_mean,우선청소_mean,부재중여부_mean,쿠폰사용여부_mean,CS교육이수여부_mean,청소교육이수여부_mean,부재중서비스가능여부_mean,추천인여부_mean,가상계좌,무통장입금,미수,신용카드,강원 원주시,강원 춘천시,경기 성남시,경기 안성시,경기 의정부시,경기 평택시,경남 창원시,광주 광산구,광주 남구,광주 동구,광주 북구,광주 서구,부산 강서구,부산 동구,부산 동래구,부산 사하구,부산 연제구,부산 해운대구,서울 강남구,서울 강동구,서울 강북구,서울 강서구,서울 관악구,서울 구로구,서울 금천구,서울 노원구,서울 도봉구,서울 동대문구,서울 동작구,서울 마포구,서울 서대문구,서울 서초구,서울 성동구,서울 성북구,서울 영등포구,서울 용산구,서울 은평구,서울 종로구,서울 중구,서울 중랑구,충남 아산시,충남 천안시,오피스텔/원룸,일반주택
0,T06420,2019-07-09,,1,2019-07-15,3,3,2019-07-29,1900-01-01 09:00:00,1900-01-01 13:00:00,1,무통장입금,충남 천안시,일반주택,,CLEAN_MY_ID_38,2019-04-22,,,,0,1956-04-19,HELPER_ID_437,2020-04-02,2020-06-30,0,안드로이드,서울 양천구,대중교통,서울,0.0,0.0,1,0,0,0 days 04:00:00,6 days,1.0,0.103106,0.093965,0.095033,0.0957,0.097482,0.085593,0.099812,0.086232,0.095759,0.095668,,,,0.095549,0.100704,0.101248,0.093087,0.095632,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


### 외부데이터

In [56]:
# 이용후기 크롤링 - 어떤 요소가 만족스러운지 파악 가능
# https://blog.naver.com/cleanveteran/222545015505

### 결측값 처리

In [46]:
#!pip install missingpy

In [55]:
from missingpy import MissForest
imputer = MissForest(random_state=0)
train_imputed = imputer.fit(train, cat_vars=cat_features)

ModuleNotFoundError: No module named 'sklearn.neighbors.base'

### 모델링

In [57]:
train.columns

Index(['SEQ', '접수일', '접수시각', '장기서비스여부', '최초서비스일', '전체회차', '현재회차', '서비스일자',
       '서비스시작시간', '서비스종료시간',
       ...
       '서울 영등포구', '서울 용산구', '서울 은평구', '서울 종로구', '서울 중구', '서울 중랑구', '충남 아산시',
       '충남 천안시', '오피스텔/원룸', '일반주택'],
      dtype='object', length=104)

In [58]:
# 결측치 포함 열 제외
features = ['장기서비스여부', '전체회차', '현재회차', '기존고객여부', '결재형태', '서비스주소', '주거형태',
            '쿠폰사용여부', '매니저주소', '근무가능지역', '부재중서비스가능여부', '추천인여부', '서비스소요시간', 
            '접수후걸린시일', '회차사용비율']

In [59]:
ftr = train[features]
test_x = test[features]
target = train['매칭성공여부']

In [60]:
from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from ngboost import NGBRegressor
from catboost import CatBoostRegressor, Pool

In [61]:
kf = KFold(n_splits=15, random_state=0, shuffle=True)

In [62]:
cat_features = ['기존고객여부', '결재형태', '서비스주소', '주거형태',
                #'반려동물', '부재중여부', '우선청소', '매니저사용휴대폰', '매니저이동방법', 'CS교육이수여부', '청소교육이수여부'
                '쿠폰사용여부', '매니저주소', '근무가능지역', '부재중서비스가능여부', '추천인여부']

In [63]:
cb = CatBoostRegressor(iterations=10000, learning_rate=0.01, eval_metric='MAE', silent=True, loss_function='MAE')

In [64]:
cb_pred = np.zeros((test_x.shape[0]))
mae_list = []
for tr_idx, val_idx in kf.split(ftr):
    tr_x, val_x = ftr.iloc[tr_idx], ftr.iloc[val_idx]
    tr_y, val_y = target.iloc[tr_idx], target.iloc[val_idx]
    train_data = Pool(data = tr_x, label = tr_y, cat_features = cat_features)
    val_data = Pool(data = val_x, label = val_y, cat_features = cat_features)
    cb.fit(train_data, eval_set = val_data, early_stopping_rounds=1000, use_best_model=True, verbose=2000)
    best = cb.best_iteration_
    pred = cb.predict(val_x, ntree_end = best)
    mae = mean_absolute_error(val_y, pred)
    mae_list.append(mae)
    print(f'FOLD MAE = {mae}')
    sub_pred = cb.predict(test_x, ntree_end = best) / 15
    cb_pred += sub_pred
print(f'\n{cb.__class__.__name__} MAE = {np.mean(mae_list)}')

0:	learn: 0.0950407	test: 0.1023468	best: 0.1023468 (0)	total: 188ms	remaining: 31m 19s
2000:	learn: 0.0891603	test: 0.0936930	best: 0.0936929 (1999)	total: 40s	remaining: 2m 39s
4000:	learn: 0.0877063	test: 0.0928631	best: 0.0928631 (4000)	total: 1m 20s	remaining: 2m 1s
6000:	learn: 0.0867497	test: 0.0924713	best: 0.0924712 (5998)	total: 2m 1s	remaining: 1m 20s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.09223185322
bestIteration = 6298

Shrink model to first 6299 iterations.
FOLD MAE = 0.09223231103207027
0:	learn: 0.0955064	test: 0.0958279	best: 0.0958279 (0)	total: 28.1ms	remaining: 4m 40s
2000:	learn: 0.0951345	test: 0.0957277	best: 0.0957277 (1516)	total: 36.7s	remaining: 2m 26s
4000:	learn: 0.0946051	test: 0.0952784	best: 0.0952784 (4000)	total: 1m 16s	remaining: 1m 54s
6000:	learn: 0.0887390	test: 0.0888676	best: 0.0888676 (6000)	total: 1m 56s	remaining: 1m 17s
8000:	learn: 0.0872005	test: 0.0881909	best: 0.0880906 (7516)	total: 2m 37s	remaining: 39.3s

In [65]:
cb_pred.shape

(5754,)

### pseudo labeling

### submission

In [66]:
submission['pred'] = cb_pred

In [63]:
submission['pred'].astype(int)

0       0
1       0
2       0
3       0
4       0
       ..
5749    0
5750    0
5751    0
5752    0
5753    0
Name: pred, Length: 5754, dtype: int32

In [67]:
submission['pred'] = submission['pred'].apply(lambda x : 1 if x >= 0.5 else 0)

In [68]:
submission['pred'].value_counts()

0    5706
1      48
Name: pred, dtype: int64

In [69]:
submission.to_csv('1107_catboost.csv')