# DACON 

## [문화] 영화 관객수 예측 모델 개발

****

## 1. 패키지 불러오기

In [939]:
# Wrangling Package
import pandas as pd
import numpy as np
from scipy.stats import skew
import datetime

# EDA Package
import seaborn as sns
import matplotlib.pyplot as plt # to plot graph
# 한글 그래픽 처리
from matplotlib import rc
rc('font', family='malgun gothic')
rc('axes', unicode_minus=False)
%matplotlib inline


# Setting Package
import warnings
warnings.filterwarnings('ignore')

# Modeling Package
from bayes_opt import BayesianOptimization
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from ngboost import NGBRegressor
from catboost import CatBoostRegressor

****

## 2. 데이터 불러오기

In [940]:
train = pd.read_csv("movies_train.csv") # trainain
test = pd.read_csv("movies_test.csv") # test
submission = pd.read_csv('submission.csv') # submission

In [941]:
train.head(2)

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501


In [942]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           600 non-null    object 
 1   distributor     600 non-null    object 
 2   genre           600 non-null    object 
 3   release_time    600 non-null    object 
 4   time            600 non-null    int64  
 5   screening_rat   600 non-null    object 
 6   director        600 non-null    object 
 7   dir_prev_bfnum  270 non-null    float64
 8   dir_prev_num    600 non-null    int64  
 9   num_staff       600 non-null    int64  
 10  num_actor       600 non-null    int64  
 11  box_off_num     600 non-null    int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 56.4+ KB


In [943]:
test.head(2)

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,용서는 없다,시네마서비스,느와르,2010-01-07,125,청소년 관람불가,김형준,300529.0,2,304,3
1,아빠가 여자를 좋아해,(주)쇼박스,멜로/로맨스,2010-01-14,113,12세 관람가,이광재,342700.25,4,275,3


In [944]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           243 non-null    object 
 1   distributor     243 non-null    object 
 2   genre           243 non-null    object 
 3   release_time    243 non-null    object 
 4   time            243 non-null    int64  
 5   screening_rat   243 non-null    object 
 6   director        243 non-null    object 
 7   dir_prev_bfnum  107 non-null    float64
 8   dir_prev_num    243 non-null    int64  
 9   num_staff       243 non-null    int64  
 10  num_actor       243 non-null    int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 21.0+ KB


****

## 3. 기본 전처리

'dir_prev_bfnum' 컬럼은 결측치가 많이 존재해 컬럼 자체를 삭제

In [945]:
train.drop('dir_prev_bfnum', axis = 1, inplace = True)
test.drop('dir_prev_bfnum', axis = 1, inplace = True)

배급사를 의미하는 distribuotr 컬럼으로 '배급사파워' 변수 생성

In [946]:
def dis_power(x):
    if '프리비젼' in x :
        return '프리비젼'
    elif '타임스토리' in x :
        return '타임스토리'
    elif x == '와이드 릴리즈(주)' :
        return '와이드릴리즈(주)'
    elif '싸이더스' in x :
        return '싸이더스'
    elif x == '시네마 달' :
        return '시네마달'
    elif '스폰지' in x :
        return '스폰지'
    elif '메가박스' in x :
        return '메가박스'
    elif '마운틴' in x :
        return '마운틴'
    elif '리틀빅' in x :
        return '리틀빅픽쳐스'
    elif '롯데' in x :
        return '롯데'
    elif 'SK' in x :
        return 'SK'
    elif ('CGV' in x) or ('CJ' in x) :
        return 'CJ'
    elif '에이원' in x :
        return '에이원'
    elif '마인스' in x :
        return '마인스'
    elif '디씨드' in x :
        return '디씨드'
    elif '드림팩트' in x :
        return '드림팩트'
    elif 'KT' in x :
        return 'KT'
    elif '쇼박스' in x :
        return '쇼박스'
    else :
        return x

In [947]:
train['distributor'] = train['distributor'].apply(dis_power)
test['distributor'] = test['distributor'].apply(dis_power)

In [948]:
dis_rank = train.groupby('distributor').box_off_num.mean().reset_index(name = '배급사평균관객').sort_values(by = '배급사평균관객', ascending = False)

In [949]:
dis_rank['배급사파워'] = pd.qcut(dis_rank.배급사평균관객, 10, labels = False)

In [950]:
dis_rank.sample(5)

Unnamed: 0,distributor,배급사평균관객,배급사파워
147,화앤담이엔티,172196.5,8
84,스튜디오 블루,771699.0,9
95,어뮤즈,5581.2,5
87,시너지,456663.6,8
146,홀리가든,1721.5,3


In [951]:
train = pd.merge(train, dis_rank.iloc[:, [0,2]], how = 'left', on = 'distributor')
test = pd.merge(test, dis_rank.iloc[:, [0,2]], how = 'left', on = 'distributor').fillna(0)

****

## 4. 기본 모델 성능 확인

### 1) Numeric Columns + dummies columns(genre, screening_rat)

In [185]:
X = train[['genre', 'dir_prev_num', 'num_staff', 'num_actor', 'screening_rat', 'time','배급사파워']]

In [186]:
X = pd.get_dummies(columns = ['genre', 'screening_rat'], data = X)

In [187]:
y = np.log1p(train.box_off_num)

In [188]:
target = test[['genre', 'dir_prev_num', 'num_staff', 'num_actor', 'screening_rat', 'time','배급사파워']]
target = pd.get_dummies(columns = ['genre', 'screening_rat'], data = target)

RMSE = 1365898.4317376306	
****

### 2) 1번 + scaling

In [258]:
X = train[['genre', 'dir_prev_num', 'num_staff', 'num_actor', 'screening_rat', 'time','배급사파워']]

In [259]:
X = pd.get_dummies(columns = ['genre', 'screening_rat'], data = X)

In [260]:
scaler = StandardScaler()

In [261]:
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

In [263]:
target = test[['dir_prev_num', 'num_staff', 'num_actor', 'time','배급사파워']]
target = pd.concat([target, pd.get_dummies(columns = ['genre', 'screening_rat'], data = test[['genre','screening_rat']])], axis = 1)
target = pd.DataFrame(scaler.transform(target), columns = target.columns)

MinMax = GBM이 1372060.0419581125로 최곻
****

### 3) 로그 변환

In [348]:
X = train[['genre', 'dir_prev_num', 'num_staff', 'num_actor', 'screening_rat', 'time','배급사파워']]

In [349]:
X = pd.get_dummies(columns = ['genre', 'screening_rat'], data = X)

In [350]:
X[['dir_prev_num', 'num_actor']] = np.log1p(X[['dir_prev_num', 'num_actor']])

In [351]:
y = np.log1p(train.box_off_num)

In [352]:
target = test[['dir_prev_num', 'num_staff', 'num_actor', 'time','배급사파워']]
target = pd.concat([target, pd.get_dummies(columns = ['genre', 'screening_rat'], data = test[['genre','screening_rat']])], axis = 1)
target[['dir_prev_num', 'num_actor']] = np.log1p(target[['dir_prev_num', 'num_actor']])

****
### 4) 로그변환 + scaling

In [953]:
X = train[['genre', 'dir_prev_num', 'num_staff', 'num_actor', 'screening_rat', 'time','배급사파워']]
X = pd.get_dummies(columns = ['genre', 'screening_rat'], data = X)
X[['dir_prev_num', 'num_actor']] = np.log1p(X[['dir_prev_num', 'num_actor']])

In [954]:
scaler = RobustScaler()

In [955]:
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

In [956]:
target = test[['dir_prev_num', 'num_staff', 'num_actor', 'time','배급사파워']]
target = pd.concat([target, pd.get_dummies(columns = ['genre', 'screening_rat'], data = test[['genre','screening_rat']])], axis = 1)
target[['dir_prev_num', 'num_actor']] = np.log1p(target[['dir_prev_num', 'num_actor']])
target = pd.DataFrame(scaler.transform(target), columns = target.columns)

### BayesOpt

In [957]:
# 하이퍼 파라미터 범위

pbounds = { 'n_estimators': (8,512),
            'max_depth': (3,10), # 나무의 범위를 넓게 주었기에 과적합을 막고자 깊이를 10이상으론 올라가지 않도록 하였다
            'max_features': (0.5,1),
            'min_samples_leaf': (1, 10)}

def rf_opt(n_estimators, max_depth, max_features, min_samples_leaf):
    
    params = {
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'min_samples_leaf' : int(round(min_samples_leaf))
    }

    rf = RandomForestRegressor(**params, n_jobs=-1, random_state=50)
    
    kf = KFold(n_splits = 10 , shuffle = True, random_state=50)
    
    score = cross_val_score(rf, X, y, scoring='neg_mean_absolute_error', cv=kf, n_jobs=-1)
    
    return np.mean(score)

BO_rf = BayesianOptimization(f = rf_opt, pbounds = pbounds, random_state=0)

In [958]:
BO_rf.maximize(init_points = 20, n_iter = 100)

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.208   [0m | [0m 6.842   [0m | [0m 0.8576  [0m | [0m 6.425   [0m | [0m 282.6   [0m |
| [0m 2       [0m | [0m-1.211   [0m | [0m 5.966   [0m | [0m 0.8229  [0m | [0m 4.938   [0m | [0m 457.5   [0m |
| [95m 3       [0m | [95m-1.203   [0m | [95m 9.746   [0m | [95m 0.6917  [0m | [95m 8.126   [0m | [95m 274.6   [0m |
| [0m 4       [0m | [0m-1.211   [0m | [0m 6.976   [0m | [0m 0.9628  [0m | [0m 1.639   [0m | [0m 51.91   [0m |
| [0m 5       [0m | [0m-1.228   [0m | [0m 3.142   [0m | [0m 0.9163  [0m | [0m 8.003   [0m | [0m 446.5   [0m |
| [0m 6       [0m | [0m-1.209   [0m | [0m 9.85    [0m | [0m 0.8996  [0m | [0m 5.153   [0m | [0m 401.4   [0m |
| [0m 7       [0m | [0m-1.217   [0m | [0m 3.828   [0m | [0m 0.82    [0m | [0m 2.29    [0m | [0m 484.1  

| [0m 67      [0m | [0m-1.211   [0m | [0m 9.322   [0m | [0m 0.8819  [0m | [0m 9.869   [0m | [0m 375.9   [0m |
| [0m 68      [0m | [0m-1.202   [0m | [0m 9.947   [0m | [0m 0.8735  [0m | [0m 9.227   [0m | [0m 511.7   [0m |
| [0m 69      [0m | [0m-1.21    [0m | [0m 10.0    [0m | [0m 0.8741  [0m | [0m 9.535   [0m | [0m 507.4   [0m |
| [0m 70      [0m | [0m-1.208   [0m | [0m 8.579   [0m | [0m 0.9015  [0m | [0m 4.584   [0m | [0m 511.6   [0m |
| [0m 71      [0m | [0m-1.207   [0m | [0m 5.273   [0m | [0m 0.6932  [0m | [0m 9.161   [0m | [0m 511.5   [0m |
| [0m 72      [0m | [0m-1.235   [0m | [0m 3.0     [0m | [0m 0.5932  [0m | [0m 2.555   [0m | [0m 506.4   [0m |
| [0m 73      [0m | [0m-1.208   [0m | [0m 5.817   [0m | [0m 0.7857  [0m | [0m 1.196   [0m | [0m 204.9   [0m |
| [0m 74      [0m | [0m-1.231   [0m | [0m 3.014   [0m | [0m 0.9155  [0m | [0m 8.088   [0m | [0m 204.5   [0m |
| [0m 75      [0m | [

In [959]:
BO_rf.max

{'target': -1.2022119424997129,
 'params': {'max_depth': 8.562306660694897,
  'max_features': 0.8035416599413274,
  'min_samples_leaf': 8.135411976682523,
  'n_estimators': 350.3425712555171}}

In [960]:
params = BO_rf.max['params']
params['n_estimators'] = int(round(params['n_estimators']))
params['max_depth'] = int(round(params['max_depth']))
params['min_samples_leaf'] = int(round(params['min_samples_leaf']))

In [961]:
rf = RandomForestRegressor(random_state = 1010, **params)
rf.fit(X, y)
pred_rf = np.expm1(rf.predict(target))

In [962]:
submission['box_off_num'] = pred_rf

In [963]:
submission.sort_values(by = 'box_off_num', ascending = False)[:10]

Unnamed: 0,title,box_off_num
178,명량,3861968.0
229,베테랑,3349966.0
179,군도: 민란의 시대,3281265.0
135,용의자,3101611.0
108,신세계,2869400.0
142,박수건달,2618141.0
15,포화 속으로,2609610.0
0,용서는 없다,2561096.0
106,댄싱퀸,2440785.0
212,간신,2436854.0


In [964]:
pbounds = { 'learning_rate': (0.0005, 0.2),
            'n_estimators': (8, 512),
            'max_depth': (3,10),   
            'subsample': (0.5, 1), 
            'colsample_bytree': (0.5, 1),   
            'num_leaves': (2,10),
            'min_child_weight': (1, 7)}


def lgbm_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, num_leaves, min_child_weight):

    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'colsample_bytree' : colsample_bytree,
        'num_leaves' : int(round(num_leaves)),
        'min_child_weight' : min_child_weight,
        'n_jobs' : -1
    }
    
    lgbm = LGBMRegressor(**params)
    
    kf = KFold(n_splits = 10 , shuffle=False, random_state=50)

    score = cross_val_score(lgbm, X, y, scoring='neg_mean_absolute_error', cv = kf, n_jobs=-1)
    
    return np.mean(score)

BO_lgbm = BayesianOptimization(f = lgbm_opt, pbounds = pbounds, random_state=1)    

In [965]:
BO_lgbm.maximize(init_points = 20, n_iter = 100)

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.219   [0m | [0m 0.7085  [0m | [0m 0.1442  [0m | [0m 3.001   [0m | [0m 2.814   [0m | [0m 81.96   [0m | [0m 2.739   [0m | [0m 0.5931  [0m |
| [0m 2       [0m | [0m-1.268   [0m | [0m 0.6728  [0m | [0m 0.07966 [0m | [0m 6.772   [0m | [0m 3.515   [0m | [0m 353.4   [0m | [0m 3.636   [0m | [0m 0.9391  [0m |
| [95m 3       [0m | [95m-1.213   [0m | [95m 0.5137  [0m | [95m 0.1343  [0m | [95m 5.921   [0m | [95m 4.352   [0m | [95m 78.76   [0m | [95m 3.585   [0m | [95m 0.9004  [0m |
| [0m 4       [0m | [0m-1.245   [0m | [0m 0.9841  [0m | [0m 0.06303 [0m | [0m 7.846   [0m | [0m 6.258   [0m | [0m 458.9   [0m | [0m 2.68    [0m | [0m 0.5195  [0m |
| [0m 5       [0m | [0m-1.406   [0m | 

| [0m 45      [0m | [0m-1.361   [0m | [0m 1.0     [0m | [0m 0.2     [0m | [0m 3.264   [0m | [0m 7.0     [0m | [0m 265.4   [0m | [0m 5.646   [0m | [0m 1.0     [0m |
| [0m 46      [0m | [0m-2.654   [0m | [0m 0.5     [0m | [0m 0.0005  [0m | [0m 6.807   [0m | [0m 1.0     [0m | [0m 270.6   [0m | [0m 6.352   [0m | [0m 1.0     [0m |
| [0m 47      [0m | [0m-1.216   [0m | [0m 0.9145  [0m | [0m 0.02562 [0m | [0m 7.077   [0m | [0m 5.891   [0m | [0m 262.1   [0m | [0m 2.857   [0m | [0m 0.7032  [0m |
| [0m 48      [0m | [0m-1.27    [0m | [0m 0.5806  [0m | [0m 0.104   [0m | [0m 5.729   [0m | [0m 4.243   [0m | [0m 258.1   [0m | [0m 4.169   [0m | [0m 0.9651  [0m |
| [0m 49      [0m | [0m-1.535   [0m | [0m 0.9933  [0m | [0m 0.004378[0m | [0m 8.615   [0m | [0m 6.951   [0m | [0m 263.9   [0m | [0m 7.233   [0m | [0m 0.8691  [0m |
| [0m 50      [0m | [0m-1.225   [0m | [0m 0.8218  [0m | [0m 0.08451 [0m | [0m 3.7

| [0m 90      [0m | [0m-1.221   [0m | [0m 0.5     [0m | [0m 0.2     [0m | [0m 5.765   [0m | [0m 7.0     [0m | [0m 85.66   [0m | [0m 3.216   [0m | [0m 1.0     [0m |
| [0m 91      [0m | [0m-2.658   [0m | [0m 1.0     [0m | [0m 0.0005  [0m | [0m 8.223   [0m | [0m 2.491   [0m | [0m 256.5   [0m | [0m 2.0     [0m | [0m 0.5     [0m |
| [0m 92      [0m | [0m-1.339   [0m | [0m 1.0     [0m | [0m 0.2     [0m | [0m 8.476   [0m | [0m 3.114   [0m | [0m 262.3   [0m | [0m 4.824   [0m | [0m 1.0     [0m |
| [0m 93      [0m | [0m-1.239   [0m | [0m 1.0     [0m | [0m 0.2     [0m | [0m 3.0     [0m | [0m 4.722   [0m | [0m 206.0   [0m | [0m 2.0     [0m | [0m 1.0     [0m |
| [0m 94      [0m | [0m-1.298   [0m | [0m 0.5033  [0m | [0m 0.1991  [0m | [0m 6.056   [0m | [0m 3.677   [0m | [0m 459.9   [0m | [0m 2.842   [0m | [0m 0.9431  [0m |
| [0m 95      [0m | [0m-1.338   [0m | [0m 1.0     [0m | [0m 0.2     [0m | [0m 5.0

In [966]:
BO_lgbm.max

{'target': -1.20917632275196,
 'params': {'colsample_bytree': 0.7261814680472192,
  'learning_rate': 0.10707694152697225,
  'max_depth': 8.239262857431134,
  'min_child_weight': 3.9711828483549247,
  'n_estimators': 79.22844477316195,
  'num_leaves': 3.2862522821416347,
  'subsample': 0.6077260486252966}}

In [967]:
params = BO_lgbm.max['params']
params['max_depth'] = int(round(params['max_depth']))
params['n_estimators'] = int(round(params['n_estimators']))
params['num_leaves'] = int(round(params['num_leaves']))

In [968]:
lgbm = LGBMRegressor(random_state = 1010, **params)
lgbm.fit(X, y)
pred_lgbm = np.expm1(lgbm.predict(target))

In [969]:
submission['box_off_num'] = pred_lgbm

In [970]:
submission.sort_values(by = 'box_off_num', ascending = False)[:20]

Unnamed: 0,title,box_off_num
229,베테랑,3273155.0
178,명량,2935712.0
179,군도: 민란의 시대,2597441.0
106,댄싱퀸,2512630.0
15,포화 속으로,2466358.0
142,박수건달,2461039.0
135,용의자,2445491.0
184,기술자들,2171414.0
166,방황하는 칼날,1945400.0
61,수상한 고객들,1773254.0


In [971]:
pbounds = { 'learning_rate': (0.0005, 0.2),
            'n_estimators': (16, 512),
            'max_depth': (3,10),   
            'subsample': (0.5,1), 
            'min_samples_split': (2,5),   
            'min_samples_leaf': (1,5)}

def gbm_opt(learning_rate, n_estimators, max_depth, subsample, min_samples_split, min_samples_leaf):

    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'min_samples_split' : int(round(min_samples_split)),
        'min_samples_leaf' : int(round(min_samples_leaf))
    }
    
    gbm = GradientBoostingRegressor(**params)
    
    kf = KFold(n_splits = 10 , shuffle = True, random_state=50)

    score = cross_val_score(gbm, X, y, scoring='neg_mean_absolute_error', cv = kf, n_jobs=-1)
    
    return np.mean(score)

BO_gbm = BayesianOptimization(f = gbm_opt, pbounds = pbounds, random_state=0)    

In [972]:
BO_gbm.maximize(init_points = 20, n_iter = 100)

|   iter    |  target   | learni... | max_depth | min_sa... | min_sa... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.335   [0m | [0m 0.11    [0m | [0m 8.006   [0m | [0m 3.411   [0m | [0m 3.635   [0m | [0m 226.1   [0m | [0m 0.8229  [0m |
| [0m 2       [0m | [0m-1.379   [0m | [0m 0.0878  [0m | [0m 9.242   [0m | [0m 4.855   [0m | [0m 3.15    [0m | [0m 408.7   [0m | [0m 0.7644  [0m |
| [95m 3       [0m | [95m-1.284   [0m | [95m 0.1138  [0m | [95m 9.479   [0m | [95m 1.284   [0m | [95m 2.261   [0m | [95m 26.03   [0m | [95m 0.9163  [0m |
| [0m 4       [0m | [0m-1.373   [0m | [0m 0.1557  [0m | [0m 9.09    [0m | [0m 4.914   [0m | [0m 4.397   [0m | [0m 244.9   [0m | [0m 0.8903  [0m |
| [95m 5       [0m | [95m-1.246   [0m | [95m 0.0241  [0m | [95m 7.479   [0m | [95m 1.573   [0m | [95m 4.834   [0m | [95m 274.8   [0m |

| [0m 51      [0m | [0m-1.283   [0m | [0m 0.1616  [0m | [0m 8.177   [0m | [0m 1.622   [0m | [0m 2.419   [0m | [0m 30.22   [0m | [0m 0.8858  [0m |
| [0m 52      [0m | [0m-1.199   [0m | [0m 0.01816 [0m | [0m 4.12    [0m | [0m 3.778   [0m | [0m 2.469   [0m | [0m 455.2   [0m | [0m 0.9444  [0m |
| [0m 53      [0m | [0m-2.767   [0m | [0m 0.0005  [0m | [0m 3.186   [0m | [0m 3.55    [0m | [0m 3.943   [0m | [0m 113.6   [0m | [0m 0.5     [0m |
| [0m 54      [0m | [0m-1.393   [0m | [0m 0.1771  [0m | [0m 9.285   [0m | [0m 2.347   [0m | [0m 4.719   [0m | [0m 310.5   [0m | [0m 0.5062  [0m |
| [0m 55      [0m | [0m-1.298   [0m | [0m 0.09354 [0m | [0m 3.473   [0m | [0m 1.663   [0m | [0m 4.413   [0m | [0m 315.7   [0m | [0m 0.6133  [0m |
| [0m 56      [0m | [0m-1.348   [0m | [0m 0.1634  [0m | [0m 4.937   [0m | [0m 3.372   [0m | [0m 4.314   [0m | [0m 114.7   [0m | [0m 0.6785  [0m |
| [0m 57      [0m | [0m-1

| [0m 102     [0m | [0m-1.323   [0m | [0m 0.1496  [0m | [0m 3.978   [0m | [0m 2.457   [0m | [0m 2.757   [0m | [0m 113.8   [0m | [0m 0.7288  [0m |
| [0m 103     [0m | [0m-1.211   [0m | [0m 0.02431 [0m | [0m 4.362   [0m | [0m 4.752   [0m | [0m 3.208   [0m | [0m 113.4   [0m | [0m 0.9203  [0m |
| [0m 104     [0m | [0m-1.307   [0m | [0m 0.08672 [0m | [0m 3.683   [0m | [0m 1.658   [0m | [0m 4.59    [0m | [0m 236.5   [0m | [0m 0.6109  [0m |
| [0m 105     [0m | [0m-1.199   [0m | [0m 0.06219 [0m | [0m 4.728   [0m | [0m 2.143   [0m | [0m 3.722   [0m | [0m 48.26   [0m | [0m 0.8595  [0m |
| [0m 106     [0m | [0m-1.221   [0m | [0m 0.1474  [0m | [0m 5.148   [0m | [0m 1.907   [0m | [0m 3.311   [0m | [0m 46.72   [0m | [0m 0.5647  [0m |
| [0m 107     [0m | [0m-1.248   [0m | [0m 0.1975  [0m | [0m 4.648   [0m | [0m 2.176   [0m | [0m 3.466   [0m | [0m 46.17   [0m | [0m 0.9498  [0m |
| [0m 108     [0m | [0m-1

In [973]:
BO_gbm.max

{'target': -1.1918625469408792,
 'params': {'learning_rate': 0.05957996940566792,
  'max_depth': 3.8310940326797085,
  'min_samples_leaf': 2.271932717575904,
  'min_samples_split': 3.24278898354401,
  'n_estimators': 47.81715818899704,
  'subsample': 0.8462360596850099}}

In [988]:
params = BO_gbm.max['params']
params['max_depth'] = int(round(params['max_depth']))
params['n_estimators'] = int(round(params['n_estimators']))
params['min_samples_leaf'] = int(round(params['min_samples_leaf']))
params['min_samples_split'] = int(round(params['min_samples_split']))

In [989]:
gbm = GradientBoostingRegressor(random_state = 123, **params)
gbm.fit(X, y)
pred_gbm = np.expm1(gbm.predict(target))

In [990]:
submission['box_off_num'] = pred_gbm

In [991]:
submission.sort_values(by = 'box_off_num', ascending = False)[:10]

Unnamed: 0,title,box_off_num
178,명량,2642481.0
179,군도: 민란의 시대,2522183.0
135,용의자,2328536.0
229,베테랑,2326920.0
142,박수건달,2313457.0
106,댄싱퀸,2133304.0
0,용서는 없다,2093105.0
108,신세계,2093105.0
159,하이힐,2079551.0
83,늑대소년,2020221.0


In [979]:
submission.to_csv("basic1010.csv", index = False)

In [980]:
pbounds = {'learning_rate': (0.0005, 1.5),
           'n_estimators': (8, 512),
           'max_depth': (3,10),
           'subsample': (0.5,1),
           'colsample_bytree': (0.5,1),
           'gamma': (0, 5)}

def xgb_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, gamma):
    
    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,   
        'gamma': gamma,
        'n_jobs' : -1
    }
    
    xgb = XGBRegressor(**params)
    
    kf = KFold(n_splits = 10 , shuffle=True, random_state=50)

    score = cross_val_score(xgb, X, y, scoring='neg_mean_absolute_error', cv=kf, n_jobs=-1)
    
    return np.mean(score)

BO_xgb = BayesianOptimization(f = xgb_opt, pbounds = pbounds, random_state=0)

In [981]:
BO_xgb.maximize(init_points = 20, n_iter = 100)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.648   [0m | [0m 0.7744  [0m | [0m 3.576   [0m | [0m 0.9043  [0m | [0m 6.814   [0m | [0m 221.5   [0m | [0m 0.8229  [0m |
| [0m 2       [0m | [0m-2.293   [0m | [0m 0.7188  [0m | [0m 4.459   [0m | [0m 1.446   [0m | [0m 5.684   [0m | [0m 407.0   [0m | [0m 0.7644  [0m |
| [0m 3       [0m | [0m-1.71    [0m | [0m 0.784   [0m | [0m 4.628   [0m | [0m 0.107   [0m | [0m 3.61    [0m | [0m 18.19   [0m | [0m 0.9163  [0m |
| [0m 4       [0m | [0m-2.044   [0m | [0m 0.8891  [0m | [0m 4.35    [0m | [0m 1.468   [0m | [0m 8.594   [0m | [0m 240.6   [0m | [0m 0.8903  [0m |
| [95m 5       [0m | [95m-1.31    [0m | [95m 0.5591  [0m | [95m 3.2     [0m | [95m 0.2155  [0m | [95m 9.613   [0m | [95m 271.0   [0m | [95m 0

| [0m 51      [0m | [0m-1.26    [0m | [0m 0.7185  [0m | [0m 3.016   [0m | [0m 0.2505  [0m | [0m 4.798   [0m | [0m 16.01   [0m | [0m 0.5241  [0m |
| [0m 52      [0m | [0m-1.7     [0m | [0m 0.9365  [0m | [0m 3.15    [0m | [0m 1.296   [0m | [0m 5.408   [0m | [0m 15.13   [0m | [0m 0.8442  [0m |
| [0m 53      [0m | [0m-1.742   [0m | [0m 0.7452  [0m | [0m 1.865   [0m | [0m 1.001   [0m | [0m 4.416   [0m | [0m 15.74   [0m | [0m 0.5413  [0m |
| [0m 54      [0m | [0m-1.422   [0m | [0m 0.794   [0m | [0m 3.354   [0m | [0m 0.9427  [0m | [0m 3.079   [0m | [0m 16.19   [0m | [0m 0.7554  [0m |
| [0m 55      [0m | [0m-1.304   [0m | [0m 0.9475  [0m | [0m 3.622   [0m | [0m 0.6245  [0m | [0m 3.722   [0m | [0m 14.89   [0m | [0m 0.9407  [0m |
| [0m 56      [0m | [0m-2.598   [0m | [0m 0.5044  [0m | [0m 2.422   [0m | [0m 1.446   [0m | [0m 3.804   [0m | [0m 14.01   [0m | [0m 0.5134  [0m |
| [0m 57      [0m | [0m-9

| [0m 102     [0m | [0m-1.28    [0m | [0m 0.7087  [0m | [0m 3.525   [0m | [0m 0.5199  [0m | [0m 5.296   [0m | [0m 15.68   [0m | [0m 0.7406  [0m |
| [0m 103     [0m | [0m-8.764   [0m | [0m 1.0     [0m | [0m 2.797   [0m | [0m 0.0005  [0m | [0m 7.408   [0m | [0m 153.8   [0m | [0m 0.5     [0m |
| [0m 104     [0m | [0m-1.365   [0m | [0m 0.7161  [0m | [0m 0.4881  [0m | [0m 0.2473  [0m | [0m 9.043   [0m | [0m 232.1   [0m | [0m 0.6109  [0m |
| [0m 105     [0m | [0m-1.34    [0m | [0m 0.6037  [0m | [0m 2.671   [0m | [0m 0.4869  [0m | [0m 5.69    [0m | [0m 17.11   [0m | [0m 0.677   [0m |
| [0m 106     [0m | [0m-1.361   [0m | [0m 0.809   [0m | [0m 3.48    [0m | [0m 0.1623  [0m | [0m 6.032   [0m | [0m 16.41   [0m | [0m 0.8732  [0m |
| [0m 107     [0m | [0m-1.295   [0m | [0m 0.9444  [0m | [0m 2.809   [0m | [0m 0.4337  [0m | [0m 6.95    [0m | [0m 16.51   [0m | [0m 0.9337  [0m |
| [0m 108     [0m | [0m-1

In [982]:
BO_xgb.max

{'target': -1.228418531606232,
 'params': {'colsample_bytree': 0.6556101182042811,
  'gamma': 2.443105577670689,
  'learning_rate': 0.16074351964042566,
  'max_depth': 9.261606149262697,
  'n_estimators': 154.11290847057532,
  'subsample': 0.9844995581609928}}

In [983]:
params = BO_xgb.max['params']
params['max_depth'] = int(round(params['max_depth']))
params['n_estimators'] = int(round(params['n_estimators']))

In [984]:
xgb = XGBRegressor(random_state = 123, **params)
xgb.fit(X, y)
pred_xgb = np.expm1(xgb.predict(target))

In [985]:
submission['box_off_num'] = pred_xgb

In [986]:
submission.sort_values(by = 'box_off_num', ascending = False)

Unnamed: 0,title,box_off_num
178,명량,4.696000e+06
106,댄싱퀸,4.276029e+06
135,용의자,3.878530e+06
142,박수건달,3.751679e+06
108,신세계,3.634373e+06
...,...,...
176,내 연애의 기억,7.078725e+01
130,댄서김의 은밀한 교수법,6.902303e+01
173,옹녀뎐,6.610818e+01
40,량강도 아이들,5.440986e+01


In [992]:
submission.to_csv("basic1010.csv", index = False)

****
##  Blending

In [919]:
submission.box_off_num = (pred_xgb * pred_rf * pred_gbm) ** .3333

In [920]:
submission.sort_values(by = 'box_off_num', ascending = False)[:10]

Unnamed: 0,title,box_off_num
178,명량,7921406.0
106,댄싱퀸,5983879.0
158,우는 남자,5541416.0
179,군도: 민란의 시대,4159172.0
229,베테랑,4064107.0
3,의형제,3738146.0
70,글러브,3504876.0
53,퀵,3263172.0
48,도가니,3181761.0
77,남영동1985,2553446.0


In [921]:
submission.to_csv("basic1010.csv", index = False)

In [215]:
vot = VotingRegressor(estimators = [('lgbm', lgbm), ('gbm', gbm), ('rf',rf), (], weights = [.2, .5, .3])
cv_vot = cross_val_score(vot, X, y, cv = KFold(10, shuffle = True, random_state = 100), n_jobs = -1, scoring = 'neg_mean_squared_error')
cv_vot.mean()

-2.6376215964161442

In [216]:
pred_vot = np.expm1(vot.predict(target))

NotFittedError: This VotingRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
submission['box_off_num'] = pred_vot

In [None]:
submission.sort_values(by = 'box_off_num', ascending = False)[:10]

In [313]:
submission.to_csv("basic1010.csv", index = False)

In [324]:
submission['box_off_num'] = 0.5 * pred_gbm + 0.3 * pred_rf + 0.2 * pred_lgbm

In [325]:
submission.sort_values(by = 'box_off_num', ascending = False)[:10]

Unnamed: 0,title,box_off_num
178,명량,3791475.0
179,군도: 민란의 시대,3554449.0
229,베테랑,3424879.0
135,용의자,3174623.0
227,미쓰 와이프,2756855.0
106,댄싱퀸,2751606.0
142,박수건달,2732386.0
15,포화 속으로,2381330.0
108,신세계,2362684.0
166,방황하는 칼날,2309629.0


In [326]:
submission.to_csv("basic1010.csv", index = False)