In [57]:
import numpy as np
import pandas as pd

# install cmake, libomp
from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from matplotlib import pyplot

In [2]:
# Get data-set
main_df = pd.read_csv('./movie_clean_bc.csv', encoding='cp949')

In [40]:
#main_df.describe()
#main_df.info()
#main_df.shape[0]
#main_df['15세이상관람가'].unique()
df = main_df 
drop_flag = False

In [41]:
# 상영 전 평론가, 관객의 점수는 지표로 사용할 수 없다. (Drop)
if drop_flag == False:
    df = df.drop(['비평가 점수', '관객 평가(네티즌평가)'], axis=1)
    drop_flag = True

In [42]:
y = [idx for idx in range(0, df.shape[0])]
# Test-Set 분리
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.5, random_state=10)
x_train.shape, x_test.shape

((1644, 34), (1644, 34))

In [50]:
y = np.log1p(x_train['총관객수'])
y.head(), y.shape

(414     12.825845
 3091    10.031661
 2459    10.045768
 1911     9.775654
 2998    13.007401
 Name: 총관객수, dtype: float64,
 (1644,))

In [51]:
y_vali = np.log1p(x_test['총관객수'])

In [52]:
x = x_train.drop('총관객수', axis=1)
x_vali = x_test.drop('총관객수', axis=1)
x_vali.head(), x_vali.shape

(      계절성여부  상영시간(분)  스크린수  전체관람가  12세이상관람가  15세이상관람가      배우파워1      배우파워2  \
 413       0      110   299      0         0         1  2199359.0  1066765.0   
 290       0      103   182      0         0         1   520753.0        1.0   
 860       1       95   167      0         1         0        1.0        1.0   
 2952      0      106   143      0         0         1  2026574.0   137340.0   
 1835      0      124   356      1         0         0   243094.0   471310.0   
 
         감독파워  다양성(독립)영화  ...  범죄  미스터리  뮤지컬  가족  서부극(웨스턴)  공연  성인물(에로)  기타  \
 413    10036          0  ...   0     0    0   0         0   0        0   0   
 290   101879          0  ...   0     0    0   0         0   0        0   0   
 860        1          1  ...   0     0    0   0         0   0        0   0   
 2952   20413          1  ...   0     0    0   0         0   0        0   0   
 1835  742475          0  ...   0     0    0   0         0   0        0   0   
 
       다국적  배급사파워1  
 413     1      29  


In [53]:
# K-fold로 10개의 generator 생성
kf = KFold(n_splits =  10, shuffle = True, random_state = 96)
# Make Model
xgb = XGBRegressor(random_state = 256)

In [60]:
rmse_list = []
xgb_pred = np.zeros((x_train.shape[0]))

# K(10)-fold validation Loop
for tr_idx, val_idx in kf.split(x, y) :
    tr_x, tr_y = x.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = x.iloc[val_idx], y.iloc[val_idx]
    
    xgb.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in xgb.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in xgb.predict(x_vali)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    
    rmse_list.append(rmse)
    xgb_pred += (sub_pred / 10)

In [55]:
"MSE: {}".format(np.mean(rmse_list))

'MSE: 1419981.7626805957'

In [65]:
predictions = []
#np.expm1([0 if x < 0 else x for x in xgb.predict(x_vali)])
for x in xgb.predict(x_vali):
    predictions.append(np.expm1(x))

#accuracy_score(y_test, predictions)
predictions

[1068994.4,
 408992.06,
 29879.467,
 149897.56,
 664306.1,
 191400.84,
 13231.533,
 17105.395,
 432993.5,
 29225.559,
 285381.97,
 143404.45,
 195331.39,
 30800.35,
 86449.2,
 266832.97,
 383607.94,
 732806.75,
 142081.67,
 4011234.5,
 12759.907,
 118137.41,
 49603.566,
 25543.082,
 1918.4994,
 336629.38,
 93572.695,
 9971.326,
 14390.736,
 1618559.2,
 26584.16,
 2303543.5,
 59944.902,
 131671.44,
 1484602.1,
 585399.3,
 80174.18,
 5643.276,
 495610.8,
 284206.5,
 87068.766,
 23335.426,
 29791.174,
 397340.5,
 13784.703,
 21322.715,
 343370.0,
 1505402.0,
 42299.273,
 32911.836,
 22769.807,
 269398.6,
 11520.682,
 3400260.5,
 116635.516,
 1479815.9,
 223019.42,
 97893.234,
 115510.84,
 331114.1,
 634605.06,
 2543463.8,
 365246.53,
 276844.2,
 3440991.5,
 198467.03,
 656394.56,
 84664.4,
 1847.7896,
 15875.586,
 231919.92,
 50808.438,
 27010.645,
 1101767.2,
 21065.725,
 45654.656,
 30234.37,
 25875.262,
 218556.95,
 93915.46,
 361381.2,
 316173.06,
 538546.75,
 86620.04,
 920152.7,
 58

In [49]:
# feature 중요도 확인
#plot_importance(xgb)