In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

train = pd.read_csv('movies_train.csv')
test = pd.read_csv('movies_test.csv')

# 데이터 전처리

In [2]:
# 결측치가 많은 데이터 제거
train = train.drop(['dir_prev_bfnum'],axis = 1)
test =  test.drop(['dir_prev_bfnum'],axis = 1)

In [3]:
# 감독명 : 너무 다양해서 제거
train = train.drop(['director'],axis = 1)
test = test.drop(['director'],axis = 1)

In [4]:
# 제목 : 의미가 없기 때문에 제거
train = train.drop(['title'],axis= 1)
test = test.drop(['title'],axis= 1)

In [5]:
# 상위 5개의 배급사를 제외하고 '기타'로처리
distributor_list = train.distributor.value_counts()[:5]
def func(distributor):
    if distributor in distributor_list:
        return distributor
    else:
        return '기타'

train['distributor'] = train['distributor'].apply(lambda x : func(x))
test['distributor'] = test['distributor'].apply(lambda x : func(x))

In [6]:
# 개봉일을 바탕으로 년,월 변수 생성
train['년'] = train['release_time'].apply(lambda x: int(x[:4]))
train['월'] = train['release_time'].apply(lambda x: int(x[5:7]))
train =  train.drop(['release_time'],axis = 1)

test['년'] = test['release_time'].apply(lambda x: int(x[:4]))
test['월'] = test['release_time'].apply(lambda x: int(x[5:7]))
test =  test.drop(['release_time'],axis = 1)

In [7]:
# 원핫 인코딩
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# 모델 생성 및 예측(과제부분)

In [8]:
train_x = train.drop(['box_off_num'],axis= 1)
train_y = train['box_off_num']

In [9]:
from xgboost import XGBRFRegressor as xg
from bayes_opt import BayesianOptimization as bo

def xgb(max_depth, gamma, min_child_weight):
    sample_model = xg(max_depth = int(max_depth),
                      gamma = gamma,
                      min_child_weight = min_child_weight)
    sample_model.fit(train_x, train_y)
    
    return sample_model.score(train_x, train_y)



epsilon = 0.1 ** 10
float_range = (epsilon, 1 - epsilon)
int_range = (1, 20000)

pbounds = {'max_depth': int_range,
           'gamma': float_range,
           'min_child_weight': int_range
          }

optimizer = bo(f=xgb, pbounds=pbounds)
optimizer.maximize(n_iter=100)

|   iter    |  target   |   gamma   | max_depth | min_ch... |
-------------------------------------------------------------
| [0m 1       [0m | [0m-1.17e-05[0m | [0m 0.6101  [0m | [0m 2.133e+0[0m | [0m 283.4   [0m |
| [0m 2       [0m | [0m-0.1503  [0m | [0m 0.6103  [0m | [0m 1.384e+0[0m | [0m 1.427e+0[0m |
| [0m 3       [0m | [0m-0.1503  [0m | [0m 0.9467  [0m | [0m 1.384e+0[0m | [0m 1.275e+0[0m |
| [0m 4       [0m | [0m-0.1503  [0m | [0m 0.6293  [0m | [0m 6.064e+0[0m | [0m 1.536e+0[0m |
| [0m 5       [0m | [0m-0.1503  [0m | [0m 0.8619  [0m | [0m 6.131e+0[0m | [0m 1.918e+0[0m |
| [0m 6       [0m | [0m-0.1503  [0m | [0m 0.9471  [0m | [0m 1.208e+0[0m | [0m 3.64e+03[0m |
| [95m 7       [0m | [95m 0.1823  [0m | [95m 0.7147  [0m | [95m 2.038e+0[0m | [95m 201.7   [0m |
| [0m 8       [0m | [0m-0.1503  [0m | [0m 0.3077  [0m | [0m 1.006e+0[0m | [0m 1.055e+0[0m |
| [95m 9       [0m | [95m 0.2031  [0m | [95m 0.900

| [0m 80      [0m | [0m 0.1462  [0m | [0m 0.3147  [0m | [0m 28.69   [0m | [0m 230.7   [0m |
| [0m 81      [0m | [0m 0.2954  [0m | [0m 0.3521  [0m | [0m 2.193e+0[0m | [0m 101.5   [0m |
| [0m 82      [0m | [0m 0.2492  [0m | [0m 0.2847  [0m | [0m 1.119e+0[0m | [0m 144.1   [0m |
| [0m 83      [0m | [0m 0.2512  [0m | [0m 0.2411  [0m | [0m 2.881e+0[0m | [0m 142.2   [0m |
| [0m 84      [0m | [0m 0.6183  [0m | [0m 0.1145  [0m | [0m 1.155e+0[0m | [0m 6.951   [0m |
| [0m 85      [0m | [0m 0.2322  [0m | [0m 0.158   [0m | [0m 287.7   [0m | [0m 158.6   [0m |
| [0m 86      [0m | [0m 0.5912  [0m | [0m 0.2413  [0m | [0m 1.442e+0[0m | [0m 7.73    [0m |
| [0m 87      [0m | [0m 0.3127  [0m | [0m 0.5894  [0m | [0m 1.402e+0[0m | [0m 75.32   [0m |
| [0m 88      [0m | [0m 0.3989  [0m | [0m 0.3595  [0m | [0m 1.478e+0[0m | [0m 40.91   [0m |
| [0m 89      [0m | [0m 0.3079  [0m | [0m 0.5274  [0m | [0m 1.206e+0[0m | 

In [10]:
from xgboost import XGBRFRegressor as xg

model=xg(gamma=0.8183,  max_depth=20, min_child_weight=1.7)
model.fit(train_x,train_y)

XGBRFRegressor(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bytree=1,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0.8183, gpu_id=-1,
               grow_policy='depthwise', importance_type=None,
               interaction_constraints='', max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=20, max_leaves=0,
               min_child_weight=1.7, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=0, num_parallel_tree=100,
               objective='reg:squarederror', predictor='auto', random_state=0,
               reg_alpha=0, sampling_method='uniform', scale_pos_weight=1, ...)

In [11]:
pred = model.predict(test)

# 제출

In [12]:
submission = pd.read_csv('submission.csv')
submission

Unnamed: 0,title,box_off_num
0,용서는 없다,2.093115e+06
1,아빠가 여자를 좋아해,1.300433e+06
2,하모니,1.144886e+06
3,의형제,1.967740e+06
4,평행 이론,1.057309e+06
...,...,...
238,해에게서 소년에게,2.512371e+04
239,울보 권투부,5.556886e+03
240,어떤살인,2.737843e+05
241,말하지 못한 비밀,9.288580e+03


In [13]:
submission['box_off_num'] = pred

In [14]:
submission

Unnamed: 0,title,box_off_num
0,용서는 없다,2.093115e+06
1,아빠가 여자를 좋아해,1.300433e+06
2,하모니,1.144886e+06
3,의형제,1.967740e+06
4,평행 이론,1.057309e+06
...,...,...
238,해에게서 소년에게,2.512371e+04
239,울보 권투부,5.556886e+03
240,어떤살인,2.737843e+05
241,말하지 못한 비밀,9.288580e+03


In [15]:
submission.to_csv('submission.csv',index = False)

# 제출 점수(과제부분)

https://dacon.io/competitions/open/235536/overview/description

![image.png](attachment:image.png)