In [1]:
import pandas as pd

In [2]:
# Read the training split; the relative path is resolved against the current working directory.
train = pd.read_csv(filepath_or_buffer="movies_train.csv")
# Print (rows, columns), then leave the first five rows as the cell's displayed result.
print(train.shape)
train.head(n=5)

(600, 12)


Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387


In [3]:
# Read the test split; the relative path is resolved against the current working directory.
test = pd.read_csv(filepath_or_buffer="movies_test.csv")
# Print (rows, columns), then leave the first five rows as the cell's displayed result.
print(test.shape)
test.head(n=5)

(243, 11)


Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,용서는 없다,시네마서비스,느와르,2010-01-07,125,청소년 관람불가,김형준,300529.0,2,304,3
1,아빠가 여자를 좋아해,(주)쇼박스,멜로/로맨스,2010-01-14,113,12세 관람가,이광재,342700.2,4,275,3
2,하모니,CJ 엔터테인먼트,드라마,2010-01-28,115,12세 관람가,강대규,4206611.0,3,419,7
3,의형제,(주)쇼박스,액션,2010-02-04,116,15세 관람가,장훈,691342.0,2,408,2
4,평행 이론,CJ 엔터테인먼트,공포,2010-02-18,110,15세 관람가,권호영,31738.0,1,380,1


## 데이터 전처리 (베이스 코드)

In [4]:
# Remove the column that has many missing values (from both splits).
train = train.drop(columns=['dir_prev_bfnum'])
test = test.drop(columns=['dir_prev_bfnum'])

In [5]:
# Director name: removed because it takes too many distinct values.
train = train.drop(columns=['director'])
test = test.drop(columns=['director'])

In [6]:
# Title: removed because it is not considered a meaningful feature.
train = train.drop(columns=['title'])
test = test.drop(columns=['title'])

In [7]:
# Frequency of each distributor in the training split (most frequent first).
train['distributor'].value_counts()

CJ 엔터테인먼트        54
롯데엔터테인먼트         52
(주)NEW           30
(주)마운틴픽쳐스        29
(주)쇼박스           26
                 ..
OAL(올)            1
(주)에이원 엔터테인먼트     1
(주)콘텐츠 윙          1
위더스필름             1
퍼스트런              1
Name: distributor, Length: 169, dtype: int64

In [8]:
# Keep the five most frequent training distributors; every other distributor
# is collapsed into the single label '기타' ("other").
distributor_list = train.distributor.value_counts()[:5]

# `x in some_series` tests the Series *index*, not its values, so the original
# membership test only worked because value_counts() happens to put the names
# in the index. Make that explicit with a set of the index labels.
top_distributors = frozenset(distributor_list.index)


def func(distributor):
    """Return `distributor` if it is one of the top-5 training distributors, else '기타'.

    Parameters
    ----------
    distributor : object
        A single value from the `distributor` column.

    Returns
    -------
    object
        The input unchanged when it is a top-5 label, otherwise the string '기타'.
    """
    return distributor if distributor in top_distributors else '기타'


# The same training-derived label set is applied to both splits.
train['distributor'] = train['distributor'].apply(func)
test['distributor'] = test['distributor'].apply(func)

In [9]:
# Build year ('년') and month ('월') features from the release-date string
# (sample rows look like '2012-11-22'), then discard the raw column.
def _release_year(stamp):
    """Return the first four characters of `stamp` converted to int."""
    return int(stamp[:4])


def _release_month(stamp):
    """Return characters at positions 5 and 6 of `stamp` converted to int."""
    return int(stamp[5:7])


train['년'] = train['release_time'].apply(_release_year)
train['월'] = train['release_time'].apply(_release_month)
train = train.drop(columns=['release_time'])

test['년'] = test['release_time'].apply(_release_year)
test['월'] = test['release_time'].apply(_release_month)
test = test.drop(columns=['release_time'])

In [10]:
# One-hot encode the remaining categorical columns of each split.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# The two splits are encoded independently, so a category that occurs in only
# one of them produces a dummy column the other lacks (and column order can
# differ). Align `test` to the training feature columns: columns missing from
# `test` are added as all-zero, columns unseen in training are dropped, and
# the order matches what the models are fitted on.
feature_columns = train.columns.drop('box_off_num')
test = test.reindex(columns=feature_columns, fill_value=0)

## 모델 정의 및 학습

In [11]:
# Target: the box-office column; features: every other column of `train`.
y_train = train['box_off_num']
X_train = train.drop(columns=['box_off_num'])

## 회귀 트리 모델 학습/예측/평가

### 1. RandomForest


In [12]:
from sklearn.ensemble import RandomForestRegressor

# Random forests bootstrap rows and sample features, so without a fixed seed
# every run produces different predictions. Pin random_state so that
# Restart & Run All reproduces the same rf_pred.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(X_train, y_train)
# Predictions for the test split, one value per test row.
rf_pred = rf.predict(test)

### 2. LightGBM

In [13]:
from lightgbm import LGBMRegressor

# Build a 1000-estimator LightGBM regressor and fit it in one expression
# (fit() hands back the fitted estimator), then predict on the test split.
lgbm = LGBMRegressor(n_estimators=1000).fit(X_train, y_train)
lgbm_pred = lgbm.predict(test)

### 3. XGBoost

In [14]:
from xgboost import XGBRegressor

# Build a 1000-estimator XGBoost regressor and fit it in one expression
# (fit() hands back the fitted estimator), then predict on the test split.
xgb = XGBRegressor(n_estimators=1000).fit(X_train, y_train)
xgb_pred = xgb.predict(test)



### 4. GradientBoosting

In [15]:
from sklearn.ensemble import GradientBoostingRegressor

# The underlying trees permute features at every split, so ties between
# equally good splits can resolve differently from run to run. Pin
# random_state so that Restart & Run All reproduces the same gbm_pred.
gbm = GradientBoostingRegressor(n_estimators=1000, random_state=42)
gbm.fit(X_train, y_train)
# Predictions for the test split, one value per test row.
gbm_pred = gbm.predict(test)

## 학습된 모델로 예측 데이터 생성

In [16]:
# Equal-weight ensemble: element-wise mean of the four models' predictions,
# accumulated in the order RandomForest, LightGBM, XGBoost, GradientBoosting.
model_preds = [rf_pred, lgbm_pred, xgb_pred, gbm_pred]
pred = sum(model_preds) / len(model_preds)

In [17]:
# Replace every negative ensemble prediction with 0 in place, using a boolean
# mask instead of a Python-level loop over the array.
# NOTE(review): presumably box_off_num is a count that cannot be negative — confirm.
pred[pred < 0] = 0

## 제출 파일 생성

In [18]:
# Load the submission template (relative path, resolved against the current
# working directory) and show it as the cell result.
submission = pd.read_csv(filepath_or_buffer='submission.csv')
submission

Unnamed: 0,title,box_off_num
0,용서는 없다,0
1,아빠가 여자를 좋아해,0
2,하모니,0
3,의형제,0
4,평행 이론,0
...,...,...
238,해에게서 소년에게,0
239,울보 권투부,0
240,어떤살인,0
241,말하지 못한 비밀,0


In [19]:
# Overwrite the template's box_off_num column with the ensemble predictions
# (values are matched to rows by position).
submission = submission.assign(box_off_num=pred)

In [20]:
# Show the submission frame after box_off_num has been filled in.
submission

Unnamed: 0,title,box_off_num
0,용서는 없다,1.694444e+06
1,아빠가 여자를 좋아해,1.602516e+06
2,하모니,1.050598e+06
3,의형제,2.173298e+06
4,평행 이론,7.304331e+05
...,...,...
238,해에게서 소년에게,2.523667e+05
239,울보 권투부,1.667035e+06
240,어떤살인,2.072848e+05
241,말하지 못한 비밀,0.000000e+00


In [21]:
# Write the filled-in submission to disk; index=False keeps the row index out of the file.
submission.to_csv(path_or_buf='submission_pred.csv', index=False)