# 데이터 탐색

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, PowerTransformer, RobustScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

cv_kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_str_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [2]:
train = pd.read_csv("movies_train.csv")
test = pd.read_csv("movies_test.csv")
submission = pd.read_csv("submission.csv")

train.shape, test.shape, submission.shape

((600, 12), (243, 11), (243, 2))

In [3]:
display(train.head(3))
test.head(3)

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083


Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,용서는 없다,시네마서비스,느와르,2010-01-07,125,청소년 관람불가,김형준,300529.0,2,304,3
1,아빠가 여자를 좋아해,(주)쇼박스,멜로/로맨스,2010-01-14,113,12세 관람가,이광재,342700.2,4,275,3
2,하모니,CJ 엔터테인먼트,드라마,2010-01-28,115,12세 관람가,강대규,4206611.0,3,419,7


## 데이터 한국어 변형

In [4]:
train = train.rename(columns={"title":"제목", "distributor":"배급사", "genre":"장르", "release_time":"개봉날짜", "time":'상영시간', "screening_rat":"상영등급", "director": "감독",
                      "dir_prev_bfnum":"감독별_이전영화_평균관객수", "dir_prev_num" : "감독별_이전영화제작_참여횟수", "num_staff":"스탭수", "num_actor" :"배우수", "box_off_num" :"관객수"})
                      
test = test.rename(columns={"title":"제목", "distributor":"배급사", "genre":"장르", "release_time":"개봉날짜", "time":'상영시간', "screening_rat":"상영등급", "director": "감독",
                      "dir_prev_bfnum":"감독별_이전영화_평균관객수", "dir_prev_num" : "감독별_이전영화제작_참여횟수", "num_staff":"스탭수", "num_actor" :"배우수"})

target_ = train[["관객수"]]                   

## 요약본

In [5]:
def resumtable(df):
    summary = pd.DataFrame(df.dtypes).reset_index().rename(columns={"index":"features", 0:"dtypes"})
    summary["결측치수"] = df.isnull().sum().values
    summary["고윳값수"] = df.nunique().values
    summary["첫번째값"] = df.iloc[0].values
    summary["두번째값"] = df.iloc[1].values
    summary["세번째값"] = df.iloc[2].values
    
    return summary

In [6]:
resumtable(train)

Unnamed: 0,features,dtypes,결측치수,고윳값수,첫번째값,두번째값,세번째값
0,제목,object,0,600,개들의 전쟁,내부자들,은밀하게 위대하게
1,배급사,object,0,169,롯데엔터테인먼트,(주)쇼박스,(주)쇼박스
2,장르,object,0,12,액션,느와르,액션
3,개봉날짜,object,0,330,2012-11-22,2015-11-19,2013-06-05
4,상영시간,int64,0,86,96,130,123
5,상영등급,object,0,4,청소년 관람불가,청소년 관람불가,15세 관람가
6,감독,object,0,472,조병옥,우민호,장철수
7,감독별_이전영화_평균관객수,float64,330,245,,1161602.5,220775.25
8,감독별_이전영화제작_참여횟수,int64,0,6,0,2,4
9,스탭수,int64,0,296,91,387,343


In [7]:
resumtable(test)

Unnamed: 0,features,dtypes,결측치수,고윳값수,첫번째값,두번째값,세번째값
0,제목,object,0,243,용서는 없다,아빠가 여자를 좋아해,하모니
1,배급사,object,0,93,시네마서비스,(주)쇼박스,CJ 엔터테인먼트
2,장르,object,0,12,느와르,멜로/로맨스,드라마
3,개봉날짜,object,0,182,2010-01-07,2010-01-14,2010-01-28
4,상영시간,int64,0,70,125,113,115
5,상영등급,object,0,4,청소년 관람불가,12세 관람가,12세 관람가
6,감독,object,0,220,김형준,이광재,강대규
7,감독별_이전영화_평균관객수,float64,136,103,300529.0,342700.25,4206610.666667
8,감독별_이전영화제작_참여횟수,int64,0,7,2,4,3
9,스탭수,int64,0,165,304,275,419


## 데이트타임 변경 후 기존 개봉날짜 삭제

In [9]:
train

Unnamed: 0,제목,배급사,장르,개봉날짜,상영시간,상영등급,감독,감독별_이전영화_평균관객수,감독별_이전영화제작_참여횟수,스탭수,배우수,관객수
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.50,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.00,2,20,6,217866
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.00,1,251,2,483387
...,...,...,...,...,...,...,...,...,...,...,...,...
595,해무,(주)NEW,드라마,2014-08-13,111,청소년 관람불가,심성보,3833.00,1,510,7,1475091
596,파파로티,(주)쇼박스,드라마,2013-03-14,127,15세 관람가,윤종찬,496061.00,1,286,6,1716438
597,살인의 강,(주)마운틴픽쳐스,공포,2010-09-30,99,청소년 관람불가,김대현,,0,123,4,2475
598,악의 연대기,CJ 엔터테인먼트,느와르,2015-05-14,102,15세 관람가,백운학,,0,431,4,2192525


In [9]:
train["개봉날짜"] = pd.to_datetime(train["개봉날짜"])
test["개봉날짜"] = pd.to_datetime(test["개봉날짜"])

In [10]:
train["개봉연도"] = train["개봉날짜"].dt.year
test["개봉연도"] = test["개봉날짜"].dt.year

train["개봉월"] = train["개봉날짜"].dt.month
test["개봉월"] = test["개봉날짜"].dt.month

train["개봉일"] = train["개봉날짜"].dt.day
test["개봉일"] = test["개봉날짜"].dt.day

train = train.drop(columns="개봉날짜")
test = test.drop(columns="개봉날짜")

train = train.drop(columns="관객수")
train = pd.concat([train, target_], axis=1)

train.shape, test.shape

((600, 14), (243, 13))

## 중간 저장

In [272]:
# train.to_csv("train_common.csv", index=False)
# test.to_csv("test_common.csv", index=False)

# 결측치

In [231]:
train = train.fillna(0)
test = test.fillna(0)

train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

# 인코딩

In [232]:
train_ft = train.iloc[:,1:-1]
test_ft = test.iloc[:,1:]
target = train["관객수"]

train_ft.shape, test_ft.shape

((600, 12), (243, 12))

## 원핫 인코딩

In [233]:
cols = train_ft.select_dtypes("object").columns

onehot_enc = OneHotEncoder(handle_unknown="ignore")
train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
test_ft[onehot_enc.get_feature_names_out()] = onehot_enc.transform(test_ft[cols]).toarray()

train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)

train_ft.shape, test_ft.shape

  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()
  train_ft[onehot_enc.get_feature_names_out()] = onehot_enc.fit_transform(train_ft[cols]).toarray()


((600, 665), (243, 665))

# 스케일링

In [234]:
minmax_scaler = MinMaxScaler()
train_ft[train_ft.columns] = minmax_scaler.fit_transform(train_ft)
test_ft[test_ft.columns] = minmax_scaler.transform(test_ft)

# 모델링

In [235]:
rf_model = RandomForestRegressor(random_state=42)
cat_model = CatBoostRegressor(random_state=42)
lr_model = LinearRegression()
ridge_model = Ridge(random_state=42)
lasso_model = Lasso(random_state=42)
xgb_model = XGBRegressor(random_state=42)
lgbm_model = LGBMRegressor(random_state=42)
knn_model = KNeighborsRegressor()
gb_model = GradientBoostingRegressor(random_state=42)
hgb_model = HistGradientBoostingRegressor(random_state=42)

models = [rf_model, cat_model, lr_model, ridge_model, lasso_model, xgb_model, lgbm_model, knn_model, gb_model, hgb_model]

In [236]:
score = []
for model in models:
    scores = cross_val_score(model, train_ft, target, cv=cv_kfold, scoring="neg_root_mean_squared_error" ).mean()
    score.append(scores)

Learning rate set to 0.03646
0:	learn: 1904076.3314557	total: 835us	remaining: 834ms
1:	learn: 1878350.2592556	total: 1.62ms	remaining: 808ms
2:	learn: 1855055.6205935	total: 2.44ms	remaining: 811ms
3:	learn: 1838668.5752364	total: 3.16ms	remaining: 788ms
4:	learn: 1814471.0102655	total: 3.86ms	remaining: 768ms
5:	learn: 1794215.4531449	total: 4.58ms	remaining: 760ms
6:	learn: 1770223.8837173	total: 5.27ms	remaining: 748ms
7:	learn: 1752472.5448094	total: 6.02ms	remaining: 747ms
8:	learn: 1732709.2612712	total: 6.79ms	remaining: 747ms
9:	learn: 1714360.5568560	total: 7.53ms	remaining: 746ms
10:	learn: 1697529.3328090	total: 8.32ms	remaining: 748ms
11:	learn: 1677606.9294481	total: 9.11ms	remaining: 750ms
12:	learn: 1659139.4641030	total: 9.83ms	remaining: 746ms
13:	learn: 1642042.8173532	total: 10.6ms	remaining: 744ms
14:	learn: 1627327.2343256	total: 11.3ms	remaining: 745ms
15:	learn: 1609587.5234562	total: 12.1ms	remaining: 743ms
16:	learn: 1595461.0047511	total: 12.9ms	remaining: 74

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 388
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 24
[LightGBM] [Info] Start training from score 723519.229167
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 24
[LightGBM] [Info] Start training from score 760071.437500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 354
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 24
[LightGBM] [Info] Start tr

In [237]:
print(f"rf_model_score: {-score[0]}")
print(f"cat_model_score: {-score[1]}")
print(f"lr_model_score: {-score[2]}")
print(f"ridge_model_score: {-score[3]}")
print(f"lasso_model_score: {-score[4]}")
print(f"xgb_model_score: {-score[5]}")
print(f"lgbm_model_score: {-score[6]}")
print(f"knn_model_score: {-score[7]}")
print(f"gb_model_score: {-score[8]}")
print(f"hgb_model_score: {-score[9]}")

rf_model_score: 1438633.9101682378
cat_model_score: 1441422.446338678
lr_model_score: 2.1311535362938876e+19
ridge_model_score: 1448748.2063507983
lasso_model_score: 4730059.4427021835
xgb_model_score: 1599173.331531854
lgbm_model_score: 1514585.9688722752
knn_model_score: 1604857.3884541471
gb_model_score: 1437523.566079268
hgb_model_score: 1532937.9573408253


In [238]:
np.argsort([score[0], score[1], score[2], score[3], score[4], score[5], score[6], score[7], score[8], score[9]])[::-1]

array([8, 0, 1, 3, 6, 9, 5, 7, 4, 2])

In [260]:
lasso_model.fit(train_ft, target)

  model = cd_fast.enet_coordinate_descent(


In [261]:
pred = lasso_model.predict(test_ft)
submission["box_off_num"] = pred
submission.to_csv("1201_9.csv", index=False)