# 모델링 테스트
- 각 모델은 하이퍼파라미터를 전혀 튜닝하지 않은 바닐라 모델 사용
- 우선 롤모델(레퍼런스)에서 사용했던 4개의 회귀 모델부터 적용
- 데이터는 롤모델(레퍼런스)에서 사용된 csv 파일을 적절히 떼와서 사용

## 데이터 로드

In [1]:
import pandas as pd
dt = pd.read_csv("음원예측.csv")
dt.head()

Unnamed: 0,X1,artist,name,rank_g,title_song,week,runtime,top_freq,gg_score,nv_score,...,previous_ranking_200,previous_ranking_30,previous_ranking_40,previous_ranking_50,previous_ranking_60,previous_ranking_70,previous_ranking_80,previous_ranking_90,previous_ranking_nan,rank_g_pred
0,0,10cm,폰서트,90,1,2018-05-13,0.508876,0.240385,0.32,0.485515,...,0,0,0,0,0,0,0,1,0,90.00666
1,1,BLACKPINK,마지막처럼,80,1,2018-05-13,0.553254,0.317308,0.07,0.511548,...,0,0,0,0,0,1,0,0,0,73.23409
2,2,Camila Cabello,Havana,20,1,2018-05-13,0.565089,0.855769,0.33,0.871046,...,0,0,0,0,0,0,0,0,0,23.793497
3,3,Carla Bruni,Stand By Your Man,80,1,2018-05-13,0.408284,0.038462,0.41,0.857812,...,0,0,0,0,0,1,0,0,0,89.114334
4,4,DEAN,instagram,50,1,2018-05-13,0.680473,0.25,0.48,0.804124,...,0,1,0,0,0,0,0,0,0,35.190857


## 피처 데이터(X) 추출
#### 불필요한 Feature 컬럼 제거

In [2]:
dt_X = dt.drop(columns=["rank", "X1", "artist", "name", "rank_g", "week", "st_day", "rank_g_pred"])
dt_X

Unnamed: 0,title_song,runtime,top_freq,gg_score,nv_score,total_view,season_genre_score,pd_score,dc_total_numb,dc_mean_reccomend,...,previous_ranking_20,previous_ranking_200,previous_ranking_30,previous_ranking_40,previous_ranking_50,previous_ranking_60,previous_ranking_70,previous_ranking_80,previous_ranking_90,previous_ranking_nan
0,1,0.508876,0.240385,0.32,0.485515,0.793178,0.513550,0.774053,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,1,0.553254,0.317308,0.07,0.511548,0.847350,0.551930,0.680336,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,1,0.565089,0.855769,0.33,0.871046,0.812555,0.529333,0.000000,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,1,0.408284,0.038462,0.41,0.857812,0.000000,0.529333,0.819462,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
4,1,0.680473,0.250000,0.48,0.804124,0.787867,0.566807,0.000000,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17058,0,0.576923,0.192308,0.38,0.540225,0.000000,0.477572,0.480855,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
17059,0,0.606509,0.000000,0.27,0.487808,0.000000,0.566807,0.677504,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
17060,0,0.517751,0.000000,0.27,0.487808,0.000000,0.566807,0.677504,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
17061,0,0.627219,0.980769,0.27,0.487808,0.000000,0.566807,0.677504,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## 라벨 데이터(y) 추출
#### 라벨은 랭크(rank) 컬럼을 사용

In [3]:
dt_y = dt["rank"]
dt_y.head()

0    98
1    80
2    26
3    86
4    53
Name: rank, dtype: int64

## 학습 데이터, 테스트 데이터 분리

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dt_X, dt_y, test_size=0.2)

### 성능지표 RMSE값 비교용 데이터프레임 생성

In [5]:
import numpy as np
import pandas as pd

result = pd.DataFrame(columns=["Model", "train_RMSE", "test_RMSE"])

## 모델 학습
### 1. 단순 선형 회귀 모델(LinearRegression)

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

In [7]:
train_score = -1 * cross_val_score(lr, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(lr, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[441.62540636 432.91323689 468.41503549 443.72752404 454.36888736]
평균 학습 MSE :  448.2100180288461
평균 학습 RMSE :  21.17097111681101
[5.42185716e+02 4.94281391e+02 4.12654443e+02 4.91240647e+25
 3.99992777e+02]
평균 테스트 MSE :  9.824812948742963e+24
평균 테스트 RMSE :  3134455765957.3


In [8]:
model_result = pd.DataFrame({'Model':['Linear Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

  result = pd.concat([result, model_result])


### 2. 랜덤 포레스트 회귀 모델(LinearRegression)

In [9]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [10]:
train_score = -1 * cross_val_score(rf, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(rf, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[341.95819609 360.26166833 374.72898587 355.71178478 379.33073244]
평균 학습 MSE :  362.3982734997053
평균 학습 RMSE :  19.03676110843715
[416.82649219 445.9497561  353.16728238 349.42609376 333.40873843]
평균 테스트 MSE :  379.7556725731044
평균 테스트 RMSE :  19.487320815676647


In [11]:
model_result = pd.DataFrame({'Model':['Random Forest Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 3. CatBoost 모델

In [12]:
from catboost import CatBoostRegressor

cb = CatBoostRegressor()
cb.fit(X_train, y_train)

Learning rate set to 0.061877
0:	learn: 54.8388969	total: 139ms	remaining: 2m 18s
1:	learn: 52.9862855	total: 143ms	remaining: 1m 11s
2:	learn: 51.3862958	total: 147ms	remaining: 49s
3:	learn: 49.9841215	total: 154ms	remaining: 38.2s
4:	learn: 48.4493855	total: 158ms	remaining: 31.4s
5:	learn: 47.1110032	total: 162ms	remaining: 26.9s
6:	learn: 45.9291389	total: 169ms	remaining: 23.9s
7:	learn: 44.8061490	total: 174ms	remaining: 21.6s
8:	learn: 43.6090334	total: 179ms	remaining: 19.7s
9:	learn: 42.5417848	total: 185ms	remaining: 18.3s
10:	learn: 41.4892591	total: 190ms	remaining: 17.1s
11:	learn: 40.4780539	total: 196ms	remaining: 16.2s
12:	learn: 39.6227183	total: 202ms	remaining: 15.3s
13:	learn: 38.6996815	total: 206ms	remaining: 14.5s
14:	learn: 38.0254779	total: 212ms	remaining: 13.9s
15:	learn: 37.1950410	total: 217ms	remaining: 13.4s
16:	learn: 36.4323606	total: 222ms	remaining: 12.8s
17:	learn: 35.7401065	total: 227ms	remaining: 12.4s
18:	learn: 35.1284958	total: 233ms	remaining

<catboost.core.CatBoostRegressor at 0x2216fea2120>

In [13]:
train_score = -1 * cross_val_score(cb, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(cb, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

Learning rate set to 0.059734
0:	learn: 54.8398903	total: 4.47ms	remaining: 4.46s
1:	learn: 53.0524049	total: 9.36ms	remaining: 4.67s
2:	learn: 51.5049151	total: 14.3ms	remaining: 4.74s
3:	learn: 50.1556728	total: 19ms	remaining: 4.74s
4:	learn: 48.6600034	total: 23.7ms	remaining: 4.72s
5:	learn: 47.3030202	total: 29.2ms	remaining: 4.84s
6:	learn: 46.1421795	total: 33.5ms	remaining: 4.75s
7:	learn: 45.1295304	total: 37.6ms	remaining: 4.66s
8:	learn: 43.9475783	total: 42.4ms	remaining: 4.67s
9:	learn: 42.9555655	total: 46.5ms	remaining: 4.6s
10:	learn: 41.9354363	total: 50.4ms	remaining: 4.53s
11:	learn: 41.0334702	total: 54.6ms	remaining: 4.49s
12:	learn: 40.1245463	total: 59.5ms	remaining: 4.52s
13:	learn: 39.1892498	total: 63.8ms	remaining: 4.49s
14:	learn: 38.5102184	total: 67.6ms	remaining: 4.44s
15:	learn: 37.6764063	total: 71.6ms	remaining: 4.41s
16:	learn: 36.9286225	total: 76.6ms	remaining: 4.43s
17:	learn: 36.2966534	total: 80.7ms	remaining: 4.4s
18:	learn: 35.5766553	total: 8

In [14]:
model_result = pd.DataFrame({'Model':['CatBoost Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 4. XGBoost 모델

In [15]:
from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(X_train, y_train)

In [16]:
train_score = -1 * cross_val_score(xgb, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(xgb, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[342.30718878 361.52451065 357.39391127 369.9077974  382.92613516]
평균 학습 MSE :  362.8119086504804
평균 학습 RMSE :  19.047622125884384
[396.37777586 471.75997511 376.71019751 423.90235499 375.19033696]
평균 테스트 MSE :  408.7881280844723
평균 테스트 RMSE :  20.21850954161736


In [17]:
model_result = pd.DataFrame({'Model':['XGBoost Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

## 모델 평가지표(RMSE) 비교

In [18]:
result.set_index('Model', inplace=True)
result

Unnamed: 0_level_0,train_RMSE,test_RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Regression,21.170971,3134456000000.0
Random Forest Regression,19.036761,19.48732
CatBoost Regression,18.256948,18.73735
XGBoost Regression,19.047622,20.21851


#### 테스트셋 RMSE 저 머꼬??