# 모델링 테스트
- 각 모델은 하이퍼파라미터를 전혀 튜닝하지 않은 바닐라 모델 사용
- 우선 롤모델(레퍼런스)에서 사용했던 4개의 회귀 모델부터 적용
- 데이터는 롤모델(레퍼런스)에서 사용된 csv 파일을 적절히 떼와서 사용

## 데이터 로드

In [1]:
import pandas as pd
dt = pd.read_csv("음원예측.csv")
dt.head()

Unnamed: 0,X1,artist,name,rank_g,title_song,week,runtime,top_freq,gg_score,nv_score,...,previous_ranking_200,previous_ranking_30,previous_ranking_40,previous_ranking_50,previous_ranking_60,previous_ranking_70,previous_ranking_80,previous_ranking_90,previous_ranking_nan,rank_g_pred
0,0,10cm,폰서트,90,1,2018-05-13,0.508876,0.240385,0.32,0.485515,...,0,0,0,0,0,0,0,1,0,90.00666
1,1,BLACKPINK,마지막처럼,80,1,2018-05-13,0.553254,0.317308,0.07,0.511548,...,0,0,0,0,0,1,0,0,0,73.23409
2,2,Camila Cabello,Havana,20,1,2018-05-13,0.565089,0.855769,0.33,0.871046,...,0,0,0,0,0,0,0,0,0,23.793497
3,3,Carla Bruni,Stand By Your Man,80,1,2018-05-13,0.408284,0.038462,0.41,0.857812,...,0,0,0,0,0,1,0,0,0,89.114334
4,4,DEAN,instagram,50,1,2018-05-13,0.680473,0.25,0.48,0.804124,...,0,1,0,0,0,0,0,0,0,35.190857


## 피처 데이터(X) 추출
#### 불필요한 Feature 컬럼 제거

In [2]:
dt_X = dt.drop(columns=["rank", "X1", "artist", "name", "rank_g", "week", "st_day", "rank_g_pred"])
dt_X

Unnamed: 0,title_song,runtime,top_freq,gg_score,nv_score,total_view,season_genre_score,pd_score,dc_total_numb,dc_mean_reccomend,...,previous_ranking_20,previous_ranking_200,previous_ranking_30,previous_ranking_40,previous_ranking_50,previous_ranking_60,previous_ranking_70,previous_ranking_80,previous_ranking_90,previous_ranking_nan
0,1,0.508876,0.240385,0.32,0.485515,0.793178,0.513550,0.774053,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,1,0.553254,0.317308,0.07,0.511548,0.847350,0.551930,0.680336,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,1,0.565089,0.855769,0.33,0.871046,0.812555,0.529333,0.000000,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,1,0.408284,0.038462,0.41,0.857812,0.000000,0.529333,0.819462,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
4,1,0.680473,0.250000,0.48,0.804124,0.787867,0.566807,0.000000,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17058,0,0.576923,0.192308,0.38,0.540225,0.000000,0.477572,0.480855,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
17059,0,0.606509,0.000000,0.27,0.487808,0.000000,0.566807,0.677504,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
17060,0,0.517751,0.000000,0.27,0.487808,0.000000,0.566807,0.677504,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
17061,0,0.627219,0.980769,0.27,0.487808,0.000000,0.566807,0.677504,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## 라벨 데이터(y) 추출
#### 라벨은 랭크(rank) 컬럼을 사용

In [3]:
dt_y = dt["rank"]
dt_y.head()

0    98
1    80
2    26
3    86
4    53
Name: rank, dtype: int64

## 학습 데이터, 테스트 데이터 분리

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dt_X, dt_y, test_size=0.2, random_state=36)

### 성능지표 RMSE값 비교용 데이터프레임 생성

In [6]:
import numpy as np
import pandas as pd

result = pd.DataFrame(columns=["Model", "train_RMSE", "test_RMSE"])

## 모델 학습
### 롤모델(레퍼런스)이 썼던 모델 4가지
### 1. 단순 선형 회귀 모델(LinearRegression)

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

In [8]:
train_score = -1 * cross_val_score(lr, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(lr, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[435.67873741 458.10095582 378.84697516 483.35509959 507.85141369]
평균 학습 MSE :  452.7666363324176
평균 학습 RMSE :  21.278313756790446
[5.06616598e+02 4.40205998e+02 3.59396486e+24 4.22191899e+02
 3.82489508e+02]
평균 테스트 MSE :  7.18792972958195e+23
평균 테스트 RMSE :  847816591579.9213


In [10]:
model_result = pd.DataFrame({'Model':['Linear Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = model_result

### 2. 랜덤 포레스트 회귀 모델(LinearRegression)

In [11]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [12]:
train_score = -1 * cross_val_score(rf, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(rf, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[339.83071198 379.81219577 308.98981423 400.96237214 416.55425504]
평균 학습 MSE :  369.229869830011
평균 학습 RMSE :  19.215355053446476
[435.3984918  404.09277482 354.86967584 334.03644032 334.59762683]
평균 테스트 MSE :  372.5990019230323
평균 테스트 RMSE :  19.302823677457976


In [13]:
model_result = pd.DataFrame({'Model':['Random Forest Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 3. CatBoost 모델

In [14]:
from catboost import CatBoostRegressor

cb = CatBoostRegressor()
cb.fit(X_train, y_train)

Learning rate set to 0.061877
0:	learn: 55.0619187	total: 142ms	remaining: 2m 21s
1:	learn: 53.2167490	total: 145ms	remaining: 1m 12s
2:	learn: 51.5461332	total: 148ms	remaining: 49.3s
3:	learn: 50.1519691	total: 153ms	remaining: 38s
4:	learn: 48.6152028	total: 156ms	remaining: 31.1s
5:	learn: 47.1940467	total: 160ms	remaining: 26.4s
6:	learn: 46.0053212	total: 163ms	remaining: 23.2s
7:	learn: 44.9524113	total: 168ms	remaining: 20.8s
8:	learn: 43.7478701	total: 172ms	remaining: 19s
9:	learn: 42.6187131	total: 176ms	remaining: 17.5s
10:	learn: 41.6495894	total: 181ms	remaining: 16.2s
11:	learn: 40.8133627	total: 186ms	remaining: 15.3s
12:	learn: 39.9205655	total: 190ms	remaining: 14.5s
13:	learn: 38.9418338	total: 195ms	remaining: 13.7s
14:	learn: 38.1674043	total: 200ms	remaining: 13.2s
15:	learn: 37.2871363	total: 205ms	remaining: 12.6s
16:	learn: 36.6778067	total: 210ms	remaining: 12.1s
17:	learn: 35.9955160	total: 215ms	remaining: 11.7s
18:	learn: 35.3700702	total: 220ms	remaining: 

<catboost.core.CatBoostRegressor at 0x152bb3726f0>

In [15]:
train_score = -1 * cross_val_score(cb, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(cb, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

Learning rate set to 0.059734
0:	learn: 55.0229999	total: 4ms	remaining: 4s
1:	learn: 53.2437477	total: 7.97ms	remaining: 3.98s
2:	learn: 51.6260709	total: 12.2ms	remaining: 4.05s
3:	learn: 50.2961619	total: 16.1ms	remaining: 4.02s
4:	learn: 48.8896281	total: 19.6ms	remaining: 3.9s
5:	learn: 47.7168457	total: 23.9ms	remaining: 3.95s
6:	learn: 46.6051698	total: 27.4ms	remaining: 3.89s
7:	learn: 45.5406636	total: 31.3ms	remaining: 3.88s
8:	learn: 44.2570803	total: 35.1ms	remaining: 3.87s
9:	learn: 43.1880017	total: 39.3ms	remaining: 3.89s
10:	learn: 42.1792344	total: 43.1ms	remaining: 3.88s
11:	learn: 41.3852009	total: 46.8ms	remaining: 3.86s
12:	learn: 40.5831591	total: 50.7ms	remaining: 3.85s
13:	learn: 39.7760543	total: 55.4ms	remaining: 3.9s
14:	learn: 39.1164930	total: 60ms	remaining: 3.94s
15:	learn: 38.1534410	total: 64.2ms	remaining: 3.95s
16:	learn: 37.5849391	total: 68.3ms	remaining: 3.95s
17:	learn: 37.0248813	total: 72.7ms	remaining: 3.96s
18:	learn: 36.2061123	total: 76.8ms	

In [16]:
model_result = pd.DataFrame({'Model':['CatBoost Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 4. XGBoost 모델

In [17]:
from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(X_train, y_train)

In [18]:
train_score = -1 * cross_val_score(xgb, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(xgb, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[328.6399     395.23794434 307.81283133 370.19984481 383.35895081]
평균 학습 MSE :  357.0498942613301
평균 학습 RMSE :  18.895763923729838
[464.43439197 419.91342923 397.86677955 357.33486487 356.37700414]
평균 테스트 MSE :  399.18529394956147
평균 테스트 RMSE :  19.97962196713345


In [19]:
model_result = pd.DataFrame({'Model':['XGBoost Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 롤모델(레퍼런스)이 안 썼던 다른 모델 몇 가지
### 5. 라쏘(Lasso) 규제 적용 회귀 모델

In [20]:
from sklearn.linear_model import Lasso

lasso = Lasso()
lasso.fit(X_train, y_train)

In [21]:
train_score = -1 * cross_val_score(lasso, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(lasso, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[910.31534034 886.48221162 852.70565125 891.38747788 921.15149261]
평균 학습 MSE :  892.40843474063
평균 학습 RMSE :  29.873205966896656
[986.60216842 836.17478369 958.75421709 878.21703946 834.50100221]
평균 테스트 MSE :  898.8498421712475
평균 테스트 RMSE :  29.980824574571784


In [22]:
model_result = pd.DataFrame({'Model':['Lasso'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 6. 릿지(Ridge) 규제 적용 회귀 모델

In [23]:
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train, y_train)

In [24]:
train_score = -1 * cross_val_score(ridge, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(ridge, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[436.16231835 457.7010251  378.7035127  482.75615812 507.77772509]
평균 학습 MSE :  452.6201478714932
평균 학습 RMSE :  21.274871277436514
[507.84216893 438.52537621 412.04363648 422.23769624 382.82537548]
평균 테스트 MSE :  432.69485066644285
평균 테스트 RMSE :  20.801318483847194


In [25]:
model_result = pd.DataFrame({'Model':['Ridge'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 7. 엘라스틱넷(ElasticNet) 규제 적용 회귀 모델
- 학습에 30분 걸림

In [26]:
from sklearn.linear_model import ElasticNet

en = ElasticNet()
en.fit(X_train, y_train)

In [27]:
train_score = -1 * cross_val_score(en, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(en, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[2628.80907429 2564.84558602 2549.55617661 2576.47777602 2529.31644325]
평균 학습 MSE :  2569.801011238799
평균 학습 RMSE :  50.69320478366701
[2629.7160841  2374.33658912 2496.0777894  2484.84900425 2396.71841243]
평균 테스트 MSE :  2476.3395758606866
평균 테스트 RMSE :  49.76283327806694


In [28]:
model_result = pd.DataFrame({'Model':['ElasticNet'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 8. 다항회귀 적용(Polynomial Feautres) 선형 회귀 모델
- 별도의 모델은 아니고 그냥 선형 회귀 모델에 넣을 변수에 다항회귀를 적용시켜보는 것

In [35]:
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

lrp = LinearRegression()
lrp.fit(X_poly_train, y_train)

In [36]:
train_score = -1 * cross_val_score(en, X_poly_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(lrp, X_poly_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[1791.81948375 1734.87721808 1721.65369117 1730.4399347  1734.79058427]
평균 학습 MSE :  1742.716182393598
평균 학습 RMSE :  41.745852277724524
[3.25143596e+25 1.12337543e+25 1.80566071e+25 1.65183476e+26
 5.38234513e+25]
평균 테스트 MSE :  5.616232967965421e+25
평균 테스트 RMSE :  7494153032841.951


In [37]:
model_result = pd.DataFrame({'Model':['Polynomial Linear Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 9. 확률적 경사하강법 선형 회귀 모델(SGDRegressor)

In [38]:
from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor()
sgd.fit(X_train, y_train)

In [39]:
train_score = -1 * cross_val_score(en, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(en, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[2628.80907429 2564.84558602 2549.55617661 2576.47777602 2529.31644325]
평균 학습 MSE :  2569.801011238799
평균 학습 RMSE :  50.69320478366701
[2629.7160841  2374.33658912 2496.0777894  2484.84900425 2396.71841243]
평균 테스트 MSE :  2476.3395758606866
평균 테스트 RMSE :  49.76283327806694


In [40]:
model_result = pd.DataFrame({'Model':['SGD Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

### 10. LightGBMRegressor 모델

In [42]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2152
[LightGBM] [Info] Number of data points in the train set: 13650, number of used features: 75
[LightGBM] [Info] Start training from score 91.292308


In [74]:
train_score = -1 * cross_val_score(lgbm, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print(train_score)
print("평균 학습 MSE : ", train_score.mean())
print("평균 학습 RMSE : ", train_score.mean() ** 0.5)
test_score = -1 * cross_val_score(lgbm, X_test, y_test, scoring='neg_mean_squared_error', cv=5)
print(test_score)
print("평균 테스트 MSE : ", test_score.mean())
print("평균 테스트 RMSE : ", test_score.mean() ** 0.5)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2139
[LightGBM] [Info] Number of data points in the train set: 10920, number of used features: 74
[LightGBM] [Info] Start training from score 90.954945
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2138
[LightGBM] [Info] Number of data points in the train set: 10920, number of used features: 74
[LightGBM] [Info] Start training from score 91.112546
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [82]:
model_result = pd.DataFrame({'Model':['Light GBM Regression'], 'train_RMSE':[train_score.mean() ** 0.5], 'test_RMSE':[test_score.mean() ** 0.5]})
result = pd.concat([result, model_result])

## 모델 평가지표(RMSE) 비교

In [83]:
result.set_index('Model', inplace=True)
result

Unnamed: 0_level_0,train_RMSE,test_RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
,21.278314,847816600000.0
,19.215355,19.30282
,18.241544,18.81085
,18.895764,19.97962
,29.873206,29.98082
,21.274871,20.80132
,50.693205,49.76283
,36.044888,3205.209
,41.745852,7494153000000.0
,50.693205,49.76283


#### 테스트셋 RMSE 저 머꼬??