## 2022-1 응용수학 기말고사

## 문제정의
- Input:
Shell Weight, Height, Diameter, Water, Length, Whole Weight, Viscra Weight, Shucked Weight
- Output: Target (전복 나이)
- 문제를 한 문장으로 정의: 전복의 나이를 예측할 성능 좋은 모델을 만들어보자.

In [None]:
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

In [None]:
from google.colab import files
# Opens the Colab upload dialog for the training CSV. The value bound to
# `train` here is temporary: a later cell rebinds `train` to the DataFrame
# read from 'train2.csv'.
# NOTE(review): the recorded output shows this upload being saved under a
# duplicate-suffixed name ('train2 (2).csv'), so the later read of
# 'train2.csv' presumably picks up an earlier upload -- confirm.
train = files.upload()

Saving train2.csv to train2 (2).csv


In [None]:
# Opens the Colab upload dialog for the test CSV; `test` is rebound to a
# DataFrame read from 'test2.csv' in a later cell.
# NOTE(review): the recorded output shows the upload saved as 'test2 (4).csv',
# not 'test2.csv' -- confirm the intended file is the one being read.
test = files.upload()

Saving test2.csv to test2 (4).csv


In [None]:
# The CSV headers misspell 'Length' as 'Lenght'; correct the name on load.
column_typo_fix = {'Lenght': 'Length'}

train = pd.read_csv('train2.csv', encoding='cp949').rename(columns=column_typo_fix)
test = pd.read_csv('test2.csv', encoding='cp949').rename(columns=column_typo_fix)

---

## 방법

|모델명|간단한 설명|
|:-----:|:-----|
|**Linear Regression**|데이터를 가장 잘 대변해주는 선을 찾아내는 것. 선형 상관 관계를 모델링하는 회귀 분석 기법|
|**Ridge**|일반적으로 사용되는 선형회귀 모델을 변형시킨 알고리즘, 변수 간 상관관계가 높은 상황에서 좋은 예측 성능을 나타내는 모델|
|**Random Forest**|의사결정트리에 기반을 둔 앙상블 모델(Bagging 방식)|
|**XGBoost**|그레디언트 부스팅 프레임워크를 사용하는 의사결정트리 기반의 앙상블 머신러닝 알고리즘(Boosting 방식)|
|**LightGBM**|XGBoost의 단점 보완, 비대칭적으로 트리가 성장해 빠른 속도로 훈련 가능(Boosting 방식)|

#### Feature 전처리

In [None]:
train[train['Viscra Weight'] > train['Shucked Weight']] # Outlier: viscera weight is larger than the shucked (shell-removed) weight

Unnamed: 0.1,Unnamed: 0,id,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
1023,465,466,M,0.415,0.315,0.125,0.388,0.068,0.09,0.125,12


In [None]:
# Remove physically impossible rows where the viscera weight exceeds the
# shucked weight. Every matching row is dropped (the data shown above contains
# exactly one), and the cell is a no-op when no such row exists, so it can be
# re-run safely instead of raising IndexError on an empty match.
viscera_exceeds_shucked = train['Shucked Weight'] < train['Viscra Weight']
train = train.drop(index=train.index[viscera_exceeds_shucked])

In [None]:
# Add the derived 'Water' feature: whole weight minus shucked and shell weight.
train['Water'] = train['Whole Weight'] - (train['Shucked Weight'] + train['Shell Weight'])
test['Water'] = test['Whole Weight'] - (test['Shucked Weight'] + test['Shell Weight'])

# Keep only the modelling columns (drops the index/id columns from the CSV).
# .copy() makes `train` an independent frame, so later in-place edits do not
# trigger pandas' SettingWithCopyWarning.
train = train[['Gender', 'Length', 'Diameter', 'Height', 'Whole Weight', 'Shucked Weight', 'Viscra Weight', 'Shell Weight', 'Water', 'Target']].copy()

# Show rows whose derived water weight is negative (inconsistent measurements).
train[train['Water']<0]

Unnamed: 0,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Water,Target
39,I,0.455,0.33,0.1,0.372,0.358,0.0775,0.11,-0.096,8
76,I,0.23,0.165,0.06,0.0515,0.019,0.0145,0.036,-0.0035,4
367,I,0.275,0.205,0.07,0.1055,0.495,0.019,0.0315,-0.421,5
837,I,0.475,0.365,0.1,0.1315,0.2025,0.0875,0.123,-0.194,7
840,I,0.38,0.275,0.095,0.1375,0.086,0.0585,0.0605,-0.009,7


In [None]:
# Same check on the test set: list rows with a negative derived 'Water' value.
test[test['Water']<0]

Unnamed: 0.1,Unnamed: 0,id,Gender,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target,Water


In [None]:
# Author's note: 0.554946 when negative values are flipped to positive,
# 0.558108 when they are left as they are (presumably the Water/Target
# correlation -- the first figure matches the correlation table below).
#
# Flip every negative 'Water' value to its absolute value. This is vectorised
# rather than addressing hard-coded row labels, so it still works if the row
# labels change, and `assign` returns a new frame, so nothing is written into
# a slice of another DataFrame.
train = train.assign(Water=train['Water'].abs())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [None]:
# Pearson correlation of every numeric feature with the target.
# The string column 'Gender' is excluded explicitly: older pandas dropped it
# silently, but recent versions raise when asked to correlate non-numeric data.
co = train.select_dtypes(include='number').corrwith(train['Target'])
print(co.sort_values(ascending=False))

# Absolute correlation strength, regardless of sign.
co_abs = co.abs()

Target            1.000000
Shell Weight      0.624776
Height            0.606800
Diameter          0.570221
Water             0.554946
Length            0.545089
Whole Weight      0.537538
Viscra Weight     0.494948
Shucked Weight    0.419971
dtype: float64


- Data normalization, standardization

In [None]:
# Model inputs: every column except the categorical 'Gender' and the label.
features = [column for column in train.columns if column not in ('Gender', 'Target')]

In [None]:
from sklearn.pipeline import Pipeline

# Two-stage scaling: Normalizer rescales each sample (row) to unit norm, then
# StandardScaler standardizes each feature (column).
pipeline = Pipeline([('normalizer', Normalizer()), ('scaler', StandardScaler())])

# Fit on the training features only and reuse the fitted statistics for the
# test features. The results are placed in fresh DataFrames (same columns and
# row labels) instead of being written into slices of `train` / `test`, which
# is what triggered pandas' SettingWithCopyWarning.
train_x = pd.DataFrame(
    pipeline.fit_transform(train[features]),
    columns=features,
    index=train.index,
)
test_x = pd.DataFrame(
    pipeline.transform(test[features]),
    columns=features,
    index=test.index,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [None]:
# Preview the first rows of the scaled training features.
train_x.head()

Unnamed: 0,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Water
0,-0.490444,-0.2102,0.177794,0.494532,0.367696,0.327195,0.764261,0.149488
1,-0.525688,-0.62058,-0.823035,0.536154,1.498784,0.384044,-0.744148,0.023973
2,-0.377525,-0.372158,-0.840757,0.494789,0.696247,0.344799,0.446655,-0.028205
3,1.860422,1.650201,-0.294062,-1.909665,-1.468156,-1.217873,-1.615087,-1.869465
4,1.913071,1.703548,1.563962,-2.129635,-2.153172,-1.515506,-1.644232,-1.494386


In [None]:
# Regression label: the 'Target' column of the training frame.
train_y = train['Target']

## 평가산식 & GridSearchCV 사용 (여기선 5겹 교차 검증)

In [None]:
def NMAE(true, pred):
    """Return the normalized mean absolute error of a prediction.

    The score is ``mean(|true - pred|) / mean(|true|)``; lower is better.

    Both inputs are converted to NumPy arrays first so values are compared
    position by position. Without this, two pandas Series carrying different
    indexes would be aligned by label and produce NaN or a silently wrong
    score. Plain Python sequences are accepted as well.

    Args:
        true: Ground-truth values (array-like).
        pred: Predicted values (array-like), in the same order as ``true``.

    Returns:
        The normalized MAE as a float. The result is inf/nan when the mean
        absolute value of ``true`` is zero.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

In [None]:
from sklearn.model_selection import GridSearchCV

### [모델 학습]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for a local check of the tuned models.
T_train_x, T_test_x, T_train_y, T_test_y = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=32,
)


### 릿지

In [None]:
from sklearn.linear_model import Ridge

# Search grid for the regularization strength. Author's rationale: accuracy
# drops as alpha grows, so only small values are tried.
# NOTE(review): the recorded search picked alpha=1, the upper edge of this
# grid -- consider whether larger values should be explored.
Ridge_params = {
    'alpha': [0, 0.001, 0.01, 0.1, 1],
}

In [None]:
# 5-fold cross-validated grid search over the Ridge alpha grid, fitted on the
# training part of the hold-out split.
hyper_parameter_tuner_ridge = GridSearchCV(
    estimator=Ridge(random_state=32),
    param_grid=Ridge_params,
    cv=5,
)
hyper_parameter_tuner_ridge.fit(T_train_x, T_train_y)

GridSearchCV(cv=5, estimator=Ridge(random_state=32),
             param_grid={'alpha': [0, 0.001, 0.01, 0.1, 1]})

In [None]:
# Hyper-parameter combination selected by the Ridge grid search.
hyper_parameter_tuner_ridge.best_params_

{'alpha': 1}

In [None]:
# Score of the tuned Ridge model on the held-out split (no `scoring` was passed
# to GridSearchCV, so this is presumably the estimator's default R^2).
hyper_parameter_tuner_ridge.score(T_test_x, T_test_y)

0.5247065442223331

In [None]:
##### Training starts here!
# The final Ridge training below is disabled by wrapping it in a string
# literal; the fitted models are loaded from a pickle file in a later cell.
'''
ridge_model = Ridge(random_state=32, alpha=1)
ridge_model.fit(train_x, train_y)
test_y = ridge_model.predict(test_x)
'''

'\nridge_model = Ridge(random_state=32, alpha=1)\nridge_model.fit(train_x, train_y)\ntest_y = ridge_model.predict(test_x)\n'

### 랜덤 포레스트

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Search grid for the random forest (author's notes translated).
RF_params = {
    # For imbalanced data a class can have extremely few samples, so this
    # needs to be kept small; default is 1.
    'min_samples_leaf': [1, 2],
    # Smaller values increase the risk of overfitting; default is 2.
    'min_samples_split': [2, 3, 5],
    'n_estimators': [100, 150, 190],
}

> leaf, split 과적합 제어 용도

> min_samples_leaf 불균형 데이터 경우 위해 작게 설정 필요

In [None]:
# 5-fold cross-validated grid search for the random forest (slow to run!).
hyper_parameter_tuner_RF = GridSearchCV(
    estimator=RandomForestRegressor(random_state=32),
    param_grid=RF_params,
    cv=5,
)
hyper_parameter_tuner_RF.fit(T_train_x, T_train_y)
hyper_parameter_tuner_RF.best_params_

{'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 190}

In [None]:
# Tabulate the cross-validation results and show the summary columns together
# with the scores of the first three folds.
scores_df = pd.DataFrame(hyper_parameter_tuner_RF.cv_results_)
summary_columns = [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]
scores_df[summary_columns]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'min_samples_leaf': 1, 'min_samples_split': 2...",0.497759,18,0.53137,0.46503,0.444461
1,"{'min_samples_leaf': 1, 'min_samples_split': 2...",0.499713,16,0.536967,0.464404,0.4433
2,"{'min_samples_leaf': 1, 'min_samples_split': 2...",0.500353,15,0.533655,0.46514,0.445657
3,"{'min_samples_leaf': 1, 'min_samples_split': 3...",0.498054,17,0.532839,0.465673,0.444916
4,"{'min_samples_leaf': 1, 'min_samples_split': 3...",0.50185,14,0.541671,0.465887,0.444824
5,"{'min_samples_leaf': 1, 'min_samples_split': 3...",0.503417,12,0.538079,0.467905,0.448076
6,"{'min_samples_leaf': 1, 'min_samples_split': 5...",0.503158,13,0.540171,0.468798,0.450084
7,"{'min_samples_leaf': 1, 'min_samples_split': 5...",0.505953,7,0.54532,0.471285,0.448003
8,"{'min_samples_leaf': 1, 'min_samples_split': 5...",0.505026,10,0.541526,0.469272,0.450678
9,"{'min_samples_leaf': 2, 'min_samples_split': 2...",0.505248,8,0.532507,0.474884,0.462083


In [None]:
# Refit a random forest with the selected hyper-parameters on the training
# part of the hold-out split.
rf_model = RandomForestRegressor(random_state=32, min_samples_leaf=2, min_samples_split=5, n_estimators=190)
rf_model.fit(T_train_x, T_train_y)

# Evaluate against the TRUE hold-out labels. Predictions get their own name
# instead of overwriting T_test_y: scoring a model against its own predictions
# always yields 1.0 and destroys the labels for later cells. The ground truth
# is looked up by row label so re-running the cell cannot corrupt it.
rf_holdout_true = train_y.loc[T_test_x.index]
rf_holdout_pred = rf_model.predict(T_test_x)
print('RF hold-out NMAE:', NMAE(rf_holdout_true, rf_holdout_pred))

hyper_parameter_tuner_RF.score(T_test_x, rf_holdout_true)

1.0

In [None]:
# Final random-forest training on the full training set, disabled by wrapping
# it in a string literal (fitted models are loaded from a pickle file later).
'''
# 학습
rf_model = RandomForestRegressor(random_state=32, min_samples_leaf=2, min_samples_split=5, n_estimators=190)
rf_model.fit(train_x, train_y)
test_y = rf_model.predict(test_x)
'''

'\n# 학습\nrf_model = RandomForestRegressor(random_state=32, min_samples_leaf=2, min_samples_split=5, n_estimators=190)\nrf_model.fit(train_x, train_y)\ntest_y = rf_model.predict(test_x)\n'

### LGBM

In [None]:
import lightgbm as lgb

# Search grid for LightGBM (author's notes translated).
LGB_params = {
    # A large n_estimators may overfit, so keep it moderate.
    'n_estimators': [100, 200, 250],
    # Author's note: the smaller the learning_rate, the better.
    'learning_rate': [0.1, 0.05, 0.01],
}

In [None]:
# 5-fold cross-validated grid search for the LightGBM regressor.
hyper_parameter_tuner_LGBM = GridSearchCV(
    estimator=lgb.LGBMRegressor(random_state=32),
    param_grid=LGB_params,
    cv=5,
)
hyper_parameter_tuner_LGBM.fit(T_train_x, T_train_y)

hyper_parameter_tuner_LGBM.best_params_

{'learning_rate': 0.01, 'n_estimators': 250}

In [None]:
# Refit LightGBM with the selected hyper-parameters on the training part of
# the hold-out split.
lgbm_model = lgb.LGBMRegressor(random_state=32, learning_rate = 0.01, n_estimators = 250)
lgbm_model.fit(T_train_x, T_train_y)

# Evaluate against the TRUE hold-out labels. Predictions get their own name
# instead of overwriting T_test_y: scoring a model against its own predictions
# always yields 1.0. The ground truth is looked up by row label from train_y,
# so it is correct even if T_test_y has been rebound elsewhere in the session.
lgbm_holdout_true = train_y.loc[T_test_x.index]
lgbm_holdout_pred = lgbm_model.predict(T_test_x)
print('LGBM hold-out NMAE:', NMAE(lgbm_holdout_true, lgbm_holdout_pred))

hyper_parameter_tuner_LGBM.score(T_test_x, lgbm_holdout_true)

1.0

In [None]:
# Final LightGBM training on the full training set, disabled by wrapping it in
# a string literal (fitted models are loaded from a pickle file later).
'''
# 학습
lgbm_model = lgb.LGBMRegressor(random_state=32, learning_rate = 0.01, n_estimators = 250)
lgbm_model.fit(train_x, train_y)
test_y = lgbm_model.predict(test_x)
'''

'\n# 학습\nlgbm_model = lgb.LGBMRegressor(random_state=32, learning_rate = 0.01, n_estimators = 250)\nlgbm_model.fit(train_x, train_y)\ntest_y = lgbm_model.predict(test_x)\n'

### 선형회귀

In [None]:
from sklearn.linear_model import LinearRegression

# Plain linear-regression training, disabled by wrapping it in a string
# literal; only the import above is executed.
'''
linear_model = LinearRegression()
linear_model.fit(train_x, train_y)
test_y = linear_model.predict(test_x)
'''
##### Training ends here! #####

'\nlinear_model = LinearRegression()\nlinear_model.fit(train_x, train_y)\ntest_y = linear_model.predict(test_x)\n'

### [모델 테스트] 앙상블 - Soft Voting

In [None]:
# Upload the provided test2.csv once more. The value bound here is replaced in
# the next cell by the DataFrame read from 'test2.csv'.
# NOTE(review): the recorded output shows this upload saved as 'test2 (5).csv',
# so the following read presumably opens an earlier upload -- confirm.
test_true = files.upload()

Saving test2.csv to test2 (5).csv


In [None]:
# Untouched copy of the test CSV; below it is only used for its 'Target'
# column, which serves as ground truth for the final metrics.
test_true = pd.read_csv('test2.csv', encoding = 'cp949')

In [None]:
# List of fitted ensemble members, disabled by wrapping it in a string literal
# (the list is restored from the pickle file in a later cell).
'''
models = [ridge_model, rf_model, lgbm_model]
'''

'\nmodels = [ridge_model, rf_model, lgbm_model]\n'

### 모델 저장하기

In [None]:
import pickle
# Saving the model list to disk is disabled by wrapping it in a string literal;
# only the import above is executed.
'''
with open("2020038020_정희원.model","wb") as f:
    pickle.dump(models, f)
'''

'\nwith open("2020038020_정희원.model","wb") as f:\n    pickle.dump(models, f)\n'

### 모델 파일 읽어오기

In [None]:
# Restore the list of fitted ensemble members from the pickle file.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load model files you created yourself or otherwise trust.
with open("2020038020_정희원.model","rb") as f:
    models_soft_voting = pickle.load(f)

In [None]:
# Display the loaded ensemble members and their hyper-parameters.
models_soft_voting

[Ridge(alpha=1, random_state=32),
 RandomForestRegressor(min_samples_leaf=2, min_samples_split=5, n_estimators=190,
                       random_state=32),
 LGBMRegressor(learning_rate=0.01, n_estimators=250, random_state=32)]

In [None]:
# Ensemble ("soft voting"): average the predictions of all member models.
#
# Experiment log (author's recorded scores):
# 1.   First attempt: 0.1465
# 1-1. Water left as is: 0.1433166
# 1-2. Negative Water flipped to positive: 0.143197
# 1-3. Both normalization and standardization: 0.142746
# 1-4. Normalization only: 0.143228
# 1-5. StandardScaler only: 0.146956
# 1-6. Manual scaling: 0.143556271
#
# --- With linear regression mixed in:
# 2.   Second attempt: 0.1463
# 2-1. Water left as is: 0.1442780
# 2-2. Negative Water flipped to positive: 0.144045
# 2-3. Both normalization and standardization: 0.1437909
# 2-4. Normalization only: 0.1436368
# 2-5. StandardScaler only: 0.1479878
# 2-6. Manual scaling: 0.14436143


##### Test section starts here! #####
# Sum the members' predictions in list order, then divide by the member count.
member_predictions = (member.predict(test_x) for member in models_soft_voting)
prediction = sum(member_predictions) / len(models_soft_voting)

NMAE(test_true['Target'], prediction)
##### Test section ends here! #####

0.1435562710236606

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the averaged ensemble prediction against the
# 'Target' column of the test CSV.
r2_score(test_true['Target'], prediction)

0.5911862258249434