<a href="https://colab.research.google.com/github/jooeun921/Big-Data-Analyst/blob/main/Part03_Section_02_scikit_learn_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Section 02 학습 : scikit-learn을 활용한 회귀 모델 적합

작업형 2유형 문제 -> 모델 학습 및 평가이므로, 회귀지표를 선택해야 함.

회귀지표 종류의 scoring 옵션으로는,
- RMSE = neg_root_mean_squared_error
- MSE = neg_mean_squared_error
- MAE = neg_mean_absolute_error
- R-square = r2
- MAPE = neg_mean_absolute_percentage_error


In [3]:
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_test.csv')

In [4]:
X_train = train.drop(['grade'], axis = 1)
y_train = train['grade']

X_test = test.drop(['grade'], axis = 1)
y_test = test['grade']

#### KNN


```
from sklearn.neighbors import KNeighborsRegressor
```

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

In [6]:
num_columns = X_train.select_dtypes('number').columns.tolist()
cat_columns = X_train.select_dtypes('object').columns.tolist()

cat_preprocess = make_pipeline(OneHotEncoder(handle_unknown  = 'ignore', sparse_output = False))
num_preprocess = make_pipeline(SimpleImputer(strategy = 'mean'), StandardScaler())

preprocess = ColumnTransformer([
    ('cat', cat_preprocess, cat_columns), ('num', num_preprocess, num_columns)
])

In [7]:
from sklearn.neighbors import KNeighborsRegressor

full_pipe = Pipeline([
    ('preprocess', preprocess), ('regressor', KNeighborsRegressor())
])

In [8]:
KNeighborsRegressor().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [9]:
knn_params = {'regressor__n_neighbors' : np.arange(5, 10, 1)}

knn_search = GridSearchCV(estimator = full_pipe, param_grid = knn_params, scoring = 'neg_root_mean_squared_error', cv = 3)
knn_search.fit(X_train, y_train)

In [10]:
pd.DataFrame(knn_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.022126,0.010326,0.023657,0.010755,5,{'regressor__n_neighbors': 5},-3.049133,-2.977168,-3.25468,-3.09366,0.117588,4
1,0.018478,0.003538,0.022986,0.009811,6,{'regressor__n_neighbors': 6},-3.044619,-2.97517,-3.296364,-3.105384,0.137987,5
2,0.018883,0.004374,0.013177,0.002626,7,{'regressor__n_neighbors': 7},-3.015779,-2.923309,-3.270186,-3.069758,0.146666,3
3,0.023552,0.008102,0.01783,0.00602,8,{'regressor__n_neighbors': 8},-3.02028,-2.924887,-3.257288,-3.067485,0.139747,2
4,0.023726,0.010705,0.012404,0.001759,9,{'regressor__n_neighbors': 9},-3.000335,-2.925236,-3.259131,-3.061567,0.143023,1


In [11]:
print('Best 파라미터 조합: ', knn_search.best_params_)
print('RMSE: ', -knn_search.best_score_)

Best 파라미터 조합:  {'regressor__n_neighbors': np.int64(9)}
RMSE:  3.0615672862185073


In [12]:
from sklearn.metrics import root_mean_squared_error

knn_pred = knn_search.predict(X_test)
print('테스트 RMSE: ', root_mean_squared_error(y_test, knn_pred))

테스트 RMSE:  3.121771059103236


#### Decision Tree
```
from sklearn.tree import DecisionTreeRegressor
```

In [13]:
from sklearn.tree import DecisionTreeRegressor

full_pipe2 = Pipeline([
    ('preprocess', preprocess), ('regressor', DecisionTreeRegressor())
])

In [14]:
DecisionTreeRegressor().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [15]:
decisiontree_param = {'regressor__ccp_alpha' : np.arange(0.01, 0.3, 0.05)}

decisiontree_search = GridSearchCV(estimator = full_pipe2, param_grid = decisiontree_param, cv = 5, scoring = 'neg_mean_squared_error')
decisiontree_search.fit(X_train, y_train)

In [17]:
print('Best 파라미터 조합: ', decisiontree_search.best_params_)
print('MSE: ', -decisiontree_search.best_score_)

Best 파라미터 조합:  {'regressor__ccp_alpha': np.float64(0.26)}
MSE:  9.403541096157653


In [19]:
from sklearn.metrics import mean_squared_error

dt_pred = decisiontree_search.predict(X_test)
# print(mean_squared_error.__doc__)
print('테스트 MSE:', mean_squared_error(y_test, dt_pred))

테스트 MSE: 10.23195890566565


#### Ensemble 앙상블

###### Bagging(배깅)
```
from sklearn.ensemble import BaggingRegressor
```

In [20]:
from sklearn.ensemble import BaggingRegressor

full_pipe_bagging = Pipeline([
    ('preprocess', preprocess),
    ('regressor', BaggingRegressor())
])

In [24]:
BaggingRegressor().get_params()

{'bootstrap': True,
 'bootstrap_features': False,
 'estimator': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [25]:
Bagging_param = {'regressor__n_estimators' : np.arange(10, 100, 20), 'regressor__random_state' : [0]}

Bagging_search = GridSearchCV(estimator = full_pipe_bagging, param_grid = Bagging_param, cv = 5, scoring = 'neg_root_mean_squared_error')

Bagging_search.fit(X_train, y_train)

In [27]:
print('Best 파라미터 조합: ', Bagging_search.best_params_)
print('RMSE: ', -Bagging_search.best_score_)

Best 파라미터 조합:  {'regressor__n_estimators': np.int64(30), 'regressor__random_state': 0}
RMSE:  3.080936524776625


In [28]:
from sklearn.metrics import mean_squared_error

bag_pred = Bagging_search.predict(X_test)
# print(mean_squared_error.__doc__)
print('테스트 MSE:', mean_squared_error(y_test, bag_pred))

테스트 MSE: 9.469390221661055


##### RandomForest
```
from sklearn.ensemble import RandomForestRegressor
```

In [29]:
from sklearn.ensemble import RandomForestRegressor

full_pipe_randomforest = Pipeline([
    ('preprocess', preprocess),
    ('regressor', RandomForestRegressor(random_state = 0))
])

In [31]:
RandomForestRegressor().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [34]:
RandomForest_param = {'regressor__n_estimators' : np.arange(100, 500, 100), 'regressor__max_features' : ['sqrt']}

RandomForest_search = GridSearchCV(estimator = full_pipe_randomforest, param_grid = RandomForest_param, scoring = 'neg_root_mean_squared_error', cv = 5)

RandomForest_search.fit(X_train, y_train)

In [35]:
print('Best 파라미터 조합: ', RandomForest_search.best_params_)
print('RMSE: ', -RandomForest_search.best_score_)

Best 파라미터 조합:  {'regressor__n_estimators': np.int64(30), 'regressor__random_state': 0}
RMSE:  3.080936524776625


In [37]:
from sklearn.metrics import root_mean_squared_error

rf_pred = RandomForest_search.predict(X_test)
# print(mean_squared_error.__doc__)
print('테스트 RMSE:', root_mean_squared_error(y_test, rf_pred))

테스트 RMSE: 3.127018418302721


##### Gradient Boosting
```
from sklearn.ensemble import GradientBoostingRegressor
```

In [46]:
from sklearn.ensemble import GradientBoostingRegressor

full_pipe_gradientboosting = Pipeline([
    ('preprocess', preprocess),
    ('regressor', GradientBoostingRegressor(random_state = 0))
])

In [47]:
GradientBoostingRegressor().get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [49]:
GradientBoosting_param = {'regressor__learning_rate' : np.arange(0.1, 0.3, 0.05)}

GradientBoosting_search = GridSearchCV(estimator = full_pipe_gradientboosting, param_grid = GradientBoosting_param, scoring = 'neg_mean_squared_error', cv = 5)

GradientBoosting_search.fit(X_train, y_train)

In [52]:
print('Best 파라미터 조합: ', GradientBoosting_search.best_params_)
print('RMSE: ', -GradientBoosting_search.best_score_)

Best 파라미터 조합:  {'regressor__learning_rate': np.float64(0.1)}
RMSE:  10.801391426542164


In [53]:
from sklearn.metrics import mean_squared_error

gb_pred = GradientBoosting_search.predict(X_test)
print('테스트 MSE:', mean_squared_error(y_test, gb_pred))

테스트 MSE: 10.576858829166314


#### 고급 회귀 기법(SVR, Support Vector Regression)
```
from sklearn.svm import SVR
```

In [55]:
from sklearn.svm import SVR

full_pipeline_SVR = Pipeline([
    ('preprocess', preprocess),
    ('regressor', SVR())
])

In [56]:
SVR().get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [57]:
SVR_param = {'regressor__C' : np.arange(1, 100, 20)}
SVR_search = GridSearchCV(estimator = full_pipeline_SVR, param_grid = SVR_param, scoring = 'neg_mean_squared_error', cv = 5)
SVR_search.fit(X_train, y_train)

In [58]:
print('Best 파라미터 조합: ', SVR_search.best_params_)
print('RMSE: ', -SVR_search.best_score_)

Best 파라미터 조합:  {'regressor__C': np.int64(1)}
RMSE:  8.905507590431398


In [59]:
from sklearn.metrics import mean_squared_error

svr_pred = SVR_search.predict(X_test)
print('테스트 MSE:', mean_squared_error(y_test, svr_pred))

테스트 MSE: 10.141966042523615


#### 작업형 제2유형 모범답안
학생 성적데이터 => grade 예측하는 것. 결과는 result.csv로 저장하기
```
import pandas as pd
import numpy as np
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_test.csv')
```

In [132]:
import pandas as pd
import numpy as np
train = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/data/main/st_test.csv')

In [133]:
y_train = train['grade']
X_train = train.drop(['grade'], axis=1)

y_test = test['grade']
X_test = test.drop(['grade'], axis=1)

In [134]:
# train['goout'] 결측치 처리해줘야 함.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split

imputer = SimpleImputer(strategy='mean').set_output(transform = 'pandas')
encoder = OrdinalEncoder().set_output(transform = 'pandas')
stadardscaler = StandardScaler()

num_columns = X_train.select_dtypes('number').columns
cat_columns = X_train.select_dtypes('object').columns

X_train[num_columns] = imputer.fit_transform(X_train[num_columns])
X_test[num_columns] = imputer.transform(X_test[num_columns])

X_train[num_columns] = stadardscaler.fit_transform(X_train[num_columns])
X_test[num_columns] = stadardscaler.transform(X_test[num_columns])

X_train[cat_columns] = encoder.fit_transform(X_train[cat_columns])
X_test[cat_columns] = encoder.transform(X_test[cat_columns])

In [135]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(random_state = 0)

regr.fit(X_train, y_train)

In [137]:
test_pred = regr.predict(X_test)

from sklearn.metrics import root_mean_squared_error

regr_rmse = root_mean_squared_error(y_test, test_pred)
print(regr_rmse)

3.147467937232894


In [138]:
test_pred = regr.predict(X_test)
pd.DataFrame({'pred' : test_pred}).to_csv('result.csv', index = False)

### Section 02 연습문제 : scikit-learn을 활용한 회귀 모델 적합

다음 학습용 데이터(prestige_train.csv)는 1971년 캐나나 직업군에 대한 사회적 지위, 교육 수준, 소득, 여성 비율 등을 조사한 자료이다.

| 변수명 | 설명 |
|:--|:--|
| education | 해당 직업 종사자의 평균 교육 기간 |
| income | 해당 직업 종사자의 평균 소득 |
| women | 해당 직업 종사자 중 여성의 비율 |
| prestige | Pineo-Porter 명망(prestige) 점수 |
| census | 캐나다 인구조사(1971년) 직업 코드 |
| type | 직업 유형 분류 |


```
import pandas as pd
import numpy as np
train = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/data/main/prestige_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/data/main/prestige_test.csv")
print(train.head())
```

학습용 데이터를 활용하여 명망 점수(prestige)를 예측하는 모델을 개발하고, 이 중 가장 우수한 모델을 평가용 데이터(prestige_test.csv)에 적용하여 명망 점수를 예측하시오.

% 예측 결과는 RSME(Root Mean Squared Error) 평가지표에 따라 평가

제출형식
- csv 파일명 : result.csv (파일명에 디렉토리, 폴더 지정 불가)
- 예측 칼럼명 : pred
- 제출 칼럼 개수 : pred 칼럼 1개
- 평가용 데이터 개수와 예측 결과 데이터 개수 일치

In [236]:
import pandas as pd
import numpy as np
train = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/data/main/prestige_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/data/main/prestige_test.csv")
print(train.head())

   education  income  women  prestige  census  type
0       8.49    8845   0.00      48.9    9131    bc
1      11.59    4036  97.51      46.0    4111    wc
2      15.77   19263   5.13      82.3    2343  prof
3      11.49    3148  95.97      41.9    4113    wc
4      13.11   12351  11.16      68.8    1113  prof


In [237]:
train.isnull().sum()

Unnamed: 0,0
education,0
income,0
women,0
prestige,0
census,0
type,4


In [238]:
print(len(train['census'].unique()))
print(len(test['census'].unique()))

71
31


In [239]:
# train = train.drop('census', axis = 1)
# test = test.drop('census', axis = 1)

X_train = train.drop('prestige', axis = 1)
y_train = train['prestige']

X_test = test.drop('prestige', axis = 1)
y_test = test['prestige']

In [240]:
X_train.head()

Unnamed: 0,education,income,women,census,type
0,8.49,8845,0.0,9131,bc
1,11.59,4036,97.51,4111,wc
2,15.77,19263,5.13,2343,prof
3,11.49,3148,95.97,4113,wc
4,13.11,12351,11.16,1113,prof


In [241]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer

type_mode = X_train['type'].mode()[0]

X_train['type']  = X_train['type'].fillna(type_mode)
X_test['type']  = X_test['type'].fillna(type_mode)

cat_cols = X_train.select_dtypes('object').columns
num_cols = X_train.select_dtypes('number').columns

ct = make_column_transformer(
    (StandardScaler(), num_cols),
    (OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
).set_output(transform='pandas')

X_train_prep = ct.fit_transform(X_train)
X_test_prep = ct.transform(X_test)

In [242]:
from sklearn.ensemble import RandomForestRegressor

rf_regression = RandomForestRegressor(random_state = 0)
rf_regression.fit(X_train_prep, y_train)

In [243]:
from sklearn.metrics import root_mean_squared_error

predict_rf = rf_regression.predict(X_test_prep)

# rmse = root_mean_squared_error(y_test, predict_rf)
# print(rmse)
pd.DataFrame({'pred' : predict_rf}).to_csv('result.csv', index = False)

## 작업형 2유형 풀이 방법

1. train_X, train_y, test_X, test_y 로 데이터 분할.
2. 결측치 확인 train.isnull().sum()   
        - (범주형이면 보통 fillna(df[cat_columns].mode()[0]), 수치형이면 mean, median으로) -> 대치.
3. make_column_transformer 로 수치형, 범주형 한번에 처리하기!   
```
ct = make_column_transformer(
    (StandardScaler(), num_cols),
    (OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
).set_output(transform='pandas')

ct.fit_transform / ct.transform
```
4. 모델 작동. 분류면, `RandomForestClassifier` 회귀면, `RandomForestRegressor`
5. 예측하기. 확률이나 auc일 때는 predict_proba임.   
predict(X_test) / predict_proba(X_test)[:, 1]
6. 필요한 칼럼에 맞춰서 pd.DataFrame / df.to_csv('파일명', index = False) 하기

predict() : 정확도(accuracy), f1-score, confusion matrix   
predict_proba() : ROC Curve, AUC, Log Loss 계산 시