
<img align="right" src="https://ds-cs-images.s3.ap-northeast-2.amazonaws.com/Codestates_Fulllogo_Color.png" width=100>

## *DATA SCIENCE / SECTION 2 / SPRINT 2 / NOTE 4*

# 📝 Assignment
---

In [1]:
# !pip install --upgrade pandas_profiling
# !pip install --upgrade category_encoders

In [2]:
# # connect kaggle colab
# !pip install kaggle
# from google.colab import files
# files.upload()

# # 파일이 제대로 업로드 되었는지 확인
# print('\n--------파일 제대로 업로드 되었는지 확인--------\n')
# !ls -1ha kaggle.json
# # 파일 업로드 제대로 되었는지 확인 완료!

# # kaggle API를 사용하기 위해 json 파일 ~/.kaggle로 이동
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# # Permission Warning 이 일어나지 않도록 
# !chmod 600 ~/.kaggle/kaggle.json
# # 본인이 참가한 모든 대회 보기
# print('\n---------------내가 참가한 대회---------------\n') 
# !kaggle competitions list

# # 데이터 셋 불러오기
# print('\n------------upload dataset------------\n')
# !kaggle competitions download -c prediction-of-h1n1-vaccination

# 모델선택(Model Selection)

### 1) 캐글 대회를 이어서 진행합니다. RandomizedSearchCV 를 사용하여 하이퍼파라미터 튜닝을 진행합니다.

- [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)를 사용하세요.
- 분류문제에서 맞는 [scoring parameter](https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values) metric을 사용하세요.
- [OrdinalEncoder](https://contrib.scikit-learn.org/categorical-encoding/ordinal.html) 사용을 권합니다.
- RandomizedSearchCV 를 사용해서 하이퍼파라미터 튜닝을 진행하고 최고 성능을 보이는 모델로 예측을 진행한 후 캐글에 제출합니다.
- **캐글 Leaderboard에서 개선된 본인 Score를 과제 제출폼에 제출하세요.**

# Dataset 불러오기

In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer 
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import numpy as np

target = 'vacc_h1n1_f'
train = pd.merge(pd.read_csv('/content/train.csv.zip'), pd.read_csv('/content/train_labels.csv')[target],left_index=True , right_index=True)
test = pd.read_csv('/content/test.csv.zip')

  import pandas.util.testing as tm


# Feature Engineering

In [4]:
## feature engineering
from category_encoders import OneHotEncoder
from sklearn.model_selection import train_test_split

def change(x):
  if int(x) > 0 :
    return 0
  else:
    return 1

def f_engineering(df):
  # behaviorals feature 생성
  behaviorals = [col for col in df.columns if 'behavioral' in col]
  df['behaviorals'] = df.loc[:,behaviorals].sum(axis = 1)

  # n_adult_r feature 삭제, household_children & n_people_r 두 가지 feature와 다중 공산성 문제가 발생할 수 있다고 판단.
  df.drop('n_adult_r',axis=1)

  # 건강보험이 있는 workers 
  workers_h_insurances = ['health_insurance','health_worker']
  df['workers_have_insurances'] = df.loc[:,workers_h_insurances].sum(axis = 1)
  df['workers_have_insurances'] = df['workers_have_insurances'].apply(change)

  # seas & employment가 들어 있는 feature 제거 
  dels = [col for col in df.columns if ('employment' in col or 'sea' in col)] # employment와 seas를 가지고 있는 특성을 모두 제거
  df.drop(columns=dels, inplace=True)

  return df 

train = f_engineering(train)
test = f_engineering(test)

In [5]:
# split dataset
y_train = train[target]
X_train = train.drop(target,axis=1)
_, X_val, _, y_val = train_test_split(X_train, y_train, test_size=.2, random_state=2)

# RandomsizedSearchCV 사용 하이퍼파라미터 튜닝

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# create pipe
pipe = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier()
)

# tuning hyper parameters  RandomForestClassifier
dists = {
    'randomforestclassifier__class_weight' : ['balanced', 'balanced_subsample'],
    'randomforestclassifier__max_depth' : range(1,100,5),
    'randomforestclassifier__n_estimators' : randint(50,500),
    'randomforestclassifier__min_samples_leaf' : np.random.rand(5),
    'randomforestclassifier__max_features' : ['auto','sqrt','log2'],
    'simpleimputer__strategy': ['mean','most_frequent']
}

clf = RandomizedSearchCV(
    pipe, 
    param_distributions = dists,
    n_iter = 50 ,
    cv = 3,
    n_jobs = -1, 
    verbose = 1
)

clf.fit(X_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('ordinalencoder',
                                              OrdinalEncoder(cols=None,
                                                             drop_invariant=False,
                                                             handle_missing='value',
                                                             handle_unknown='value',
                                                             mapping=None,
                                                             return_df=True,
                                                             verbose=0)),
                                             ('simpleimputer',
                                              SimpleImputer(add_indicator=False,
                                                            copy=True,
                                                            fill_value=None,


In [7]:
print(f'최적의 파라미터: {clf.best_params_}')
print(f'mean_cv_score: {clf.best_score_}')

최적의 파라미터: {'randomforestclassifier__class_weight': 'balanced_subsample', 'randomforestclassifier__max_depth': 1, 'randomforestclassifier__max_features': 'auto', 'randomforestclassifier__min_samples_leaf': 0.1388336514961238, 'randomforestclassifier__n_estimators': 316, 'simpleimputer__strategy': 'mean'}
mean_cv_score: 0.7537608362266505


In [35]:
# Validation Set Score
print('검증 score :', clf.best_estimator_.score(X_val, y_val))
print('F1 score :', f1_score(y_val, clf.best_estimator_.predict(X_val)))

검증 score : 0.7730992764796584
F1 score : 0.5935840237943488


In [13]:
## 임계값 확인 해보니 , 기존 0.5일때가 가장 f1 score가 좋게 나온다.
from ipywidgets import interact, fixed
import seaborn as sns
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

pipe = clf.best_estimator_
y_pred = pipe.predict(X_val)
y_pred_proba = pipe.predict_proba(X_val)[:, 1]

def explore_threshold(y_true, y_pred_proba, threshold=0.5):
    y_pred = y_pred_proba >= threshold
    vc = pd.Series(y_pred).value_counts()
    ax = sns.histplot(y_pred_proba, kde=True)
    ax.axvline(threshold, color='red')
    ax.set_title(f'# of target, 1={vc[1]}, 0={vc[0]}')
    plt.show()
    print(classification_report(y_true, y_pred))

    
interact(explore_threshold, 
    y_true=fixed(y_val), 
    y_pred_proba=fixed(y_pred_proba), 
    threshold=(0, 1, 0.01));

interactive(children=(FloatSlider(value=0.5, description='threshold', max=1.0, step=0.01), Output()), _dom_cla…

In [17]:
y_pred = pipe.predict(test)

my_submission = pd.DataFrame({
    'Id' : range(0, len(y_pred)),
    'vacc_h1n1_f' : y_pred})
my_submission
my_submission.to_csv('nnnnnnnnewsubmission.csv', index=False)

## 🔥 도전과제


### 2) [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) 를 사용하여 하이퍼파라미터 튜닝을 진행합니다.
- 모델 성능을 높이기 위해 가능한 시도를 다 해보세요.
- 모델 성능에 가장 큰 영향을 준 하이퍼파라미터에 대해서 분석하고 설명해 보세요.



In [28]:
from sklearn.model_selection import GridSearchCV

pipe_grid = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    RandomForestClassifier()
)

dists = {
    'randomforestclassifier__max_depth' : [1,2,3,4],
    'randomforestclassifier__n_estimators' : [314,315,316,317],
    'randomforestclassifier__min_samples_leaf' : [0.13,0.14,0.15],
    }

clf = GridSearchCV(
    pipe, 
    param_grid = dists,
    cv = 2,
    n_jobs = -1, 
    verbose = 1,
    return_train_score=True
)

clf.fit(X_train,y_train)

Fitting 2 folds for each of 48 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:  4.5min finished
  elif pd.api.types.is_categorical(cols):


GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ordinalencoder',
                                        OrdinalEncoder(cols=['opinion_h1n1_vacc_effective',
                                                             'opinion_h1n1_risk',
                                                             'opinion_h1n1_sick_from_vacc',
                                                             'agegrp',
                                                             'census_msa',
                                                             'state'],
                                                       drop_invariant=False,
                                                       handle_missing='value',
                                                       handle_unknown='value',
                                                       mapping=[{'col': 'opinion_h1n1_vacc_effective',
                                        

In [29]:
print(f'최적의 파라미터: {clf.best_params_}')
print(f'mean_cv_score: {clf.best_score_}')

최적의 파라미터: {'randomforestclassifier__max_depth': 1, 'randomforestclassifier__min_samples_leaf': 0.13, 'randomforestclassifier__n_estimators': 315}
mean_cv_score: 0.7720975470892442


In [30]:
# Validation Set Score
print('Accuracy :', clf.best_estimator_.score(X_val, y_val))
print('F1 score :', f1_score(y_val, clf.predict(X_val)))

Accuracy : 0.7730992764796584
F1 score : 0.5935840237943488
