<a href="https://colab.research.google.com/github/khalidpark/machinelearning_whitepaper/blob/main/Machine_learning_whitepaper_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 전처리

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Name of the label column this notebook predicts.
target = 'vacc_h1n1_f'
# Alternative label column (disabled): target = 'vacc_seas_f'

# Features and labels of the training set live in separate CSV files;
# read both and attach only the chosen label column, aligned on the row index.
train_features = pd.read_csv('https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/vacc_flu/train.csv')
train_labels = pd.read_csv('https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/vacc_flu/train_labels.csv')
train = train_features.merge(train_labels[target], left_index=True, right_index=True)

# Unlabelled test features and the submission template.
test = pd.read_csv('https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/vacc_flu/test.csv')
sample_submission = pd.read_csv('https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/vacc_flu/submission.csv')


훈련 데이터를 훈련/검증 세트로 나누기 (테스트 데이터는 별도 파일로 이미 분리되어 있음)

In [4]:
# 80/20 split of the labelled rows into training and validation parts.
# The split is stratified on the target column and uses a fixed random_state.
split_kwargs = dict(
    train_size=0.80,
    test_size=0.20,
    stratify=train[target],
    random_state=2,
)
train, val = train_test_split(train, **split_kwargs)


# (rows, columns) of the training, validation and test frames, in that order.
tuple(frame.shape for frame in (train, val, test))


((33723, 39), (8431, 39), (28104, 38))

타겟의 비율

In [5]:
# Relative frequency of each class in the training target
# (normalize=True yields proportions instead of raw counts).
train[target].value_counts(normalize=True)


0    0.760935
1    0.239065
Name: vacc_h1n1_f, dtype: float64

가장 큰 범주(0)가 약 76.09%를 차지

클래스가 불균형(imbalanced)한 분류 문제



---



중복된 특성(열)이 있는지 확인

In [11]:
# Transpose so every original column becomes a row, then flag rows that
# repeat an earlier row, i.e. columns whose values duplicate an earlier column.
train.transpose().duplicated()


h1n1_concern                   False
h1n1_knowledge                 False
behavioral_antiviral_meds      False
behavioral_avoidance           False
behavioral_face_mask           False
behavioral_wash_hands          False
behavioral_large_gatherings    False
behavioral_outside_home        False
behavioral_touch_face          False
doctor_recc_h1n1               False
doctor_recc_seasonal           False
chronic_med_condition          False
child_under_6_months           False
health_insurance               False
health_worker                  False
opinion_h1n1_vacc_effective    False
opinion_h1n1_risk              False
opinion_h1n1_sick_from_vacc    False
opinion_seas_vacc_effective    False
opinion_seas_risk              False
opinion_seas_sick_from_vacc    False
agegrp                         False
education_comp                 False
raceeth4_i                     False
sex_i                          False
inc_pov                        False
marital                        False
r

혹시 너무 많은 범주를 가지는 카테고리 특성들이 있는지 카디널리티(cardinality)를 확인

In [12]:
# Summary (count / unique / top / freq) of the non-numeric columns only.
train.describe(exclude='number')


Unnamed: 0,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,agegrp,employment_status,census_msa,employment_industry,employment_occupation,state
count,26637,26602,26568,26519,26466,26413,33723,25466,33723,13714,13714,33723
unique,6,6,6,6,6,6,7,3,3,23,25,51
top,Somewhat Effective,Somewhat Low,Not Very Worried,Somewhat Effective,Somewhat Low,Not At All Worried,65+ Years,Employed,"MSA, Not Principle City",Health Care and Social Assistance Industries,Management Occupations,CALIFORNIA
freq,11710,9980,9229,11571,9057,11813,6821,13722,15202,2451,1769,829


In [13]:
# Same non-numeric summary, one row per column, ordered by number of
# distinct values (lowest cardinality first).
train.describe(exclude='number').transpose().sort_values('unique')


Unnamed: 0,count,unique,top,freq
employment_status,25466,3,Employed,13722
census_msa,33723,3,"MSA, Not Principle City",15202
opinion_h1n1_vacc_effective,26637,6,Somewhat Effective,11710
opinion_h1n1_risk,26602,6,Somewhat Low,9980
opinion_h1n1_sick_from_vacc,26568,6,Not Very Worried,9229
opinion_seas_vacc_effective,26519,6,Somewhat Effective,11571
opinion_seas_risk,26466,6,Somewhat Low,9057
opinion_seas_sick_from_vacc,26413,6,Not At All Worried,11813
agegrp,33723,7,65+ Years,6821
employment_industry,13714,23,Health Care and Social Assistance Industries,2451



state를 제외하고는 카디널리티가 그리 높지 않습니다

카테고리를 많이 가지는 특성들의 범주를 살펴보겠습니다

In [14]:
# Number of training rows per category of the employment_occupation column.
train['employment_occupation'].value_counts()


Management Occupations                                       1769
Office and Administrative Support Occupations                1556
Education, Training, and Library Occupations                 1286
Healthcare Practitioners and Technical Occupations           1200
Sales and Related Occupations                                1108
Business and Financial Operations Occupations                 764
Construction and Extraction Occupations                       538
Production Occupations                                        483
Transportation and Material Moving Occupations                483
Computer and Mathematical Occupations                         475
Food Preparation and Serving Related Occupations              405
Architecture and Engineering Occupations                      367
Arts, Design, Entertainment, Sports and Media Occupations     366
Personal Care and Service Occupations                         353
Community and Social Services Occupations                     335
Building a

특성 엔지니어링

In [15]:
import numpy as np

def engineer(df, max_cardinality=30):
    """Return a feature-engineered copy of ``df``; the input frame is not modified.

    Steps, in order:

    1. Keep only the numeric/object columns whose number of distinct
       non-null values is at most ``max_cardinality``.
    2. Add a ``behaviorals`` column holding the row-wise sum of every column
       whose name contains ``'behavioral'``.
    3. Drop every column whose name contains ``'employment'`` or ``'seas'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform.
    max_cardinality : int, default 30
        Inclusive upper bound on the number of distinct values a column may
        have to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the retained and derived columns.

    Notes
    -----
    * Cardinality is measured on the frame that is passed in, so calling this
      separately on different splits can keep different columns if their
      cardinalities differ.
    * Step 3 matches on the substring ``'seas'``; a label column such as
      ``'vacc_seas_f'`` would therefore be dropped as well.
    """
    # Step 1: filter out high-cardinality columns.
    candidates = df.select_dtypes(include=['number', 'object'])
    cardinality = candidates.nunique()  # distinct-value count per column
    selected_features = cardinality[cardinality <= max_cardinality].index.tolist()
    # Explicit copy: the column assignment below must act on an independent
    # frame, not on a slice of the caller's data (avoids SettingWithCopyWarning).
    df = df[selected_features].copy()

    # Step 2: aggregate the behavioral indicators into one feature.
    # The derived column itself is excluded so that re-applying the function
    # to an already engineered frame does not add the sum into itself.
    behaviorals = [
        col for col in df.columns
        if 'behavioral' in col and col != 'behaviorals'
    ]
    df['behaviorals'] = df[behaviorals].sum(axis=1)

    # Step 3: remove employment- and seasonal-vaccine-related columns.
    dels = [col for col in df.columns if ('employment' in col or 'seas' in col)]
    return df.drop(columns=dels)


# Run the same feature engineering over the training, validation and test frames.
train, val, test = (engineer(frame) for frame in (train, val, test))


데이터에서 타겟과 특성 분리

In [16]:
# Every training column except the label is used as a model feature.
features = train.columns.drop(target)


훈련/검증/테스트 데이터를 특성과 타겟으로 분리

In [17]:
# Separate each dataset into a feature matrix (X) and a label vector (y);
# only features are taken from the test frame.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test = test[features]




---



# 사이킷런 파이프라인

In [19]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 16.2MB/s eta 0:00:01[K     |████████▏                       | 20kB 6.6MB/s eta 0:00:01[K     |████████████▏                   | 30kB 4.4MB/s eta 0:00:01[K     |████████████████▎               | 40kB 4.1MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 2.1MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 2.4MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 2.6MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.2MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [20]:
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


  import pandas.util.testing as tm


In [21]:
# Create each preprocessing step and the classifier as separate objects.
enc = OneHotEncoder()
imp_mean = SimpleImputer()
scaler = StandardScaler()
model_lr = LogisticRegression(n_jobs=-1)

# Training data: fit every step and feed its output into the next one
# (encode -> impute -> scale), then fit the classifier on the result.
X_train_encoded = enc.fit_transform(X_train)
X_train_imputed = imp_mean.fit_transform(X_train_encoded)
X_train_scaled = scaler.fit_transform(X_train_imputed)
model_lr.fit(X_train_scaled, y_train)

# Validation data: reuse the already fitted steps (transform only, no refit).
X_val_encoded = enc.transform(X_val)
X_val_imputed = imp_mean.transform(X_val_encoded)
X_val_scaled = scaler.transform(X_val_imputed)

# score method: Return the mean accuracy on the given test data and labels
print('검증세트 정확도', model_lr.score(X_val_scaled, y_val))

# Test data: apply the same fitted transforms, then predict labels.
X_test_encoded = enc.transform(X_test)
X_test_imputed = imp_mean.transform(X_test_encoded)
X_test_scaled = scaler.transform(X_test_imputed)

y_pred = model_lr.predict(X_test_scaled)


  elif pd.api.types.is_categorical(cols):


검증세트 정확도 0.8185268651405527


파이프라인으로 똑같은 작업 진행시

In [22]:
# Chain encoder -> imputer -> scaler -> classifier into a single estimator,
# so one fit/score/predict call drives all four steps.
pipeline_steps = [
    OneHotEncoder(),
    SimpleImputer(),
    StandardScaler(),
    LogisticRegression(n_jobs=-1),
]
pipe = make_pipeline(*pipeline_steps)
pipe.fit(X_train, y_train)

# Mean accuracy on the validation split.
val_accuracy = pipe.score(X_val, y_val)
print('검증세트 정확도', val_accuracy)

# Predicted labels for the unlabelled test features.
y_pred = pipe.predict(X_test)


  elif pd.api.types.is_categorical(cols):


검증세트 정확도 0.8185268651405527


파이프라인에서 모델의 파라미터 등과 같은 정보를 확인

In [23]:
# Mapping of step name -> step object for the fitted pipeline; the names are
# the ones make_pipeline generated (e.g. 'onehotencoder', 'logisticregression').
pipe.named_steps


{'logisticregression': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=-1, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'onehotencoder': OneHotEncoder(cols=['opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
                     'opinion_h1n1_sick_from_vacc', 'agegrp', 'census_msa'],
               drop_invariant=False, handle_missing='value',
               handle_unknown='value', return_df=True, use_cat_names=False,
               verbose=0),
 'simpleimputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='mean', verbose=0),
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True)}