##### 자료 불러오기

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
# Load the 'penguins' example dataset through seaborn into a pandas DataFrame.
# Every later cell reads and mutates this notebook-global `df`.
df = sns.load_dataset('penguins')

##### 결측치 확인 및 대체

In [2]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [3]:
# Count missing values per column (column-wise sum of the boolean null mask).
df.isnull().sum(axis=0)

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [4]:
# Preview the first three rows of the raw data before any preprocessing.
df.head(n=3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female


In [6]:
# Columns that contain missing values, grouped by how they are imputed.
# Grouping by name replaces the earlier positional rule ("first four entries are
# numeric"), which silently broke if the list was reordered or extended.
numeric_null = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
categorical_null = ['sex']
null_value = numeric_null + categorical_null

# NOTE(review): the fill statistics are computed on the full frame, before the
# train/test split further down, so test rows influence the imputed values.

# Numeric measurements: fill with the column median.
for col in numeric_null:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns: fill with the most frequent observed value rather than a
# hard-coded label.
# NOTE(review): the previous version always filled with 'Male'; confirm the mode
# matches if exact parity with earlier results matters.
for col in categorical_null:
    most_frequent = df[col].mode()
    if not most_frequent.empty:  # an all-null column has no mode; leave it untouched
        df[col] = df[col].fillna(most_frequent.iloc[0])

In [7]:
# Preview the first three rows after the missing values were filled.
df.head(n=3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female


In [8]:
# Re-count missing values per column to confirm that none remain after imputation.
df.isnull().sum(axis=0)

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

##### Label Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder

# Columns whose string categories are replaced by integer codes.
labeled = ['species', 'island', 'sex']

# Keep one fitted encoder per column. Re-fitting a single shared encoder (as
# before) discards the mapping of every column except the last, so predicted
# species codes could not be turned back into names.
# Example: label_encoders['species'].inverse_transform(pred)
label_encoders = {}
for col in labeled:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last column ('sex'), which is
# the same state the shared encoder ended in previously.

# NOTE(review): 'island' and 'sex' are one-hot encoded in the next section, so
# label-encoding them first only turns the dummy column names into island_0,
# sex_1, ... instead of the readable category names.

In [10]:
# Preview the first three rows after label encoding.
df.head(n=3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0


##### One-Hot Encoding

In [11]:
# Mark the integer-coded nominal features as categorical so that the following
# get_dummies call expands them into indicator columns.
for categorical_col in ('island', 'sex'):
    df[categorical_col] = df[categorical_col].astype('category')

In [12]:
# One-hot encode the two categorical features. These are the only category-typed
# columns at this point, so naming them explicitly yields the same frame as
# letting get_dummies pick columns by dtype.
df = pd.get_dummies(df, columns=['island', 'sex'])

In [13]:
# Preview the first three rows after one-hot encoding.
df.head(n=3)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1
0,0,39.1,18.7,181.0,3750.0,False,False,True,False,True
1,0,39.5,17.4,186.0,3800.0,False,False,True,True,False
2,0,40.3,18.0,195.0,3250.0,False,False,True,True,False


##### MinMax Scaling

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Continuous measurements to rescale into the [0, 1] range.
scaled = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split further down, so the min/max include test rows.
minmax = MinMaxScaler()
minmax.fit(df[scaled])

# Write back through the same `scaled` list that was used for fitting, so the
# fitted and overwritten columns cannot drift apart.
df[scaled] = minmax.transform(df[scaled])

In [16]:
# Preview the first three rows after min-max scaling.
df.head(n=3)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1
0,0,0.254545,0.666667,0.152542,0.291667,False,False,True,False,True
1,0,0.269091,0.511905,0.237288,0.305556,False,False,True,True,False
2,0,0.298182,0.583333,0.389831,0.152778,False,False,True,True,False


##### 데이터 분리

In [17]:
from sklearn.model_selection import train_test_split

# Select the target and the features by name instead of by position, so the split
# stays correct even if 'species' stops being the first column.
target = df['species']
features = df.drop(columns='species')

# 80/20 split, stratified on the target so both parts keep the class proportions;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=1
)

In [18]:
# Print the train/test shapes: features on the first line, labels on the second.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(275, 9) (69, 9)
(275,) (69,)


##### 모형 학습

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is fixed (same value as
# the train/test split) so that a fresh Restart & Run All reproduces the result;
# without it the bootstrap/feature sampling differs on every run.
model1 = RandomForestClassifier(random_state=1)
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)

In [20]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default hyper-parameters. The seed is fixed (same value as the
# train/test split) so that repeated runs of the notebook give the same model.
model2 = AdaBoostClassifier(random_state=1)
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)

##### Ensemble

In [21]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble over the random forest and AdaBoost models.
# NOTE(review): per the scikit-learn docs, VotingClassifier fits clones of the
# given estimators, so the already-fitted model1/model2 are not reused here.
# NOTE(review): with only two voters, hard voting ties whenever the models
# disagree; scikit-learn resolves ties by ascending class label. Consider a third
# estimator or soft voting.
clf = VotingClassifier(
    estimators=[('rf', model1), ('ad', model2)],
    voting='hard',
).fit(X_train, y_train)
pred3 = clf.predict(X_test)

##### 모형 평가

In [22]:
from sklearn.metrics import accuracy_score

# Report test-set accuracy for each model, one line per model.
labelled_predictions = (
    ('rf accuracy: ', pred1),
    ('ada accuracy: ', pred2),
    ('voting accuracy: ', pred3),
)
for label, predictions in labelled_predictions:
    print(label, accuracy_score(y_test, predictions))

rf accuracy:  1.0
ada accuracy:  0.9855072463768116
voting accuracy:  1.0
