## target: 팽귄의 서식지 (island)

In [16]:
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import RandomForest

In [2]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/datasets/penguins.csv")

In [3]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


### 결측치 제거

In [4]:
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [5]:
re = df.dropna(inplace=True)
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [6]:
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

### 타겟 - 레이블 인코딩

In [10]:
labels = []
encoder = LabelEncoder()
items = list(df['island'])
encoder.fit(items)
labels = encoder.transform(items)
df['island'] = labels

In [11]:
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,2,39.1,18.7,181.0,3750.0,MALE
1,Adelie,2,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,2,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,2,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,2,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,0,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,0,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,0,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,0,45.2,14.8,212.0,5200.0,FEMALE


### species - 원핫인코딩 (종, 성별)

In [12]:
df = pd.get_dummies(df)

In [13]:
df

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Adelie,species_Chinstrap,species_Gentoo,sex_FEMALE,sex_MALE
0,2,39.1,18.7,181.0,3750.0,1,0,0,0,1
1,2,39.5,17.4,186.0,3800.0,1,0,0,1,0
2,2,40.3,18.0,195.0,3250.0,1,0,0,1,0
4,2,36.7,19.3,193.0,3450.0,1,0,0,1,0
5,2,39.3,20.6,190.0,3650.0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
338,0,47.2,13.7,214.0,4925.0,0,0,1,1,0
340,0,46.8,14.3,215.0,4850.0,0,0,1,1,0
341,0,50.4,15.7,222.0,5750.0,0,0,1,0,1
342,0,45.2,14.8,212.0,5200.0,0,0,1,1,0


### 모델: DecisionTree

In [14]:
X = df.drop(['island'], axis=1)
y = df['island']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
print("모델의 정확도: ", round(dtc.score(X_test, y_test), 4))

모델의 정확도:  0.6866


### StandardScaler

In [18]:
from sklearn.preprocessing import StandardScaler

#  Standardization 평균 0 / 분산 1
std = StandardScaler()

In [19]:
std.fit(X_train)
X_train_scaled = std.transform(X_train)
X_test_scaled = std.transform(X_test)
dtc.fit(X_train_scaled, y_train)

print('모델의 정확도 :', round(dtc.score(X_test_scaled, y_test), 4))

모델의 정확도 : 0.6866


In [20]:
X_train

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Adelie,species_Chinstrap,species_Gentoo,sex_FEMALE,sex_MALE
107,38.2,20.0,190.0,3900.0,1,0,0,0,1
6,38.9,17.8,181.0,3625.0,1,0,0,1,0
19,46.0,21.5,194.0,4200.0,1,0,0,0,1
53,42.0,19.5,200.0,4050.0,1,0,0,0,1
52,35.0,17.9,190.0,3450.0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
284,45.8,14.2,219.0,4700.0,0,0,1,1,0
263,49.6,15.0,216.0,4750.0,0,0,1,0,1
137,40.2,20.1,200.0,3975.0,1,0,0,0,1
256,42.6,13.7,213.0,4950.0,0,0,1,1,0


In [21]:
X_train_scaled

array([[-1.05397698,  1.42807536, -0.76903935, ..., -0.7271421 ,
        -0.99250926,  0.99250926],
       [-0.92586776,  0.30676891, -1.41187461, ..., -0.7271421 ,
         1.00754728, -1.00754728],
       [ 0.37352576,  2.19260249, -0.48333479, ..., -0.7271421 ,
        -0.99250926,  0.99250926],
       ...,
       [-0.68795064,  1.47904384, -0.05477794, ..., -0.7271421 ,
        -0.99250926,  0.99250926],
       [-0.24871903, -1.78293858,  0.87376188, ...,  1.37524701,
         1.00754728, -1.00754728],
       [ 0.39182707,  0.51064281, -1.62615303, ..., -0.7271421 ,
         1.00754728, -1.00754728]])

### MinMaxScaler

In [22]:
from sklearn.preprocessing import MinMaxScaler

#  MinMaxScaler 최솟값 0 / 최댓값 1
mms = MinMaxScaler()

In [23]:
# 교차검증
mms.fit(X_train)
X_train_scaled = mms.transform(X_train)
X_test_scaled = mms.transform(X_test)
dtc.fit(X_train_scaled, y_train)

print('모델의 정확도 :', round(dtc.score(X_test_scaled, y_test), 4))

모델의 정확도 : 0.6866


### MaxAbsScaler

In [24]:
from sklearn.preprocessing import MaxAbsScaler

#절댓값
scaler = MaxAbsScaler()

In [25]:
#교차검증시
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

dtc.fit(X_train_scaled, y_train)
print('모델의 정확도 :', round(dtc.score(X_test_scaled, y_test), 4))

모델의 정확도 : 0.7015


### RobustScaler

In [26]:
# RobustScaler: 중앙값 0, IQR=1, 이상치 영향 최소화하고 더 넓게 분포
from sklearn.preprocessing import RobustScaler

#중앙값0/IQR 1
rbs = RobustScaler()

In [27]:
#교차검증시
X_train_scaled = rbs.fit_transform(X_train)
X_test_scaled = rbs.transform(X_test)
dtc.fit(X_train_scaled, y_train)

print('모델의 정확도 :', round(dtc.score(X_test_scaled, y_test), 4))

모델의 정확도 : 0.6716


## 모델: RandomForest

In [28]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print("모델의 정확도: ", round(rfc.score(X_test, y_test), 4))

모델의 정확도:  0.6567


In [29]:
from sklearn.preprocessing import MaxAbsScaler

#절댓값
scaler = MaxAbsScaler()

In [30]:
#교차검증시
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

dtc.fit(X_train_scaled, y_train)
print('모델의 정확도 :', round(dtc.score(X_test_scaled, y_test), 4))

모델의 정확도 : 0.7015
