# 데이터 로드 

In [1]:
import seaborn as sns

# List the example datasets seaborn can provide, then load the penguins table
# and preview its first five rows.
dataset = sns.get_dataset_names()
df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


# 데이터를 처음 로드해오면 결측치 체크를 해야함

In [2]:
# Count the missing values in every column of the freshly loaded frame.
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

### 결측치 처리(대체)
1. 중앙값으로 채우기(연속형 변수) 2. 최빈값(가장 비율이 높은 범주)으로 채우기(범주형 변수)


In [3]:
# 1) Continuous columns: fill the gaps with each column's own median.
missing = ['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']
# fillna() with a Series of medians matches values to columns by label.
df[missing] = df[missing].fillna(df[missing].median())

# 2) Categorical column: fill the gaps with the most frequent category.
# The label is taken from the data rather than hard-coded, so the cell stays
# correct if the class balance of the dataset changes.
sex_counts = df['sex'].value_counts()  # NaN rows are not counted
df['sex'] = df['sex'].fillna(sex_counts.idxmax())

In [4]:
# Re-count the missing values per column to confirm the imputation left none.
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

### 문자열 변수를 머신러닝이 되게끔 라벨인코딩이 필요함
sklearn 모듈 사용

In [7]:
from sklearn.preprocessing import LabelEncoder
# Turn the string-valued columns into integer codes so the estimators can
# consume them.
label = ['species','island' ,'sex']

# Keep one fitted encoder per column: a throw-away encoder would lose the
# code -> original label mapping, which is needed to translate predictions
# back (e.g. encoders['species'].inverse_transform(pred)).
encoders = {}
for col in label:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
3,0,2,44.45,17.3,197.0,4050.0,1
4,0,2,36.7,19.3,193.0,3450.0,0


### 데이터 변환, 더미처리
pandas 사용

라벨인코딩으로 숫자형이 되었지만, 이 숫자의 크기나 순서 자체에는 의미가 없음 ->
범주형 변수로 처리해야 함
범주형으로 변환한 뒤 더미(원-핫) 처리를 하되, 종속변수(species)는 건드리지 않음

In [8]:
import pandas as pd

# Mark the encoded predictors as categorical and expand them into one
# indicator column per level. The target column 'species' is not listed, so it
# keeps its integer codes.
category = ['island','sex']
df = df.astype({name: 'category' for name in category})
df = pd.get_dummies(df)

df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,0,44.45,17.3,197.0,4050.0,0,0,1,0,1
4,0,36.7,19.3,193.0,3450.0,0,0,1,1,0


### 더미처리가 끝나면 파생변수를 만들어야함
머신러닝이 학습할때 성능 면에서 업그레이드 시킬수 있는 여지가 있음

pandas qcut -> 수치형 변수를 분위수(quantile) 기준으로 구간화하여 각 구간에 비슷한 개수의 데이터가 들어가도록 나눠줌 (분포가 정규분포일 필요는 없음)

In [10]:
# Derived feature: the quantile-bin index (0-4) of each penguin's body mass.
mass_bins = pd.qcut(df['body_mass_g'], q=5, labels=False)
df['body_mass_g_qcut'] = mass_bins
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0,1
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0,0
3,0,44.45,17.3,197.0,4050.0,0,0,1,0,1,2
4,0,36.7,19.3,193.0,3450.0,0,0,1,1,0,0


In [11]:
# Count how many rows fall into each body-mass quantile bin.
df['body_mass_g_qcut'].value_counts()
## The rows are split almost evenly across the bins

0    71
1    70
2    68
4    68
3    67
Name: body_mass_g_qcut, dtype: int64

## 스케일링
sklearn.preprocessing 모듈 사용 / 함수 이름이 생각나지 않으면 help() 함수로 확인하기

In [14]:
# help('sklearn.preprocessing')
from sklearn.preprocessing import MinMaxScaler

In [15]:
# Rescale the continuous measurements to the [0, 1] range.
scaler = ['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']

# The scaler object is deliberately NOT called `min`: that name would shadow
# the builtin min() for every later cell in the session.
min_max_scaler = MinMaxScaler()
# NOTE(review): the scaler is fit on the whole frame before the train/test
# split, so test-set minima/maxima influence the training features. Consider
# fitting on the training rows only.
df[scaler] = min_max_scaler.fit_transform(df[scaler])
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,0.254545,0.666667,0.152542,0.291667,0,0,1,0,1,1
1,0,0.269091,0.511905,0.237288,0.305556,0,0,1,1,0,1
2,0,0.298182,0.583333,0.389831,0.152778,0,0,1,1,0,0
3,0,0.449091,0.5,0.423729,0.375,0,0,1,0,1,2
4,0,0.167273,0.738095,0.355932,0.208333,0,0,1,1,0,0


## 전처리의 과정 완료 
### 학습용/테스트용 데이터 분류 
sklearn 모듈 사용

In [24]:
from sklearn.model_selection import train_test_split

# Separate predictors and target by column name, so the split does not
# silently depend on 'species' being the first column of the frame.
features = df.drop(columns=['species'])
target = df['species']

# Hold out 20% of the rows, stratified on the target so each split keeps the
# same class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=1
)

In [27]:
# Report the dimensions of each piece produced by the split.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name, split.shape)

X_train (275, 10)
X_test (69, 10)
y_train (275,)
y_test (69,)


## 데이터 전처리 완료 

# ------------------------------------------------------

# 모형 학습
### 1)랜덤포레스트 2)아다부스트 3) 1,2합친 앙상블(보팅기법)

In [28]:
from sklearn.ensemble import RandomForestClassifier
# Model 1: random forest with default hyper-parameters.
# A fixed random_state makes the fitted forest, and therefore the reported
# predictions, repeatable across runs; it matches the seed used for the split.
model1 = RandomForestClassifier(random_state=1)
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)

In [29]:
# Display the random forest's predicted class codes for the test rows.
pred1

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1], dtype=int64)

In [30]:
from sklearn.ensemble import AdaBoostClassifier
# Model 2: AdaBoost with default hyper-parameters.
# A fixed random_state keeps the result repeatable across runs; it matches the
# seed used for the split.
model2 = AdaBoostClassifier(random_state=1)
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)

In [31]:
# Display the AdaBoost model's predicted class codes for the test rows.
pred2

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 1, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1], dtype=int64)

In [33]:
from sklearn.ensemble import VotingClassifier
# Model 3: majority-vote ensemble built from the two classifiers above.
# NOTE(review): with only two voters under hard voting, every disagreement is
# a tie; confirm the tie-breaking rule in the scikit-learn docs, or consider
# adding a third estimator or using soft voting.
base_models = [('rf', model1), ('ad', model2)]
clf = VotingClassifier(estimators=base_models, voting='hard')
clf.fit(X_train, y_train)
pred3 = clf.predict(X_test)

In [34]:
# Display the voting ensemble's predicted class codes for the test rows.
pred3

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1], dtype=int64)

# 모형 평가
sklearn.metrics의 accuracy_score를 써도 되지만, classification_report는 정확도뿐 아니라 클래스별 정밀도(precision)·재현율(recall)·F1 점수까지 한 번에 계산해 줌

여러 모형 지표가 있음 / 분류와 회귀는 평가지표가 다름 

In [37]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Print the test-set accuracy of each model, in the order they were trained.
for title, predicted in (
    ('rf 정확도', pred1),
    ('ad 정확도', pred2),
    ('voting 정확도', pred3),
):
    print(title, accuracy_score(y_test, predicted))

rf 정확도 1.0
ad 정확도 0.9855072463768116
voting 정확도 1.0


### 하이퍼 파라미터 튜닝 
시험에선 안써도 되긴함

In [47]:
from sklearn.model_selection import GridSearchCV
# Exhaustively try every combination of tree count and depth below, scoring
# each with 3-fold cross-validation on the training split.
parameters = {'n_estimators':[50,100], 'max_depth':[4,6]}
# Seed the forest so the comparison between parameter combinations, and the
# reported best_params_, are repeatable rather than varying from run to run.
model4 = RandomForestClassifier(random_state=1)
# NOTE: this rebinds `clf`, which previously held the voting ensemble.
clf = GridSearchCV(estimator=model4, param_grid=parameters, cv=3)
clf.fit(X_train, y_train)
print('최적의 파라미터', clf.best_params_)

최적의 파라미터 {'max_depth': 4, 'n_estimators': 50}


# 데이터 저장(중요!)
pandas dataframe 만들어야 하고 to_csv 내보내고 read_csv로 확인하기

In [48]:
# Pair each test row's original index with the ensemble prediction and write
# the table to disk. index=False keeps the DataFrame's own row index out of
# the file, so the CSV has exactly two columns.
submission = pd.DataFrame({'id': y_test.index, 'pred': pred3})
submission.to_csv('0030.csv', index=False)

In [49]:
# Read the file back to check what was written to disk.
pd.read_csv('0030.csv')

Unnamed: 0,id,pred
0,57,0
1,173,1
2,213,1
3,50,0
4,25,0
...,...,...
64,171,1
65,129,0
66,287,2
67,13,0


# index=False를 지정하지 않으면 인덱스가 별도의 열로 저장되어 열 개수가 달라지므로 채점이 안 됨!!