<a href="https://colab.research.google.com/github/linusms/Hands-on/blob/main/chapter_3_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
  !sudo apt-get install -y fonts-nanum
  !sudo fc-cache -fv
  !rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 20 not upgraded.
Need to get 9,604 kB of archives.
After this operation, 29.5 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 fonts-nanum all 20170925-1 [9,604 kB]
Fetched 9,604 kB in 0s (22.5 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype

In [None]:
import matplotlib.pyplot as plt

plt.rc('font', family='NanumBarunGothic') 

In [None]:
## 1. 97% 정확도의 MNIST 분류기

from sklearn.datasets import fetch_openml

mnist=fetch_openml('mnist_784', version=1, as_frame=False)

In [None]:
X, y= mnist['data'], mnist['target']
X.shape

(70000, 784)

In [None]:
X_train, X_test, y_train, y_test=X[:60000], X[60000:], y[:60000], y[60000:]

In [None]:
print(type(y_train))

<class 'numpy.ndarray'>


In [None]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# 오류. 그리고 애초에 KNeighborsClassifier는 다중 레이블 분류 지원하므로
# 그냥 분류기 돌려도 자동으로 다중 레이블 설정됨
y_multilabel=np.empty((1,1),int)
for i in range(len(set(y_train.tolist()))):
  y_train_a=(y_train==i)
  y_multilabel=np.hstack((y_train, y_multilabel))

print(y_multilabel)

ValueError: ignored

In [None]:
## 실행하지 말 것. 시간 오래 걸림

from sklearn.model_selection import GridSearchCV

# KNeighborsClassifier()의 여러 파라미터들 중 가중치를 계산하는 방법인 weights,
# 검색할 이웃의 수인 n_neighbors를 이용
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

# grid_search의 사용. verbose는 각 수행당 출력 메세지 표시 정도를 나타냄
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

In [None]:
# 위 코드 실행시 나오는 결과값

grid_search.best_params_

# {'n_neighbors': 4, 'weights': 'distance'}

grid_search.best_score_

# 0.9716166666666666

from sklearn.metrics import accuracy_score

y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)

# 0.9714

In [None]:
# 2. 데이터 증식(훈련 세트 확장)

# 이미지를 픽셀 단위로 옮길 수 있는 함수
from scipy.ndimage.interpolation import shift

# cval: 옮겨진 후 새로 생성된 픽셀들에 대해서 채울 값 지정
def shift_image(image, dx, dy):
    # 입력된 1*784 형태의 픽셀 정보들을 28*28의 이미지 형태로 재배열
    # 재배열된 이미지형태의 정보를 shift 함수로 이동 
    image = image.reshape((28, 28))
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")

    # reshape 함수는 배열의 행,열 개수를 재설정해주는 함수
    # (-1)은 1*n의 배열로 만든다는 뜻 / (-1,k)는 (n/k)*k
    return shifted_image.reshape([-1])


image = X_train[1000]
# 5만큼 y방향(위), -5만큼 x방향(왼쪽)
# 실제로 작동되는 모습 보여줌
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

plt.figure(figsize=(12,3))
plt.subplot(131)
plt.title("Original", fontsize=14)
plt.imshow(image.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.subplot(132)
plt.title("Shifted down", fontsize=14)
plt.imshow(shifted_image_down.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.subplot(133)
plt.title("Shifted left", fontsize=14)
plt.imshow(shifted_image_left.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()

In [None]:
# 픽셀 이동 데이터들이 추가될 새로운 훈련 데이터셋, 훈련 데이터셋 레이블 만듬  

X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]

# 1픽셀만큼 오른쪽/왼쪽/위/아래로 이동 후 기존 리스트에 append
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)

# 리스트로 만들었던 데이터셋을 array 자료형으로 변경
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# permutation() : 무작위 배열 만들기
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

# gridsearch로 찾은 최적의 파라미터들을 모두 인자로 받게 코드 작성
knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
# 3. 타이타닉 데이터셋 도전

from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
import os
os.getcwd()

'/content'

In [1]:
import pandas as pd
import numpy as np

train_data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/titanic/train.csv')
test_data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/titanic/test.csv')

# Survived가 타겟 데이터(생존 여부, 우리가 측정해야 하는 값)
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [2]:
train_data.info()

# Age, Cabin, Embarked는 결측치 존재

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# 범주형 데이터 확인

train_data['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [5]:
train_data['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [6]:
train_data['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
train_data['Embarked'].value_counts()


S    644
C    168
Q     77
Name: Embarked, dtype: int64

**범주형 데이터의 결측값 최빈값으로 채우는 변환기 (유용!!!)**

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
 

# 범주형 데이터에서 최빈값을 찾고(value.counts()로 반환된 series에서 첫번째 행의 인덱스 값)
# fit()은 그 값을 most_frequent_ 속성에 저장
# transform()시 most_frequent_ 속성에 저장된 최빈값을 결측치에 채워넣음
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [9]:
# 희소행렬 옵션을 False로 두는 이유
# 각 특성에 대한 y/n를 쉽게 보기 위해
# 그리고 일부 알고리즘의 경우 희소행렬을 입력값으로 못받는 경우도 있음

cat_pipeline = Pipeline([
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
    ])

In [19]:
# 각 열에 다른 변환기 적용하는 ColumnTransformer
# 2개 이상의 변환 특정 열들에 적용 : 순차적으로 적용하면 X!
# 순차적으로 진행되는 변환기 하나의 파이프라인으로 묶고, 
# 그 파이프라인을 ColumnTransformer에 삽입(괄호 없이)

from sklearn.compose import ColumnTransformer

var1=['Age','SibSp','Parch','Fare']
var2=['Pclass','Sex','Embarked']

pipeline=ColumnTransformer([('num_pipeline',num_pipeline,var1),
    ('cat_pipeline', cat_pipeline, var2)
])

In [20]:
X_train=pipeline.fit_transform(train_data)
print(X_train)
X_train.shape

[[22.  1.  0. ...  0.  0.  1.]
 [38.  1.  0. ...  1.  0.  0.]
 [26.  0.  0. ...  0.  0.  1.]
 ...
 [28.  1.  2. ...  0.  0.  1.]
 [26.  0.  0. ...  1.  0.  0.]
 [32.  0.  0. ...  0.  1.  0.]]


(891, 12)

In [41]:
y_train=train_data['Survived']
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [42]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train, y_train)

SVC(gamma='auto')

In [43]:
X_test = pipeline.transform(test_data)
y_pred = svm_clf.predict(X_test)

In [44]:
from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.7329588014981274

In [45]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8126466916354558

In [48]:
from sklearn.linear_model import LinearRegression

lin_clf=LinearRegression()
lin_scores=cross_val_score(lin_clf, X_train, y_train, cv=10)
lin_scores.mean()

0.3625102333415448

In [46]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf=KNeighborsClassifier(n_neighbors=5)
knn_scores=cross_val_score(knn_clf, X_train, y_train, cv=10)
knn_scores.mean()

0.7150187265917602

In [47]:
from sklearn.linear_model import SGDClassifier

sgd_clf=SGDClassifier(random_state=42)
sgd_scores=cross_val_score(knn_clf,X_train,y_train, cv=10)
sgd_scores.mean()


0.7150187265917602

In [49]:
# 최적의 파라미터 : n_estimators=30

from sklearn.model_selection import GridSearchCV

grid_search=GridSearchCV(forest_clf, [{'n_estimators':[x*10 for x in range(1,11)]}], cv=10)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'n_estimators': 30}
0.8182521847690387


In [54]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=30, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

forest_clf.fit(X_train,y_train)

RandomForestClassifier(n_estimators=30, random_state=42)

In [72]:
# titanic prediction submit

X_test=pipeline.fit_transform(test_data)

answer=forest_clf.predict(X_test)

a=pd.DataFrame(answer, columns=['Survived'])
submit=pd.concat([test_data['PassengerId'],a], axis=1)
submit.to_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/titanic/submit.csv', index=False)

In [50]:
# 특성을 재조합, 추가하면 연관성이 더 뚜렷하게 나타날 수 있다.
# 나이대를 조사한 AgeBucket, 가족 수를 조사한 RelativesOnBoard

train_data["AgeBucket"] = train_data["Age"] // 15 * 15
train_data[["AgeBucket", "Survived"]].groupby(['AgeBucket']).mean()

Unnamed: 0_level_0,Survived
AgeBucket,Unnamed: 1_level_1
0.0,0.576923
15.0,0.362745
30.0,0.423256
45.0,0.404494
60.0,0.24
75.0,1.0


In [11]:
train_data["RelativesOnboard"] = train_data["SibSp"] + train_data["Parch"]
train_data[["RelativesOnboard", "Survived"]].groupby(['RelativesOnboard']).mean()

Unnamed: 0_level_0,Survived
RelativesOnboard,Unnamed: 1_level_1
0,0.303538
1,0.552795
2,0.578431
3,0.724138
4,0.2
5,0.136364
6,0.333333
7,0.0
10,0.0


In [39]:
# 열이 추가되었고, 추가된 열이 범주형 데이터를 가지므로
# 다시 데이터 전처리

from sklearn.compose import ColumnTransformer

var2_new=['Pclass','Sex','Embarked','AgeBucket','RelativesOnboard']

pipeline_new=ColumnTransformer([
    ('cat_pipeline', cat_pipeline, var2_new),
])

print(train_data.shape)
X_train_new=pipeline_new.fit_transform(train_data)


(891, 14)


In [35]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBucket,RelativesOnboard
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,15.0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,30.0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,15.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,30.0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,30.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,15.0,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,15.0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,,3
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,15.0,0


In [15]:
y_train=train_data['Survived']
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [40]:
from sklearn.model_selection import GridSearchCV

grid_search=GridSearchCV(forest_clf, [{'n_estimators':[x*10 for x in range(1,11)]}], cv=10)
grid_search.fit(X_train_new, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'n_estimators': 30}
0.8070037453183521


In [34]:
# 질문 : 왜 Age->AgeBucket / Parch,SibSp -> RelativesOnboard로 변경했을때 정확도가 떨어지는가?
# 나중에 개선해볼 것.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(n_estimators=30, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train_new, y_train, cv=10)
forest_scores.mean()

0.8070037453183521

In [None]:
# 4. 스팸 분류기 만들기

