## 필요한 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## example 데이터 불러오기

In [4]:
ex = pd.read_csv("example.csv")

In [5]:
# 데이터 확인
ex

Unnamed: 0,번호,나이,키,몸무게,최종학력,연봉,다음기수
0,1,21,170,70,고등학교,3000,O
1,2,24,175,75,대학교,3200,X
2,3,23,180,80,고등학교,3400,X
3,4,22,185,85,대학교,2800,O
4,5,25,160,60,대학원,4000,O
5,6,23,155,55,대학교,3600,X
6,7,22,158,58,대학교,3800,O
7,8,26,166,66,고등학교,3700,O


## 데이터 프레임 정보 확인하기

In [6]:
ex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
번호      8 non-null int64
나이      8 non-null int64
키       8 non-null int64
몸무게     8 non-null int64
최종학력    8 non-null object
연봉      8 non-null int64
다음기수    8 non-null object
dtypes: int64(5), object(2)
memory usage: 576.0+ bytes


# One Hot Encoding

## (1) pandas 모듈 사용

In [7]:
# One-hot encode the 최종학력 column with pandas' get_dummies()
temp = pd.get_dummies(ex['최종학력'])
temp

Unnamed: 0,고등학교,대학교,대학원
0,1,0,0
1,0,1,0
2,1,0,0
3,0,1,0
4,0,0,1
5,0,1,0
6,0,1,0
7,1,0,0


In [8]:
# Build the final frame: put the dummy columns next to the original data,
# then drop the source column that the dummies now replace.
final = pd.concat([ex, temp], axis=1).drop(columns=['최종학력'])

# Show the resulting frame
final

Unnamed: 0,번호,나이,키,몸무게,연봉,다음기수,고등학교,대학교,대학원
0,1,21,170,70,3000,O,1,0,0
1,2,24,175,75,3200,X,0,1,0
2,3,23,180,80,3400,X,1,0,0
3,4,22,185,85,2800,O,0,1,0
4,5,25,160,60,4000,O,0,0,1
5,6,23,155,55,3600,X,0,1,0
6,7,22,158,58,3800,O,0,1,0
7,8,26,166,66,3700,O,1,0,0


In [9]:
# Do the whole thing in a single call:
# pd.get_dummies(data, columns, drop_first)
# data       : the DataFrame
# columns    : the column(s) to one-hot encode
# drop_first : True or False (whether to drop the first dummy level)
pd.get_dummies(ex, columns=['최종학력'], drop_first=False)

Unnamed: 0,번호,나이,키,몸무게,연봉,다음기수,최종학력_고등학교,최종학력_대학교,최종학력_대학원
0,1,21,170,70,3000,O,1,0,0
1,2,24,175,75,3200,X,0,1,0
2,3,23,180,80,3400,X,1,0,0
3,4,22,185,85,2800,O,0,1,0
4,5,25,160,60,4000,O,0,0,1
5,6,23,155,55,3600,X,0,1,0
6,7,22,158,58,3800,O,0,1,0
7,8,26,166,66,3700,O,1,0,0


## (2) sklearn 모듈 사용

In [10]:
# Load the encoder
from sklearn.preprocessing import OneHotEncoder

# Take the column as a NumPy array and reshape it into a single-column
# 2-D array, the shape passed to the encoder below
temp = np.array(ex['최종학력']).reshape(-1, 1)

# Create the encoder and fit it in one step (fit() returns the encoder)
onehot = OneHotEncoder().fit(temp)

# Apply the fitted encoder and convert the result to a dense array
onehot.transform(temp).toarray()


array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

## Label Encoding 사용

In [11]:
# Load the encoder
from sklearn.preprocessing import LabelEncoder

# Create the LabelEncoder
label = LabelEncoder()

# Apply it: the double brackets keep a one-column DataFrame, and apply()
# runs fit_transform on that column, replacing each category with an integer code
temp = ex[['최종학력']].apply(label.fit_transform)

In [12]:
# Build the final frame: the original data without the text column,
# followed by its label-encoded replacement
pd.concat([ex.drop(columns=['최종학력']), temp], axis=1)

Unnamed: 0,번호,나이,키,몸무게,연봉,다음기수,최종학력
0,1,21,170,70,3000,O,0
1,2,24,175,75,3200,X,1
2,3,23,180,80,3400,X,0
3,4,22,185,85,2800,O,1
4,5,25,160,60,4000,O,2
5,6,23,155,55,3600,X,1
6,7,22,158,58,3800,O,1
7,8,26,166,66,3700,O,0


# K-NN

## 데이터 불러오기

In [13]:
wine = pd.read_csv("wine.csv")

In [14]:
wine

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [15]:
wine['quality'].value_counts().sort_index()

3      20
4     163
5    1457
6    2198
7     880
8     175
9       5
Name: quality, dtype: int64

## grade라는 column name으로 quality의 값이 3 ~ 6이면 Good / 7 ~ 9이면 Best로 할당

In [27]:
# Label each wine: quality of 6 or below is 'Good', anything higher is 'Best'
wine['grade'] = np.where(wine['quality'] <= 6, 'Good', 'Best')

In [28]:
wine.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality,grade
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,Good
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,Good
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,Good
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,Good
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,Good


## wine 요약통계량 확인

In [29]:
wine.describe()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


## 표준화

In [30]:
from sklearn.preprocessing import StandardScaler

# scaler 생성
standard_scaler = StandardScaler()

# scaler 학습
standard_scaler.fit(x_data)

# scaler 적용
temp = standard_scaler.transform(x_data)

# 최종 데이터프레임 만들기
x_data = pd.DataFrame(temp, columns = x_data.columns)

## 종속변수, 독립변수 분할

In [31]:
y_data = wine['grade']
x_data = wine.drop(['quality', 'grade'], axis=1)

## train, test 분할

In [32]:
wine.shape

(4898, 13)

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=321,
)

In [23]:
4898 * 0.8

3918.4

In [24]:
# Check the sizes of the two splits (train first, then test), one per line
print(X_train.shape, X_test.shape, sep='\n')

(3918, 11)
(980, 11)


## K-NN 모델 적용

In [46]:
from sklearn.neighbors import KNeighborsClassifier

# Build the k-NN model.
#   n_neighbors : k, the number of neighbours to use
#   p           : 1 (Manhattan) or 2 (Euclidean)
#   weights     : 'uniform' (all equal) or 'distance' (distance-based weights)
#   n_jobs      : -1 uses every processor on this machine (not set here)
model = KNeighborsClassifier(
    n_neighbors=3,
    p=2,
    weights='distance',
)
model
## Displaying the model shows the settings in effect (the default n_neighbors is 5).
## n_neighbors decides how many neighbours (k) are used -- the most important setting.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')

이름에 Classifier 가 붙어 있으므로, 종속변수가 연속형이 아니라 범주형인 분류 모델이라는 것을 알 수 있음.

model

이 상태로 했을 때 default 값. n_neighbors 수는 k 를 몇개로 설정할 것인지. (가장 중요함)


p 는 민코프스키 거리 $d(x, y) = \left(\sum_i |x_i - y_i|^p\right)^{1/p}$ 의 지수이며, p 에 어떤 값을 넣느냐에 따라 거리 계산식이 달라짐.
p=1 이면 맨해튼 거리, p=2 이면 유클리디안 거리로 계산되는데, 어느 쪽이 더 좋은지는 미리 알 수 없으므로 둘 다 시도해 보고 비교해야 함.

weights 는 각 이웃에 부여하는 가중치.
기본값은 uniform 으로, 모든 이웃을 동일하게 취급함. weights 를 distance 로 바꾸면 거리의 역수(1/거리)를 가중치로 사용하는데, 거리가 분모에 오므로 거리가 작을수록(가까울수록) 가중치가 커짐. 즉 더 가까운 점에 더 큰 가중치를 부여해서 더 확실하게 분류하게 해줌.

이제 모델을 만들었다. 이제 학습해주어야함.

In [47]:
# Train the k-NN model: training features first, their target labels second
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')

(fit 의 첫 번째 인자에는 학습용 독립변수 X_train 을, 두 번째 인자에는 target 인 y_train 을 넣어줌. 학습이 끝나면 test 데이터로 검증해야 함.)


In [48]:
X_test

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
3524,5.3,0.36,0.27,6.30,0.028,40.0,132.0,0.99186,3.37,0.40,11.6
1066,6.4,0.30,0.51,5.50,0.048,62.0,172.0,0.99420,3.08,0.45,9.1
3234,6.6,0.25,0.34,3.00,0.054,22.0,141.0,0.99338,3.26,0.47,10.4
4878,6.2,0.53,0.02,0.90,0.035,6.0,81.0,0.99234,3.24,0.35,9.5
4387,7.1,0.21,0.33,1.20,0.039,34.0,97.0,0.99112,3.11,0.75,11.2
...,...,...,...,...,...,...,...,...,...,...,...
2506,6.6,0.32,0.34,7.70,0.044,63.0,212.0,0.99526,3.22,0.48,9.7
2344,7.0,0.23,0.26,7.20,0.041,21.0,90.0,0.99509,3.22,0.55,9.5
4046,7.2,0.17,0.28,17.55,0.050,33.0,154.0,0.99971,2.94,0.43,9.0
3834,6.5,0.41,0.22,4.80,0.052,49.0,142.0,0.99460,3.14,0.62,9.2


In [49]:
# Output the predicted class probabilities for the test rows
model.predict_proba(X_test)

array([[0.        , 1.        ],
       [0.13461226, 0.86538774],
       [0.        , 1.        ],
       ...,
       [0.        , 1.        ],
       [0.28820313, 0.71179687],
       [0.2765436 , 0.7234564 ]])

항상 1. 모델생성, 2.모델학습 3.예측 순서대로 진행

In [50]:
# Predict a class for each test row using the trained model
y_pred = model.predict(X_test)
y_pred

array(['Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Best', 'Good', 'Good', 'Good', 'Best', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Best', 'Best', 'Good', 'Good', 'Good', 'Best',
       'Good', 'Best', 'Good', 'Best', 'Best', 'Good', 'Good', 'Best',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Best',
       'Good', 'Good', 'Best', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Best', 'Good', 'Best', 'Best', 'Good', 'Best', 'Good',
       'Best', 'Good', 'Good', 'Best', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Best', 'Good', 'Good', 'Best', 'Good', 'Good', 'Good',
       'Best', 'Good', 'Best', 'Good', 'Good', 'Good', 'Best', 'Good',
       'Best', 'Good', 'Best', 'Good', 'Good', 'Best', 'Good', 'Good',
      

모델이 예측을 제대로 했는지 안했는지 확인해주어야함.

In [51]:
# Accuracy: compare the predictions with the true labels element-wise
# (True where they match, False otherwise). True counts as 1 and False as 0,
# so the mean of the comparison is the fraction of correct predictions.

(y_pred == y_test).mean()


0.8336734693877551

이전에는 weights 를 기본값(uniform)으로 두었을 때 정확도가 약 80%였는데, weights 를 distance 로 바꾸니 약 83%로 올라감. 이렇게 설정을 바꿔 가며 정확도를 높이는 방향으로 모델을 계속 수정해 나가야 함.

## QUIZ
#### 지금은 k를 3으로 했는데, k 가 다름에 따라 정확도가 달라짐.
#### k를 1부터 20까지 돌리면서 각 k에 해당하는 정확도 출력해서 가장 높은 정확도의 k를 찾아내기가 quiz

가로축을 k, 세로축을 정확도로 해서 시각화해보기. 예를 들어 이 test set 에서 k=13 의 정확도가 가장 높게 나오더라도, 실제로는 k=10 을 쓰는 편이 다른 test set 에 적용했을 때도 잘 맞을 가능성이 큼 (특정 test set 에 과적합되는 것을 피하기 위함).
plt.grid(True) 를 이용하면 그래프에 격자를 표시해서 값을 읽기 쉽게 만들 수 있음.

In [52]:
# k값에 따른 정확도 확인


In [53]:
# k값에 따른 정확도 시각화
