In [1]:
# BInary Classificatio의 대표적인 2개의 문제를 한번 구현해보자
# 1. 위스콘신 유방암 데이터
# 2. Titanic

In [25]:
# 위스콘신 유방암 데이터
# 이 데이터는 sklearn이 제공
# sklearn과 tensorflow를 이용함

import numpy as np
from sklearn import linear_model   # LogisticRegression()
from sklearn.datasets import load_breast_cancer  # 데이터 로딩을 위한 함수
from sklearn.model_selection import train_test_split  # 학습데이터와 평가데이터로 분리
from sklearn.model_selection import cross_val_score  # cross_validation 하기 위해 필요

# raw data loading
# cancer = load_breast_cancer()
print(type(cancer))  # <class 'sklearn.utils.Bunch'>
                     # sklearn이 데이터를 표현하기 위해 사용하는 자료구조
                     # python의 dictionary와 유사한 구조
# print(cancer)
# data라는 속성과 target이라는 속성을 가지고 있고
# data라는 속성이 독립변수, target이 종속변수

print(cancer.data.shape, cancer.target.shape)  # (569, 30) (569,)
print(np.unique(cancer.target, return_counts=True))
# array([0, 1]), array([212, 357]

# 유방암 데이터에 대한 상세 내용
# print(cancer.DESCR)
# :Missing Attribute Values: None
# WDBC-Malignant(악성)-0, WDBC-Benign(정상)-1
# Class Distribution: 212 - Malignant, 357 - Benign

# data set
x_data = cancer.data
t_data = cancer.target

# Hold-out validation을 위해서 train, validation으로 분리
train_x_data, test_x_data, train_t_data, test_t_data = \
train_test_split(x_data,
                 t_data,
                 test_size = 0.2,
                 random_state=2,
                 stratify=t_data)
print(train_x_data.shape, train_t_data.shape)  # (455, 30) (455,)
print(np.unique(train_t_data, return_counts=True))
# (array([0, 1]), array([170, 285]

# Model 생성
model = linear_model.LogisticRegression()

# K-Fold cross validation
test_score = cross_val_score(model, x_data, t_data, scoring='accuracy', cv=5)
print(test_score)  # [0.92982456 0.93859649 0.95614035 0.92982456 0.96460177]
print(test_score.mean())  # 0.943797546964757 (확률이 94프로라는것)

# Hold-out 방식으로 validation
model.fit(train_x_data, train_t_data)
test_score = model.score(test_x_data, test_t_data)  # 따로 옵션을 주지않으면 score는 accuracy를 구함
print(test_score)  # 0.9736842105263158

# sklearn 방식을 이용해보았음

<class 'sklearn.utils.Bunch'>
(569, 30) (569,)
(array([0, 1]), array([212, 357], dtype=int64))
(455, 30) (455,)
(array([0, 1]), array([170, 285], dtype=int64))
[0.92982456 0.93859649 0.95614035 0.92982456 0.96460177]
0.943797546964757
0.9736842105263158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [21]:
# 위의 데이터를 tensorflow로 구현
import tensorflow as tf

# tensorflow 그래프를 그려보자

# placeholder
X = tf.placeholder(shape=[None,30], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight & bias
W = tf.Variable(tf.random.normal([30,1]))
b = tf.Variable(tf.random.normal([1]))

# Hypothesis, model, predict model, Logistic Regression Model
logit = tf.matmul(X,W) + b
H = tf.sigmoid(logit)

# cross entropy(loss function)
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit,
                                                              labels=T))

# 한번 수행하면 W,b가 한번씩 좋아짐, 여러번 반복하면 여러번 좋아짐
# train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# session, 초기화
sess = tf.Session()
sess.run(tf.global_variables_initializer())  # 초기화 작업

# 반복학습
# 전체 데이터를 이용해서 1번 학습 => 1 epoch(에폭)
for step in range(100000):
    # train 노드랑 loss 노드 실행
    _, loss_val = sess.run([train, loss], feed_dict={X: train_x_data,
                                                     T: train_t_data.reshape(-1,1)})
    # train_t_data는 1차원인데 위에서 T를 2차원으로 잡아놨으니 맞춰야함
    
    if step % 10000 == 0:
        print('loss value : {}'.format(loss_val))
        


loss value : 146.30230712890625
loss value : 0.3728635609149933
loss value : 0.3494783043861389
loss value : 0.34985828399658203
loss value : 0.34732216596603394
loss value : 0.34367385506629944
loss value : 0.3396242558956146
loss value : 0.33554983139038086
loss value : 0.33155977725982666
loss value : 0.32773149013519287


In [19]:
# 정확도(accuracy) 측정

# validation data(test_x_data, test_t_data)를 이용해서 정확도를 측정
predict = tf.cast(H >= 0.5, dtype=tf.float32)  # True -> 1.0
                                                # False -> 0.0
# 예측값과 입력받은 정답이 일치하는지
correct = tf.equal(predict, T)       # True, False, False, True, ...
accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))   # 1, 0, 0, 1

accuracy_val = sess.run(accuracy, feed_dict={X : test_x_data,
                                             T : test_t_data.reshape(-1,1)})
print('Accuracy : {}'.format(accuracy_val))  # 0.9649122953414917

Accuracy : 0.9649122953414917


In [None]:
# titanic 예제로 Logistic Regression 구현
# 케글에서 데이터셋을 받아서 나온 결과를 kaggle에 업로드해서
# 우리 모델의 정확도를 평가받아 보자!