In [32]:
# Admission 예제

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
import matplotlib.pyplot as plt

# python 구현을 위한 수치미분함수가 필요!
# 수치미분함수(for python)
def numerical_derivative(f,x):
    """Estimate the gradient of ``f`` at ``x`` using central differences.

    Every element of ``x`` is perturbed in place by ``+delta_x`` and
    ``-delta_x``, ``f`` is evaluated on the perturbed array each time, and the
    element is then put back to its original value.

    Args:
        f: Callable invoked as ``f(x)``; its result is treated as a scalar
           (here: the loss function).
        x: NumPy array holding all parameters, e.g. ``[W, b]``. It must have a
           floating-point dtype: the perturbed value is written back into
           ``x`` itself, so an integer array would truncate the perturbation.

    Returns:
        An array shaped like ``x`` (created with ``np.zeros_like``) whose
        entries are the estimated partial derivatives of ``f``.

    Raises:
        Whatever ``f`` raises. ``x`` is restored to its original contents
        before the exception propagates.
    """
    delta_x = 1e-4
    derivative_x = np.zeros_like(x)    # e.g. [0 0]

    it = np.nditer(x, flags=['multi_index'])

    while not it.finished:

        idx = it.multi_index   # index of the current element, as a tuple

        # Remember the current value: the array is modified in place to
        # evaluate f, and must be restored so the partial derivative with
        # respect to the next variable is computed from unperturbed values.
        tmp = x[idx]

        try:
            x[idx] = tmp + delta_x
            fx_plus_delta = f(x)     # f(x + delta_x)

            x[idx] = tmp - delta_x
            fx_minus_delta = f(x)    # f(x - delta_x)
        finally:
            # Restore even when f raises (or the run is interrupted), so the
            # caller's parameters are never left in a perturbed state.
            x[idx] = tmp

        derivative_x[idx] = (fx_plus_delta - fx_minus_delta) / (2 * delta_x)

        it.iternext()

    return derivative_x

# Load the raw data
df = pd.read_csv('./data/admission.csv')
# display(df)

# --- Preprocessing ---
# 1. Missing values: checked with the line below; the author reports none.
# print(df.isnull().sum())

# 2. Outliers: first inspected visually with box plots (kept for reference).
# fig = plt.figure()
# fig_gre = fig.add_subplot(1,3,1)
# fig_gpa = fig.add_subplot(1,3,2)
# fig_rank = fig.add_subplot(1,3,3)
# fig_gre.boxplot(df['gre'])
# fig_gpa.boxplot(df['gpa'])
# fig_rank.boxplot(df['rank'])

# fig.tight_layout()
# plt.show()

# Outliers are present, so remove them with a z-score cut-off.
zscore_threshold = 2.0  # author's note: a value of 2.0 or lower works best

# Every column of the frame is visited in turn. Because `df` is re-assigned
# inside the loop, each column's z-scores are computed on the rows that
# survived the filtering of the previous columns.
for col in df.columns:
    col_zscore = np.abs(stats.zscore(df[col]))
    outlier = df[col][col_zscore > zscore_threshold]
    df = df.loc[~df[col].isin(outlier)]

# display(df)   # author's note: 382 rows × 4 columns remain

# Outliers are gone; next comes normalisation.
# Features are every column except the label 'admit'; the label is reshaped
# into a single-column 2-D array so it lines up with the model output.
x_data = df.drop('admit', axis=1).values
# print(x_data)
t_data = df['admit'].values.reshape(-1,1)
# print(t_data)

# scikit-learn is trained on the un-normalised data, while the NumPy and
# TensorFlow implementations are trained on the min-max scaled features.
scaler_x = MinMaxScaler()
norm_x_data = scaler_x.fit_transform(x_data)   # for the NumPy / TensorFlow models
# print(norm_x_data)

# --- NumPy implementation: initial parameters ---
# W has shape (3, 1): 3 rows to match the number of feature columns for the
# matrix product, 1 column to match the number of columns of t_data.
W = np.random.rand(3,1)
b = np.random.rand(1)

# loss function
def loss_func(input_obj):
    """Summed binary cross-entropy of the logistic model on the training data.

    Reads the module-level arrays ``norm_x_data`` (features) and ``t_data``
    (labels).

    Args:
        input_obj: 1-D parameter vector ``[w1 w2 w3 b]``; every entry but the
            last is a weight, the last entry is the bias.

    Returns:
        The cross-entropy summed (not averaged) over all samples.
    """
    eps = 1e-7    # small offset so log() is never evaluated at exactly 0

    weights = input_obj[:-1].reshape(-1,1)
    bias = input_obj[-1:]

    # Logistic model: sigmoid of the affine map.
    logits = norm_x_data.dot(weights) + bias
    prob = 1 / (1 + np.exp(-logits))

    positive_term = t_data * np.log(prob + eps)
    negative_term = (1 - t_data) * np.log(1 - prob + eps)
    return -np.sum(positive_term + negative_term)

# Step size for gradient descent
learning_rate = 1e-4

for step in range(300000):
    # Pack the current parameters into one flat vector: [w1 w2 w3 b]
    input_param = np.hstack((W.ravel(), b.ravel()))
    derivative_result = numerical_derivative(loss_func, input_param) * learning_rate

    # Unpack the scaled gradient: all but the last entry update W (as a
    # column vector), the last entry updates b.
    W = W - derivative_result[:-1].reshape(-1,1)
    b = b - derivative_result[-1:]

    # Report only on every 30000th step.
    if step % 30000 != 0:
        continue

    input_param = np.hstack((W.ravel(), b.ravel()))
    print('W : {}, b : {}, loss : {}'.format(W.ravel(), b, loss_func(input_param)))            

W : [0.13017394 0.97008634 0.0926367 ], b : [0.66002466], loss : 428.53800881681093
W : [ 1.07515469  1.12942495 -1.61511996], b : [-1.25304261], loss : 221.2156001000295
W : [ 1.07526591  1.12942595 -1.61508269], b : [-1.25312616], loss : 221.21560006866997
W : [ 1.07526592  1.12942595 -1.61508268], b : [-1.25312617], loss : 221.21560006866997
W : [ 1.07526592  1.12942595 -1.61508268], b : [-1.25312617], loss : 221.21560006866997
W : [ 1.07526592  1.12942595 -1.61508268], b : [-1.25312617], loss : 221.21560006866997
W : [ 1.07526592  1.12942595 -1.61508268], b : [-1.25312617], loss : 221.21560006866997
W : [ 1.07526592  1.12942595 -1.61508268], b : [-1.25312617], loss : 221.21560006866997
W : [ 1.07526592  1.12942595 -1.61508268], b : [-1.25312617], loss : 221.21560006866997
W : [ 1.07526592  1.12942595 -1.61508268], b : [-1.25312617], loss : 221.21560006866997


In [33]:
# predict
def logistic_predict(x):
    """Classify one or more samples with the trained NumPy logistic model.

    Uses the module-level ``W`` and ``b`` as they are bound at call time.
    A later cell in this notebook rebinds both names to TensorFlow variables,
    so this function is meant to be called before that cell runs.

    Args:
        x: Scaled feature array; one row per sample, with as many columns as
           ``W`` has rows.

    Returns:
        A tuple ``(result, y)`` where ``y`` is the sigmoid output and
        ``result`` is the class label: 0 where ``y < 0.5``, otherwise 1.
        For a single sample ``result`` is a plain ``int``; for several
        samples it is an integer array with the same shape as ``y``.
    """
    z = np.dot(x,W) + b
    y = 1 / ( 1 + np.exp(-1*z))

    # Element-wise threshold: a bare `if y < 0.5` only works when y holds a
    # single value and raises ValueError for a batch of samples.
    labels = np.where(y < 0.5, 0, 1)

    if labels.size == 1:
        # Single sample: return a plain int.
        result = int(labels.item())
    else:
        result = labels

    return result, y

# Score one applicant. The raw feature values are first scaled with the
# scaler fitted on the training features (presumably in the order
# gre, gpa, rank -- confirm against the CSV column order).
my_score = np.array([600, 3.8, 1])
scaled_my_score = scaler_x.transform(my_score.reshape(1,-1))
result = logistic_predict(scaled_my_score)
print(result)   # (1, array([[0.57333869]]))

(1, array([[0.57333869]]))


In [34]:
# scikit-learn implementation

# Trained on the raw (un-normalised) features; the labels are flattened to 1-D.
model = linear_model.LogisticRegression().fit(x_data, t_data.ravel())

# One applicant as a single-row 2-D array, passed unscaled.
my_score = np.array([[600, 3.8, 1]])
predict_result, predict_proba = model.predict(my_score), model.predict_proba(my_score)
print(predict_result, predict_proba)   
# [1] [[0.43740782 0.56259218]]

[1] [[0.43740782 0.56259218]]


In [35]:
# TensorFlow implementation
# Written against the TF1 graph/session API (tf.placeholder, tf.Session,
# tf.train.GradientDescentOptimizer).

# Placeholders for the features (3 columns) and the labels (1 column);
# the number of rows is left open.
X = tf.placeholder(shape=[None,3], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight & bias
# NOTE(review): this rebinds the names W and b that the NumPy cells above
# assigned, and which logistic_predict reads as globals; re-running the
# NumPy prediction after this cell will no longer use the NumPy parameters.
W = tf.Variable(tf.random.normal([3,1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='bias')

# Hypothesis: sigmoid of the affine map
logit = tf.matmul(X,W) + b
H = tf.sigmoid(logit)

# Loss function (cross entropy), computed from the logits.
# NOTE(review): this is a mean over samples, whereas loss_func in the NumPy
# implementation is a sum, so the same learning rate gives a smaller effective
# step here. The recorded output shows the loss still decreasing at the last
# logged step -- confirm whether more steps or a larger rate is intended.
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit,
                                                              labels=T))
# Training op: plain gradient descent on the loss
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# Session & variable initialisation
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training: every step feeds the full normalised training set.
for step in range(300000):
    
    _, W_val, b_val, loss_val = sess.run([train,W,b,loss], 
                                         feed_dict={X:norm_x_data,
                                                    T:t_data})
    # Log the fetched parameters and loss on every 30000th step.
    if step % 30000 == 0:
        print('W : {}, b: {}, loss : {}'.format(W_val.ravel(),b_val,loss_val))

W : [ 0.10800714  0.96114427 -1.0171901 ], b: [-0.8140015], loss : 0.5898090600967407
W : [ 0.13135205  0.96304756 -1.0655267 ], b: [-0.84222215], loss : 0.5885522365570068
W : [ 0.16077657  0.97238153 -1.1050339 ], b: [-0.8568901], loss : 0.5876408815383911
W : [ 0.19109291  0.98311037 -1.1403624 ], b: [-0.86737126], loss : 0.5868446826934814
W : [ 0.22053573  0.9938392  -1.1725489 ], b: [-0.87631196], loss : 0.5861404538154602
W : [ 0.24891227  1.004568   -1.2026378 ], b: [-0.88525265], loss : 0.5855053663253784
W : [ 0.27609363  1.0152968  -1.231248  ], b: [-0.89419335], loss : 0.5849290490150452
W : [ 0.30225462  1.0235343  -1.2573569 ], b: [-0.9023328], loss : 0.5844234824180603
W : [ 0.32741132  1.0306869  -1.2823908 ], b: [-0.9095066], loss : 0.583966851234436
W : [ 0.35141513  1.0378394  -1.3054036 ], b: [-0.9166592], loss : 0.5835583806037903


In [36]:
# Score the same applicant with the TensorFlow model: scale the raw features
# with the fitted scaler, then evaluate the sigmoid output H in the session.
my_score = np.array([600, 3.8, 1])
scaled_my_score = scaler_x.transform(my_score.reshape(1,-1))
result = sess.run(H, feed_dict={X:scaled_my_score})
print(result)


[[0.54253036]]
