In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings

warnings.filterwarnings(action='ignore')

# raw data
df = pd.read_csv('./data/ozone.csv')
df.info()
display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Ozone    116 non-null    float64
 1   Solar.R  146 non-null    float64
 2   Wind     153 non-null    float64
 3   Temp     153 non-null    int64  
 4   Month    153 non-null    int64  
 5   Day      153 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 7.3 KB


Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,41.0,190.0,7.4,67,5,1
1,36.0,118.0,8.0,72,5,2
2,12.0,149.0,12.6,74,5,3
3,18.0,313.0,11.5,62,5,4
4,,,14.3,56,5,5
...,...,...,...,...,...,...
148,30.0,193.0,6.9,70,9,26
149,,145.0,13.2,77,9,27
150,14.0,191.0,14.3,75,9,28
151,18.0,131.0,8.0,76,9,29


In [2]:
# 데이터 전처리

# 결치값 (median값으로 넣기)
col_median = np.nanmedian(df['Solar.R'])
df['Solar.R'].loc[df['Solar.R'].isnull()] = col_median

t_median = np.nanmedian(df['Ozone'])
df['Ozone'].loc[df['Ozone'].isnull()] = t_median
print(df['Ozone'].isnull().sum())

0


In [3]:
# 이상치
                   
zscore_threshold = 2.0

for col in df.columns:
    outlier = df[col][np.abs(stats.zscore(df[col])) > zscore_threshold]
    df = df.loc[np.isin(df[col],outlier, invert=True)]

# outlier = df['Ozone'][np.abs(stats.zscore(df['Ozone'])) > zscore_threshold]
# df['Ozone'] = df['Ozone'].loc[np.isin(df['Ozone'],outlier, invert=True)]

# figure = plt.figure()

# ax1 = figure.add_subplot(1,3,1)
# ax1.boxplot(x_data['Solar.R'])

# ax2 = figure.add_subplot(1,3,2)
# ax2.boxplot(x_data['Wind'])

# ax3 = figure.add_subplot(1,3,3)
# ax3.boxplot(x_data['Temp'])

In [4]:
x_data = df[['Solar.R','Wind','Temp']]  
t_data = df['Ozone'] 

# 정규화
scaler_x = MinMaxScaler()
scaler_t = MinMaxScaler()

scaler_x.fit(x_data.values)               # scaler는 2차원 ndarray로 사용해야 해요!
scaler_t.fit(t_data.values.reshape(-1,1))  

norm_x_data = scaler_x.transform(x_data.values)
norm_t_data = scaler_t.transform(t_data.values.reshape(-1,1)).ravel()   # 1차원

In [5]:
# python 구현

def numerical_derivative(f,x):    
    
    delta_x = 1e-4
    derivative_x = np.zeros_like(x)  
    
    # iterator를 이용해서 입력변수 x에 대한 편미분을 수행!
    it = np.nditer(x, flags=['multi_index'])
    
    while not it.finished:
        
        idx = it.multi_index 
        tmp = x[idx]             
        
        x[idx] = tmp + delta_x   
        fx_plus_delta = f(x)
        
        x[idx] = tmp - delta_x    
        fx_minus_delta = f(x)
        
        derivative_x[idx] = (fx_plus_delta - fx_minus_delta) / (2 * delta_x)
        
        x[idx] = tmp              
        it.iternext()
        
    return derivative_x


# loss function
def loss_func(input_data):
    W = input_data[:-1].reshape(-1,1)
    b = input_data[-1]
    
    y = np.dot(norm_x_data,W) + b
    return np.mean(np.power(norm_t_data-y,2))

# Weight, bias
W = np.random.rand(3,1)
b = np.random.rand(1)

# predict
def predict(x):
    y = np.dot(x,W) + b
    return y

# learning_rate
learning_rate = 1e-4

# 반복 학습
for step in range(300000):
    
    input_param = np.concatenate((W.ravel(), b.ravel()), axis=0)  # [W1 W2 W3 b]
    derivative_result = learning_rate * numerical_derivative(loss_func, input_param)

    W = W - derivative_result[:-1].reshape(-1,1)
    b = b - derivative_result[-1]
    
    if step % 30000 == 0:
        input_param = np.concatenate((W.ravel(), b.ravel()), axis=0)
        print('W : {}, b:{}, loss:{}'.format(W, b, loss_func(input_param)))


W : [[0.18995847]
 [0.16695187]
 [0.19154746]], b:[0.35636952], loss:0.14665164995586097
W : [[0.06706014]
 [0.08301372]
 [0.09289684]], b:[0.21837949], loss:0.057929156879468695
W : [[0.0480961 ]
 [0.07383095]
 [0.08117928]], b:[0.23971555], loss:0.05758185813555907
W : [[0.03456374]
 [0.06495261]
 [0.07093884]], b:[0.25697822], loss:0.057359263593047737
W : [[0.02488868]
 [0.05671971]
 [0.06193566]], b:[0.27105028], loss:0.057211877795420495
W : [[0.01795933]
 [0.04927807]
 [0.05400372]], b:[0.28260125], loss:0.057111632676536905
W : [[0.01298796]
 [0.04266133]
 [0.04701668]], b:[0.29214185], loss:0.05704198166257525
W : [[0.00941519]
 [0.036842  ]
 [0.04087074]], b:[0.3000655], loss:0.05699278881011198
W : [[0.00684306]
 [0.03176175]
 [0.03547574]], b:[0.30667823], loss:0.056957616391126946
W : [[0.00498795]
 [0.02734927]
 [0.03075074]], b:[0.31222034], loss:0.05693224078507582


In [6]:
# predict
result = np.array([[150.0, 10.0, 80.0]])
norm_result = scaler_x.transform(result)
print(scaler_t.inverse_transform(predict(norm_result)))

# 오존량 : [[37.49592078]]

[[35.94716325]]


In [7]:
# tensorflow

# placeholder
X = tf.placeholder(shape=[None,3], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight, bias
W = tf.Variable(tf.random.normal([3,1]))
b = tf.Variable(tf.random.normal([1]))

# Hypothesis
H = tf.matmul(X,W) + b

# loss function
loss = tf.reduce_mean(tf.square(H-T))

# train node
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# session 생성, 초기화
sess = tf.Session()
sess.run(tf.global_variables_initializer()) 

# 반복학습
for step in range(300000):
    
    _, W_val, b_val, loss_val = sess.run([train, W, b, loss], 
                                         feed_dict={X: norm_x_data,
                                                    T: norm_t_data.reshape(-1,1)})
    if step % 30000 == 0:
        print('W : {}, b : {}, loss : {}'.format(W_val, b_val, loss_val))





W : [[-1.0800681]
 [-0.7681486]
 [-1.1571087]], b : [-1.40223], loss : 11.042961120605469
W : [[ 0.05195974]
 [-0.12232768]
 [ 0.00953515]], b : [0.37293214], loss : 0.04890507832169533
W : [[ 0.08745991]
 [-0.23308945]
 [ 0.1562065 ]], b : [0.32696122], loss : 0.036309320479631424
W : [[ 0.10876255]
 [-0.29466006]
 [ 0.2567638 ]], b : [0.29062584], loss : 0.03100055269896984
W : [[ 0.1214758 ]
 [-0.3266187 ]
 [ 0.32724857]], b : [0.261331], loss : 0.028630701825022697
W : [[ 0.12900686]
 [-0.3410329 ]
 [ 0.37786567]], b : [0.23734006], loss : 0.02748284488916397
W : [[ 0.13344987]
 [-0.34526408]
 [ 0.41513503]], b : [0.21741292], loss : 0.026869067922234535
W : [[ 0.13603057]
 [-0.34385014]
 [ 0.44327024]], b : [0.2007154], loss : 0.02650677040219307
W : [[ 0.13751204]
 [-0.33942884]
 [ 0.46498835]], b : [0.18660893], loss : 0.0262746661901474
W : [[ 0.13832472]
 [-0.33367226]
 [ 0.48208874]], b : [0.17465074], loss : 0.02611728571355343


In [8]:
# Prediction

data = np.array([[150.0, 10.0, 80.0]])
norm_result = scaler_x.transform(data)

result = sess.run(H, 
                  feed_dict={X: norm_result})
print(scaler_t.inverse_transform(result))

[[36.2674]]


In [9]:
# sklearn 구현
from sklearn import linear_model

model = linear_model.LinearRegression()

model.fit(norm_x_data, norm_t_data.reshape(-1,1))

result = np.array([[150.0, 10.0, 80.0]])
norm_result=model.predict(scaler_x.transform(result))

print('sklearn 예측값 : {}'.format(scaler_t.inverse_transform(norm_result)))

sklearn 예측값 : [[36.39394518]]
