# 非線形データに対する線形回帰とニューラルネット

In [2]:
import numpy as np
import tensorflow as tf

import sys

import matplotlib.pyplot as plt
%matplotlib inline

try:
    from mpl_toolkits.mplot3d import Axes3D
except:
    pass

  from ._conv import register_converters as _register_converters


人工的にデータを生成して訓練データと教師データに分ける。

In [4]:
def generate_data(n_points=10000, n_features=3, use_nonlinear=True, 
                    noise_std=0.1, train_test_split = 4):
    """
    Arguments:
    n_points - number of data points to generate
    n_features - a positive integer - number of features
    use_nonlinear - if True, generate non-linear data
    train_test_split - an integer - what portion of data to use for testing
    
    Return:
    X_train, Y_train, X_test, Y_test, n_train, n_features
    """
    # Linear data or non-linear data?
    if use_nonlinear:
        weights = np.array([[1.0, 0.5, 0.2],[0.5, 0.3, 0.15]], dtype=np.float32)
    else:
        weights = np.array([1.0, 0.5, 0.2], dtype=np.float32)
    
    np.random.seed(42)
    bias = np.ones(n_points).reshape((-1,1))
    low = - np.ones((n_points,n_features), dtype=np.float32)
    high = np.ones((n_points,n_features), dtype=np.float32)
    
    X = np.random.uniform(low=low, high=high)
    noise = np.random.normal(size=(n_points, 1))
    noise_std = 0.1
    
    if use_nonlinear:
        Y = (weights[0,0] * bias + np.dot(X, weights[0, :]).reshape((-1,1)) + 
             np.dot(X*X, weights[1, :]).reshape([-1,1]) +
             noise_std * noise)
    else:
        Y = (weights[0] * bias + np.dot(X, weights[:]).reshape((-1,1)) + 
             noise_std * noise)
    
    n_test = int(n_points/train_test_split)
    n_train = n_points - n_test
    
    X_train = X[:n_train,:]
    Y_train = Y[:n_train].reshape((-1,1))

    X_test = X[n_train:,:]
    Y_test = Y[n_train:].reshape((-1,1))
    
    return X_train, Y_train, X_test, Y_test, n_train, n_features

In [11]:
X_train, Y_train, X_test, Y_test, n_train, n_features = generate_data(use_nonlinear=True)
X_train.shape, Y_train.shape

((7500, 3), (7500, 1))

### 非線形データに対する線形回帰

非線形のデータに対して線形回帰を当てはめてみる。

In [12]:
def sklearn_lin_regress(X_train, Y_train):
    """
    Arguments:
    X_train  - np.array of size (n by k) where n is number of observations 
                of independent variables and k is number of variables
    Y_train - np.array of size (n by 1) where n is the number of observations of dependend variable
    
    Return: a tuple of 
      - theta_sklearn: np.array of size (k+1 by 1) of regression coefficients
      - lp_model: an instance of LinearRegression
    """
    from sklearn.linear_model import LinearRegression
    lr_model = None
    theta_sklearn = np.array([], dtype=np.float32)
    
    X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
    model = LinearRegression(fit_intercept=False)
    lr_model = model.fit(X_train,Y_train)
    theta_sklearn = lr_model.coef_

    return theta_sklearn, lr_model

LinearRegressionオブジェクトと一緒に使うために訓練データの形も変えておく。

In [13]:
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

In [14]:
theta_sklearn, lr_model = sklearn_lin_regress(X_train, Y_train)

回帰係数を見てみる。

In [15]:
theta_sklearn

array([[1.31552735, 1.00221739, 0.50122384, 0.19928303]])

$R^2$を見てみる。

In [16]:
lr_model.score(X_test, Y_test)

0.9065452090081396

それなりには線形回帰でもうまくいってはいるっぽい。

### 非線形データに対するニューラルネット

ニューラルネットをtensorflowで実装して先のデータに当てはめる。

活性化関数にはReLUを使い最適化にはGradient Descent Optimizerを使っている。

In [24]:
np.random.seed(42)
X_train, Y_train, X_test, Y_test, n_train, n_features = generate_data(use_nonlinear=True)

In [25]:
def random_batch(X_train, y_train, batch_size):
    np.random.seed(42)
    rnd_indices = np.random.randint(0, len(X_train), batch_size)
    X_batch = X_train[rnd_indices]
    y_batch = y_train[rnd_indices]
    return X_batch, y_batch
    
def neuron_layer(X, n_neurons, name, activation_fn=None):
    with tf.name_scope(name):
        n_inputs = int(X.get_shape()[1])
        stddev = 2 / np.sqrt(n_inputs)
        init = tf.truncated_normal((n_inputs, n_neurons), stddev=stddev)
        W = tf.Variable(init, name="kernel")
        b = tf.Variable(tf.zeros([n_neurons]), name="bias")
        Z = tf.matmul(X, W) + b
        if activation_fn is not None:
            return activation_fn(Z)
        else:
            return Z

In [26]:
learning_rate = 0.01
n_epochs = 200
batch_size = 60
num_rec = X_train.shape[0]
n_batches = int(np.ceil(num_rec / batch_size))
acc_test = 0. #  assign the result of accuracy testing to this variable

n_hidden1 = 100
n_hidden2 = 120
n_outputs = 1 # single value prediction
n_inputs = X_test.shape[1]

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")

h1 = neuron_layer(X, n_hidden1, 'h1', activation_fn=tf.nn.relu)
h2 = neuron_layer(h1, n_hidden2, 'h2', activation_fn=tf.nn.relu)
output = neuron_layer(h2, n_outputs, 'outputs', activation_fn=None)
loss = tf.reduce_mean(tf.square(output - y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = (optimizer.minimize(loss), loss)

init = tf.global_variables_initializer()

In [27]:
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(n_batches):
            X_batch, Y_batch = random_batch(X_train, Y_train, batch_size)
            _, acc_test = sess.run(train_op, feed_dict={
                X: X_batch,
                y: Y_batch
            })
    acc_test = sess.run(loss, feed_dict={
            X: X_test,
            y: Y_test
        })

どれくらいうまく予測出来なかったのかを見てみる。

In [28]:
acc_test

0.036185063

先の線形回帰が約90%の正確性だったのに比べてNNだと約97%に上がっていることが分かる。