# no normalizing version


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = np.genfromtxt('data_outlier little.txt')

np.random.seed(4)
np.random.shuffle(data)

X = data[:,0:6]
Y = data[:,6:16]
dfX=pd.DataFrame(X)
dfy=pd.DataFrame(Y)

table=pd.concat([dfX, dfy], axis=1)
table.columns=['SiH4','N2O','Temp','Pressure','RF Power','Time','Thickness','Depo. Rate','Uniformity','Ref. Index','Permittivity','Etch Rate','Stress','H2O','SiOH']

In [3]:
print(table)

     SiH4    N2O   Temp  Pressure  RF Power   Time  Thickness  Depo. Rate  \
0   667.0  333.0  375.0      1.80     150.0   60.0       4.49       749.0   
1   200.0  400.0  200.0      0.25     150.0  192.0       5.52       287.0   
2   400.0  400.0  200.0      1.80     150.0  136.0       4.80       352.0   
3   400.0  900.0  200.0      1.10      20.0  154.0       5.78       375.0   
4   400.0  900.0  150.0      1.80      80.0  128.0       5.23       409.0   
5   400.0  400.0  200.0      0.25      20.0  381.0       5.29       139.0   
6   400.0  900.0  250.0      1.10      80.0   85.0       5.01       587.0   
7   333.0  667.0  375.0      1.80     150.0   60.0       1.98       331.0   
8   200.0  400.0  400.0      0.25      20.0  468.0       5.76       123.0   
9   300.0  650.0  300.0      1.03      85.0  115.0       4.98       433.0   
10  400.0  900.0  400.0      0.25      20.0  600.0       3.36        56.0   
11  400.0  900.0  400.0      1.80     150.0  123.0       5.24       426.0   

In [4]:
from sklearn.metrics import mean_squared_error




def test_1(func, x, y, k) :
    
    L = x.shape[0]
       
    if k >= L :
        print('error\n')
        return -1
    
    d = int(L/k)
      
    mse = 0.0
    
    for i in range(0,d) :
        
        x_test = x[i*k:(i+1)*k]
        y_test = y[i*k:(i+1)*k]
        
        x_train = np.concatenate([x[0 : i*k], x[(i+1)*k : L]], axis=0)
        y_train = np.concatenate([y[0 : i*k], y[(i+1)*k : L]], axis=0)
        
        func.fit(x_train, y_train)
        print("%d MSE : %f" %(i+1,mean_squared_error(y_test, func.predict(x_test))))
        mse = mse + mean_squared_error(y_test, func.predict(x_test))
        
    x_test = x[d*k:L]
    y_test = y[d*k:L]
    x_train = x[0:d*k]
    y_train = y[0:d*k]
    
    func.fit(x_train, y_train)
    print("%d MSE : %f"%(d+1, mean_squared_error(y_test, func.predict(x_test))))
    mse = mse + mean_squared_error(y_test, func.predict(x_test))
    
    return mse/(d+1)

def test_2(func, x, y) :
    
    L = x.shape[0]
      
    x_test = x[0:L - 5]
    y_test = y[0:L - 5]
    x_train = x[L-5:L]
    y_train = y[L-5:L]
    
    func.fit(x_train, y_train)
    print("MSE : %f"%( mean_squared_error(y_test, func.predict(x_test))))
    mse = mean_squared_error(y_test, func.predict(x_test))
    
    return mse



X_train = X[0:28,:]
Y_train = Y[0:28,0]
X_test = X[23:28,:]
Y_test = Y[23:28,0]

## Linear_regression

### Multivariate regression  

In [5]:
from sklearn import linear_model

MulReg=linear_model.LinearRegression()

mse = test_1(MulReg, X_train, Y_train, k = 5)


print("MSE: %.4f" % mse)

1 MSE : 0.290170
2 MSE : 1.298704
3 MSE : 4.962395
4 MSE : 2.929756
5 MSE : 1.189320
6 MSE : 0.470049
MSE: 1.8567


### Ridge Regression (Linear least squares with l2 regularization)

In [6]:
ridge = linear_model.Ridge(alpha=0.35)

mse = test_1(ridge, X_train, Y_train, k = 5)

print("MSE: %.4f" % mse)

1 MSE : 0.262777
2 MSE : 1.308143
3 MSE : 5.012404
4 MSE : 2.953351
5 MSE : 1.186880
6 MSE : 0.452393
MSE: 1.8627


### Lasso Regression (Linear least squares with l1 regularization)

In [7]:
lasso = linear_model.Lasso(alpha=0.35)


mse = test_1(lasso, X_train, Y_train, k = 5)


print("MSE: %.4f" % mse)

1 MSE : 0.250662
2 MSE : 1.643366
3 MSE : 6.107704
4 MSE : 3.523310
5 MSE : 1.404637
6 MSE : 0.193052
MSE: 2.1871


# Support Vecter Regression



### *가장 낮은 MSE값을 가지는 C와 kernel을 선택

In [8]:
import numpy as np
from sklearn.svm import SVR

In [9]:
for c in [0.01,0.1,1,10,100]:
    for i in ["linear","rbf","sigmoid"]:
        for e in [0.001,0.01,0.1]:
            svr_ck = SVR(kernel=i, C=c, epsilon=e)
            
            mse = test_2(svr_ck, X_train, Y_train)
            
            print("kernel:",i,"\n","C:",c,"\n","MSE: ",mse,"\n","epsilon:",e,"\n" ,"-"*25)

MSE : 26.328351
kernel: linear 
 C: 0.01 
 MSE:  26.3283509554 
 epsilon: 0.001 
 -------------------------
MSE : 24.650832
kernel: linear 
 C: 0.01 
 MSE:  24.6508324227 
 epsilon: 0.01 
 -------------------------
MSE : 11.240441
kernel: linear 
 C: 0.01 
 MSE:  11.2404413699 
 epsilon: 0.1 
 -------------------------
MSE : 1.973243
kernel: rbf 
 C: 0.01 
 MSE:  1.97324347826 
 epsilon: 0.001 
 -------------------------
MSE : 1.973243
kernel: rbf 
 C: 0.01 
 MSE:  1.97324347826 
 epsilon: 0.01 
 -------------------------
MSE : 1.944095
kernel: rbf 
 C: 0.01 
 MSE:  1.94409456522 
 epsilon: 0.1 
 -------------------------
MSE : 1.973243
kernel: sigmoid 
 C: 0.01 
 MSE:  1.97324347826 
 epsilon: 0.001 
 -------------------------
MSE : 1.973243
kernel: sigmoid 
 C: 0.01 
 MSE:  1.97324347826 
 epsilon: 0.01 
 -------------------------
MSE : 1.944095
kernel: sigmoid 
 C: 0.01 
 MSE:  1.94409456522 
 epsilon: 0.1 
 -------------------------
MSE : 26.328351
kernel: linear 
 C: 0.1 
 MSE:  2

### rbf kernel?

In [10]:
import numpy as np
from sklearn.svm import SVR

In [11]:
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)

mse = test_1(svr_rbf, X_train, Y_train, k = 5)


print("MSE: %.4f" % mse)

1 MSE : 0.556085
2 MSE : 1.804135
3 MSE : 1.103668
4 MSE : 3.436613
5 MSE : 1.822956
6 MSE : 0.124444
MSE: 1.4747


### poly kernel 

In [12]:
svr_poly = SVR(kernel='poly', C=1e3, gamma=0.1)

mse = test_1(svr_poly, X_train, Y_train, k = 5)

print("MSE: %.4f" % mse)

1 MSE : 5.839939
2 MSE : 22.963423
3 MSE : 514.991517
4 MSE : 4.197565
5 MSE : 4.045382
6 MSE : 1273.238522
MSE: 304.2127


### sigmoid kernel

In [13]:
svr_sigmoid = SVR(kernel='sigmoid', C=1e3, gamma=0.1)

mse = test_1(svr_sigmoid, X_train, Y_train, k = 5)


print("MSE: %.4f" % mse)

1 MSE : 0.232460
2 MSE : 1.998620
3 MSE : 0.899980
4 MSE : 3.789785
5 MSE : 2.141865
6 MSE : 0.027758
MSE: 1.5151


### no kernel?


In [14]:
#svr_lin = SVR(kernel='linear', C=1e2)


#mse = test_1(svr_lin, X_train, Y_train, k = 5)


#print("MSE: %.4f" % mse)

## Kernel Ridge Regression 

In [15]:
from sklearn.kernel_ridge import KernelRidge

kr_linear = KernelRidge(kernel='linear', gamma=0.1)
kr_rbf = KernelRidge(kernel='rbf', gamma=0.1)
kr_poly = KernelRidge(kernel='poly', gamma=0.1)




mse = test_1(kr_linear, X_train, Y_train, k = 5)
print("MSE(linear): %.4f\n" % mse)
mse = test_1(kr_rbf, X_train, Y_train, k = 5)
print("MSE(rbf): %.4f\n" % mse)
mse = test_1(kr_poly, X_train, Y_train, k = 5)
print("MSE(poly): %.4f\n" % mse)

1 MSE : 1.646522
2 MSE : 2.718810
3 MSE : 10.576098
4 MSE : 2.757926
5 MSE : 1.495921
6 MSE : 0.365276
MSE(linear): 3.2601

1 MSE : 26.886360
2 MSE : 18.605409
3 MSE : 26.503480
4 MSE : 14.811620
5 MSE : 21.459780
6 MSE : 24.882867
MSE(rbf): 22.1916

1 MSE : 801.923198
2 MSE : 650.386272
3 MSE : 1127.292203
4 MSE : 22.096702
5 MSE : 544.339629
6 MSE : 1616.837630
MSE(poly): 793.8126



Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 3.0254546782813985e-17 / 1.1102230246251565e-16


## Decision Tree Regression
### basic

In [16]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

In [17]:
regr_1 = DecisionTreeRegressor(max_depth=4)

mse = test_1(regr_1, X_train, Y_train, k = 5)

print("MSE: %.4f" % mse)



1 MSE : 2.907557
2 MSE : 1.294993
3 MSE : 1.217765
4 MSE : 3.972693
5 MSE : 0.113156
6 MSE : 0.059518
MSE: 1.5943


### with AdaBoost

In [18]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [19]:
rng = np.random.RandomState(1)


regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300, random_state=rng)


mse = test_1(regr_2, X_train, Y_train, k = 5)
print("MSE: %.4f" % mse)




1 MSE : 1.442465
2 MSE : 0.051134
3 MSE : 1.092403
4 MSE : 3.692385
5 MSE : 0.144768
6 MSE : 0.013148
MSE: 1.0727


## Gradient Boosting Regression

In [20]:
from sklearn import ensemble

In [21]:
params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 4,
          'learning_rate': 0.005, 'loss': 'ls'}

GBR = ensemble.GradientBoostingRegressor(**params)



mse = test_1(GBR, X_train, Y_train, k = 5)

print("MSE: %.4f" % mse)

1 MSE : 1.093595
2 MSE : 0.302104
3 MSE : 1.081929
4 MSE : 4.328384
5 MSE : 0.333211
6 MSE : 0.025101
MSE: 1.1941
