# Version without normalization


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the numeric table with genfromtxt's default (whitespace) delimiter.
data = np.genfromtxt('data_v2.txt')

# Seed the global NumPy generator, then shuffle the rows in place so that
# every later positional split sees the same row order on each run.
np.random.seed(12)
np.random.shuffle(data)

# Columns 0-6 are the inputs, the remaining columns are the outputs.
# The upper bound 17 lies past the 16 columns named below; NumPy clamps it.
X, Y = data[:, :7], data[:, 7:17]
dfX, dfy = pd.DataFrame(X), pd.DataFrame(Y)

# Inputs and outputs side by side, labelled with readable column names.
table = pd.concat([dfX, dfy], axis=1)
table.columns = [
    'SiH4', 'N2O', 'SiO', 'Temp', 'Pressure', 'RF Power', 'Time',
    'Thickness', 'Depo. Rate', 'Uniformity', 'Ref. Index', 'Permittivity',
    'Etch Rate', 'Stress', 'H2O', 'SiOH',
]

In [3]:
# Plain-text dump of the whole labelled table (inputs followed by outputs).
print(table)

     SiH4     N2O    SiO   Temp  Pressure  RF Power   Time  Thickness  \
0   667.0   333.0  167.0  375.0       2.0     150.0   60.0       4.49   
1   550.0  1100.0  550.0  375.0       2.0     150.0   25.0       1.57   
2   300.0   600.0  300.0  375.0       2.0     150.0   65.0       1.95   
3   700.0  1400.0  700.0  375.0       2.0     150.0   35.0       2.85   
4   200.0   900.0  200.0  200.0       2.0     150.0  261.0       5.12   
5   200.0   900.0  200.0  400.0       2.0      20.0  180.0       4.83   
6   400.0   900.0  400.0  200.0       1.0      20.0  154.0       5.78   
7   400.0   900.0  400.0  400.0       0.0      20.0  600.0       3.36   
8   909.0    91.0   46.0  375.0       2.0     150.0   40.0       4.25   
9   200.0   400.0  200.0  200.0       2.0      20.0  187.0       5.69   
10  400.0   400.0  200.0  400.0       2.0      20.0  111.0       5.10   
11  400.0   900.0  400.0  150.0       1.0      20.0  220.0       4.63   
12  300.0   650.0  300.0  300.0       1.0      85.0

In [4]:
from sklearn.metrics import mean_squared_error




def test_1(func, x, y, k) :
    
    L = x.shape[0]
       
    if k >= L :
        print('error\n')
        return -1
    
    d = int(L/k)
      
    mse = 0.0
    
    for i in range(0,d) :
        
        x_test = x[i*k:(i+1)*k]
        y_test = y[i*k:(i+1)*k]
        
        x_train = np.concatenate([x[0 : i*k], x[(i+1)*k : L]], axis=0)
        y_train = np.concatenate([y[0 : i*k], y[(i+1)*k : L]], axis=0)
        
        func.fit(x_train, y_train)
        print("%d MSE : %f" %(i+1,mean_squared_error(y_test, func.predict(x_test))))
        mse = mse + mean_squared_error(y_test, func.predict(x_test))
        
    x_test = x[d*k:L]
    y_test = y[d*k:L]
    x_train = x[0:d*k]
    y_train = y[0:d*k]
    
    func.fit(x_train, y_train)
    print("%d MSE : %f"%(d+1, mean_squared_error(y_test, func.predict(x_test))))
    mse = mse + mean_squared_error(y_test, func.predict(x_test))
    
    return mse/(d+1)

def test_2(func, x, y) :
    
    L = x.shape[0]
      
    x_test = x[0:L - 5]
    y_test = y[0:L - 5]
    x_train = x[L-5:L]
    y_train = y[L-5:L]
    
    func.fit(x_train, y_train)
    print("MSE : %f"%( mean_squared_error(y_test, func.predict(x_test))))
    mse = mean_squared_error(y_test, func.predict(x_test))
    
    return mse



# Features: all rows and all input columns of X. Despite the name this is the
# full data set; test_1 / test_2 do the actual train/test splitting.
X_train = X[:,:]
# Target: only the first output column of Y, the one labelled 'Thickness'
# in `table`.
Y_train = Y[:,0]

## Linear regression

### Multivariate regression  

In [5]:
from sklearn import linear_model

# Ordinary least-squares baseline with scikit-learn's default settings.
MulReg=linear_model.LinearRegression()

# k = 5 is the held-out block size in rows (see test_1), not a fold count.
mse = test_1(MulReg, X_train, Y_train, k = 5)


# Mean of the per-block MSE values returned by test_1.
print("MSE: %.4f" % mse)

1 MSE : 2.663523
2 MSE : 1.653405
3 MSE : 0.506820
4 MSE : 3.212627
5 MSE : 1.307797
6 MSE : 0.977013
7 MSE : 2.077977
8 MSE : 0.125186
MSE: 1.5655


### Ridge Regression (Linear least squares with l2 regularization)

In [6]:
# Linear model with an L2 penalty; alpha=0.35 is the penalty strength.
ridge = linear_model.Ridge(alpha=0.35)

# Same evaluation as the other models: held-out blocks of 5 rows via test_1.
mse = test_1(ridge, X_train, Y_train, k = 5)

# Mean of the per-block MSE values returned by test_1.
print("MSE: %.4f" % mse)

1 MSE : 2.673355
2 MSE : 1.650899
3 MSE : 0.506393
4 MSE : 3.226027
5 MSE : 1.300008
6 MSE : 0.970464
7 MSE : 2.073158
8 MSE : 0.114138
MSE: 1.5643


### Lasso Regression (Linear least squares with l1 regularization)

In [7]:
# Linear model with an L1 penalty; alpha=0.35 is the penalty strength.
lasso = linear_model.Lasso(alpha=0.35)


# Same evaluation as the other models: held-out blocks of 5 rows via test_1.
mse = test_1(lasso, X_train, Y_train, k = 5)


# Mean of the per-block MSE values returned by test_1.
print("MSE: %.4f" % mse)

1 MSE : 3.296197
2 MSE : 1.663973
3 MSE : 0.636802
4 MSE : 3.812154
5 MSE : 1.723712
6 MSE : 0.927884
7 MSE : 2.357095
8 MSE : 0.083068
MSE: 1.8126


# Support Vector Regression



### *가장 낮은 MSE값을 가지는 C와 kernel을 선택

In [8]:
import numpy as np
from sklearn.svm import SVR

In [9]:
# Exhaustive grid over C, kernel and epsilon. Every combination is scored
# with test_2 and reported; the lowest-MSE combination is remembered so the
# selection does not have to be read off the long log by eye.
svr_best = None  # (mse, kernel, C, epsilon) of the best valid score so far

for c in [0.01,0.1,1,10,100]:
    for i in ["linear","rbf","sigmoid"]:
        for e in [0.001,0.01,0.1]:
            svr_ck = SVR(kernel=i, C=c, epsilon=e)

            mse = test_2(svr_ck, X_train, Y_train)

            print("kernel:",i,"\n","C:",c,"\n","MSE: ",mse,"\n","epsilon:",e,"\n" ,"-"*25)

            # An MSE is non-negative by definition; nan or a negative value
            # is not a usable score and must never win the comparison.
            if mse >= 0 and (svr_best is None or mse < svr_best[0]):
                svr_best = (mse, i, c, e)

# Summary line: the combination the grid search actually selects.
if svr_best is not None:
    print("best -> kernel:", svr_best[1], "C:", svr_best[2],
          "epsilon:", svr_best[3], "MSE:", svr_best[0])

MSE : 5.418959
kernel: linear 
 C: 0.01 
 MSE:  5.41895867516 
 epsilon: 0.001 
 -------------------------
MSE : 5.337337
kernel: linear 
 C: 0.01 
 MSE:  5.33733744488 
 epsilon: 0.01 
 -------------------------
MSE : 4.593347
kernel: linear 
 C: 0.01 
 MSE:  4.59334663432 
 epsilon: 0.1 
 -------------------------
MSE : 1.933768
kernel: rbf 
 C: 0.01 
 MSE:  1.93376774194 
 epsilon: 0.001 
 -------------------------
MSE : 1.933768
kernel: rbf 
 C: 0.01 
 MSE:  1.93376774194 
 epsilon: 0.01 
 -------------------------
MSE : 1.974928
kernel: rbf 
 C: 0.01 
 MSE:  1.97492822581 
 epsilon: 0.1 
 -------------------------
MSE : 1.933768
kernel: sigmoid 
 C: 0.01 
 MSE:  1.93376774194 
 epsilon: 0.001 
 -------------------------
MSE : 1.933768
kernel: sigmoid 
 C: 0.01 
 MSE:  1.93376774194 
 epsilon: 0.01 
 -------------------------
MSE : 1.970155
kernel: sigmoid 
 C: 0.01 
 MSE:  1.97015483871 
 epsilon: 0.1 
 -------------------------
MSE : 5.418959
kernel: linear 
 C: 0.1 
 MSE:  5.418

### rbf kernel?

In [10]:
import numpy as np
from sklearn.svm import SVR

In [11]:
# RBF-kernel SVR with hand-set C=1e3 and gamma=0.1.
# NOTE(review): C=1e3 lies outside the C values tried in the grid loop above,
# and that loop never varied gamma - confirm where these values come from.
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)

# Evaluated with test_1 (held-out blocks of 5 rows), not with test_2.
mse = test_1(svr_rbf, X_train, Y_train, k = 5)


# Mean of the per-block MSE values returned by test_1.
print("MSE: %.4f" % mse)

1 MSE : 4.597120
2 MSE : 0.881018
3 MSE : 0.103333
4 MSE : 3.351077
5 MSE : 0.895360
6 MSE : 1.903993
7 MSE : 0.840897
8 MSE : 0.049580
MSE: 1.5778


### poly kernel 

In [12]:
#svr_poly = SVR(kernel='poly', C=1e3, gamma=0.1)

#mse = test_1(svr_poly, X_train, Y_train, k = 5)


#print("MSE: %.4f" % mse)

### sigmoid kernel

In [13]:
# Sigmoid-kernel SVR with the same hand-set C=1e3 and gamma=0.1 as the RBF run.
svr_sigmoid = SVR(kernel='sigmoid', C=1e3, gamma=0.1)

# Evaluated with test_1 (held-out blocks of 5 rows).
mse = test_1(svr_sigmoid, X_train, Y_train, k = 5)


# Mean of the per-block MSE values returned by test_1.
print("MSE: %.4f" % mse)

1 MSE : 5.563680
2 MSE : 0.897400
3 MSE : 0.051145
4 MSE : 3.807880
5 MSE : 0.243940
6 MSE : 2.069480
7 MSE : 0.279640
8 MSE : 0.062500
MSE: 1.6220


### no kernel?


In [14]:
#svr_lin = SVR(kernel='linear', C=1e2)


#mse = test_1(svr_lin, X_train, Y_train, k = 5)


#print("MSE: %.4f" % mse)

## Kernel Ridge Regression 

In [15]:
from sklearn.kernel_ridge import KernelRidge

# One kernel ridge model per kernel, all constructed with gamma=0.1.
kr_linear, kr_rbf, kr_poly = (
    KernelRidge(kernel=name, gamma=0.1) for name in ('linear', 'rbf', 'poly')
)

# Score the models one after another with test_1 (held-out blocks of 5 rows)
# and report each average under its own label.
for kr_model, report in (
    (kr_linear, "MSE(linear): %.4f\n"),
    (kr_rbf, "MSE(rbf): %.4f\n"),
    (kr_poly, "MSE(poly): %.4f\n"),
):
    mse = test_1(kr_model, X_train, Y_train, k = 5)
    print(report % mse)

1 MSE : 11.470213
2 MSE : 8.981565
3 MSE : 2.904012
4 MSE : 1.504192
5 MSE : 0.979573
6 MSE : 2.192681
7 MSE : 2.823443
8 MSE : 0.679736
MSE(linear): 3.9419

1 MSE : 12.152880
2 MSE : 23.693100
3 MSE : 16.910560
4 MSE : 21.181500
5 MSE : 28.957060
6 MSE : 23.173560
7 MSE : 24.034709
8 MSE : 23.040000
MSE(rbf): 21.6429

1 MSE : 796.684539
2 MSE : 199.689899
3 MSE : 143.523805
4 MSE : 353.180198
5 MSE : 123.618850
6 MSE : 4399.995821
7 MSE : 57.942373
8 MSE : 111.670247
MSE(poly): 773.2882



Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.582132052629392e-17 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.554196278787566e-18 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 7.006276324149122e-17 / 1.1102230246251565e-16


## Decision Tree Regression
### basic

In [16]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

In [17]:
# Single regression tree, limited to a depth of 4.
regr_1 = DecisionTreeRegressor(max_depth=4)

# Evaluated with test_1 (held-out blocks of 5 rows).
mse = test_1(regr_1, X_train, Y_train, k = 5)

# Mean of the per-block MSE values returned by test_1.
print("MSE: %.4f" % mse)



1 MSE : 3.558679
2 MSE : 1.520280
3 MSE : 0.227808
4 MSE : 3.691493
5 MSE : 0.062966
6 MSE : 1.169200
7 MSE : 0.183074
8 MSE : 0.171848
MSE: 1.3232


### with AdaBoost

In [18]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [19]:
# Dedicated generator with a fixed seed, handed to the ensemble below instead
# of relying on the global NumPy random state.
rng = np.random.RandomState(1)


# AdaBoost over depth-4 regression trees, 300 boosting rounds at most.
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300, random_state=rng)


# Evaluated with test_1 (held-out blocks of 5 rows); prints the mean MSE.
mse = test_1(regr_2, X_train, Y_train, k = 5)
print("MSE: %.4f" % mse)




1 MSE : 1.033264
2 MSE : 1.315880
3 MSE : 0.109345
4 MSE : 3.703797
5 MSE : 0.100865
6 MSE : 1.204160
7 MSE : 0.212806
8 MSE : 0.165378
MSE: 0.9807


## Gradient Boosting Regression

In [20]:
from sklearn import ensemble

In [21]:
# Hyper-parameters of the gradient-boosted ensemble.
# The loss is deliberately left at the estimator's default, which is least
# squares: the explicit spelling 'ls' was deprecated in scikit-learn 1.0 and
# removed in 1.2 (renamed 'squared_error'), so naming it would tie this cell
# to one side of that rename.
params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 4,
          'learning_rate': 0.005}

GBR = ensemble.GradientBoostingRegressor(**params)

# Evaluated with test_1 (held-out blocks of 5 rows).
mse = test_1(GBR, X_train, Y_train, k = 5)

# Mean of the per-block MSE values returned by test_1.
print("MSE: %.4f" % mse)

1 MSE : 1.075098
2 MSE : 1.363536
3 MSE : 0.128056
4 MSE : 3.534280
5 MSE : 0.045143
6 MSE : 1.097297
7 MSE : 0.268104
8 MSE : 0.154120
MSE: 0.9582
