# no normalizing version


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = np.genfromtxt('data.txt')

np.random.seed(126)
np.random.shuffle(data)

X = data[:,0:6]
Y = data[:,6:16]
dfX=pd.DataFrame(X)
dfy=pd.DataFrame(Y)

table=pd.concat([dfX, dfy], axis=1)
table.columns=['SiH4','N2O','Temp','Pressure','RF Power','Time','Thickness','Depo. Rate','Uniformity','Ref. Index','Permittivity','Etch Rate','Stress','H2O','SiOH']

In [3]:
print(table)

     SiH4     N2O   Temp  Pressure  RF Power   Time  Thickness  Depo. Rate  \
0   909.0    91.0  375.0      1.80     150.0   40.0       4.25      1064.0   
1   667.0   333.0  375.0      1.80     150.0   60.0       4.49       749.0   
2   400.0   900.0  200.0      1.80     150.0  126.0       4.99       396.0   
3   400.0   400.0  200.0      1.80     150.0  136.0       4.80       352.0   
4   400.0   900.0  400.0      0.25      20.0  600.0       3.36        56.0   
5   200.0   400.0  400.0      0.25      20.0  468.0       5.76       123.0   
6   400.0   900.0  250.0      0.50     150.0  110.0       5.41       491.0   
7   100.0   200.0  375.0      1.80     150.0  105.0       0.79        75.0   
8   200.0   400.0  200.0      1.80      20.0  187.0       5.69       304.0   
9   400.0   900.0  250.0      1.10      80.0   85.0       5.01       587.0   
10  833.0   167.0  375.0      1.80     150.0   45.0       4.52      1005.0   
11  550.0  1100.0  375.0      1.80     150.0   25.0       1.57  

In [4]:
from sklearn.metrics import mean_squared_error




def test_1(func, x, y, k) :
    
    L = x.shape[0]
       
    if k >= L :
        print('error\n')
        return -1
    
    if L%k == 0 :
        d = int(L/k) - 1
    else :
        d = int(L/k)
      
    mse = 0.0
    
    for i in range(0,d) :
        
        x_test = x[i*k:(i+1)*k]
        y_test = y[i*k:(i+1)*k]
        
        x_train = np.concatenate([x[0 : i*k], x[(i+1)*k : L]], axis=0)
        y_train = np.concatenate([y[0 : i*k], y[(i+1)*k : L]], axis=0)
        
        func.fit(x_train, y_train)
        print("%d MSE : %f" %(i+1,mean_squared_error(y_test, func.predict(x_test))))
        mse = mse + mean_squared_error(y_test, func.predict(x_test))
        
    x_test = x[d*k:L]
    y_test = y[d*k:L]
    x_train = x[0:d*k]
    y_train = y[0:d*k]
    
    func.fit(x_train, y_train)
    print("%d MSE : %f"%(d+1, mean_squared_error(y_test, func.predict(x_test))))
    mse = mse + mean_squared_error(y_test, func.predict(x_test))
    
    return mse/(d+1)

def test_2(func, x, y) :
    
    L = x.shape[0]
      
    x_test = x[0:L - 5]
    y_test = y[0:L - 5]
    x_train = x[L-5:L]
    y_train = y[L-5:L]
    
    func.fit(x_train, y_train)
    print("MSE : %f"%( mean_squared_error(y_test, func.predict(x_test))))
    mse = mean_squared_error(y_test, func.predict(x_test))
    
    return mse



X_train = X[:,:]
Y_train = Y[:,1]/100

## Linear_regression

### Multivariate regression  

In [5]:
from sklearn import linear_model

MulReg=linear_model.LinearRegression()

mse = test_1(MulReg, X_train, Y_train, k = 5)


print("MSE: %.4f" % mse)

1 MSE : 0.663586
2 MSE : 1.363335
3 MSE : 1.115933
4 MSE : 1.488241
5 MSE : 0.836300
6 MSE : 1.119458
7 MSE : 0.550296
8 MSE : 0.075182
MSE: 0.9015


### Ridge Regression (Linear least squares with l2 regularization)

In [6]:
ridge = linear_model.Ridge(alpha=0.35)

mse = test_1(ridge, X_train, Y_train, k = 5)

print("MSE: %.4f" % mse)

1 MSE : 0.664864
2 MSE : 1.364786
3 MSE : 1.111570
4 MSE : 1.478227
5 MSE : 0.826406
6 MSE : 1.115797
7 MSE : 0.551197
8 MSE : 0.076716
MSE: 0.8987


### Lasso Regression (Linear least squares with l1 regularization)

In [7]:
lasso = linear_model.Lasso(alpha=0.35)


mse = test_1(lasso, X_train, Y_train, k = 5)


print("MSE: %.4f" % mse)

1 MSE : 0.704881
2 MSE : 1.392677
3 MSE : 1.038850
4 MSE : 1.240646
5 MSE : 0.671918
6 MSE : 1.054088
7 MSE : 0.572130
8 MSE : 0.119907
MSE: 0.8494


# Support Vecter Regression



### *가장 낮은 MSE값을 가지는 C와 kernel을 선택

In [8]:
import numpy as np
from sklearn.svm import SVR

In [9]:
for c in [0.01,0.1,1,10,100]:
    for i in ["linear","rbf","sigmoid"]:
        for e in [0.001,0.01,0.1]:
            svr_ck = SVR(kernel=i, C=c, epsilon=e)
            
            mse = test_2(svr_ck, X_train, Y_train)
            
            print("kernel:",i,"\n","C:",c,"\n","MSE: ",mse,"\n","epsilon:",e,"\n" ,"-"*25)

MSE : 5.086694
kernel: linear 
 C: 0.01 
 MSE:  5.08669421222 
 epsilon: 0.001 
 -------------------------
MSE : 5.090661
kernel: linear 
 C: 0.01 
 MSE:  5.09066097116 
 epsilon: 0.01 
 -------------------------
MSE : 4.862770
kernel: linear 
 C: 0.01 
 MSE:  4.86276966149 
 epsilon: 0.1 
 -------------------------
MSE : 6.367155
kernel: rbf 
 C: 0.01 
 MSE:  6.36715483871 
 epsilon: 0.001 
 -------------------------
MSE : 6.363109
kernel: rbf 
 C: 0.01 
 MSE:  6.36310887097 
 epsilon: 0.01 
 -------------------------
MSE : 6.355154
kernel: rbf 
 C: 0.01 
 MSE:  6.35515403226 
 epsilon: 0.1 
 -------------------------
MSE : 6.375368
kernel: sigmoid 
 C: 0.01 
 MSE:  6.37536774194 
 epsilon: 0.001 
 -------------------------
MSE : 6.371225
kernel: sigmoid 
 C: 0.01 
 MSE:  6.371225 
 epsilon: 0.01 
 -------------------------
MSE : 6.355154
kernel: sigmoid 
 C: 0.01 
 MSE:  6.35515403226 
 epsilon: 0.1 
 -------------------------
MSE : 5.086694
kernel: linear 
 C: 0.1 
 MSE:  5.08669421

### rbf kernel?

In [10]:
import numpy as np
from sklearn.svm import SVR

In [11]:
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)

mse = test_1(svr_rbf, X_train, Y_train, k = 5)


print("MSE: %.4f" % mse)

1 MSE : 14.515676
2 MSE : 4.801443
3 MSE : 9.744776
4 MSE : 3.209158
5 MSE : 5.977618
6 MSE : 1.771525
7 MSE : 0.959625
8 MSE : 0.117306
MSE: 5.1371


### poly kernel 

In [12]:
#svr_poly = SVR(kernel='poly', C=1e3, gamma=0.1)

#mse = test_1(svr_poly, X_train, Y_train, k = 5)


#print("MSE: %.4f" % mse)

### sigmoid kernel

In [13]:
svr_sigmoid = SVR(kernel='sigmoid', C=1e3, gamma=0.1)

mse = test_1(svr_sigmoid, X_train, Y_train, k = 5)


print("MSE: %.4f" % mse)

1 MSE : 15.273665
2 MSE : 4.216425
3 MSE : 10.016025
4 MSE : 2.140680
5 MSE : 6.192185
6 MSE : 1.354265
7 MSE : 0.903705
8 MSE : 0.483025
MSE: 5.0725


### no kernel?


In [14]:
#svr_lin = SVR(kernel='linear', C=1e2)


#mse = test_1(svr_lin, X_train, Y_train, k = 5)


#print("MSE: %.4f" % mse)

## Kernel Ridge Regression 

In [15]:
from sklearn.kernel_ridge import KernelRidge

kr_linear = KernelRidge(kernel='linear', gamma=0.1)
kr_rbf = KernelRidge(kernel='rbf', gamma=0.1)
kr_poly = KernelRidge(kernel='poly', gamma=0.1)




mse = test_1(kr_linear, X_train, Y_train, k = 5)
print("MSE(linear): %.4f\n" % mse)
mse = test_1(kr_rbf, X_train, Y_train, k = 5)
print("MSE(rbf): %.4f\n" % mse)
mse = test_1(kr_poly, X_train, Y_train, k = 5)
print("MSE(poly): %.4f\n" % mse)

1 MSE : 0.893011
2 MSE : 1.515477
3 MSE : 0.902242
4 MSE : 1.049398
5 MSE : 0.637348
6 MSE : 0.986137
7 MSE : 0.737359
8 MSE : 0.304507
MSE(linear): 0.8782

1 MSE : 39.539060
2 MSE : 13.976400
3 MSE : 33.979860
4 MSE : 8.680620
5 MSE : 23.571120
6 MSE : 8.902116
7 MSE : 8.774760
8 MSE : 18.147600
MSE(rbf): 19.4464

1 MSE : 186.174657
2 MSE : 13.541629
3 MSE : 0.696815
4 MSE : 218.403897
5 MSE : 116.406939
6 MSE : 52.827348
7 MSE : 37.781788
8 MSE : 0.169896
MSE(poly): 78.2504



Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 7.063116475076062e-18 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 8.662671966713258e-18 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 7.421107960819473e-18 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 9.028273418504924e-18 / 1.1102230246251565e-16


## Decision Tree Regression
### basic

In [16]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

In [17]:
regr_1 = DecisionTreeRegressor(max_depth=4)

mse = test_1(regr_1, X_train, Y_train, k = 5)

print("MSE: %.4f" % mse)



1 MSE : 0.408806
2 MSE : 3.002435
3 MSE : 1.799047
4 MSE : 0.126426
5 MSE : 1.556080
6 MSE : 3.159511
7 MSE : 0.197607
8 MSE : 0.022751
MSE: 1.2841


### with AdaBoost

In [18]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [19]:
rng = np.random.RandomState(1)


regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300, random_state=rng)


mse = test_1(regr_2, X_train, Y_train, k = 5)
print("MSE: %.4f" % mse)




1 MSE : 1.071451
2 MSE : 2.954303
3 MSE : 1.200964
4 MSE : 0.110209
5 MSE : 0.853677
6 MSE : 4.027590
7 MSE : 0.180564
8 MSE : 0.004225
MSE: 1.3004


## Gradient Boosting Regression

In [20]:
from sklearn import ensemble

In [21]:
params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 4,
          'learning_rate': 0.005, 'loss': 'ls'}

GBR = ensemble.GradientBoostingRegressor(**params)



mse = test_1(GBR, X_train, Y_train, k = 5)

print("MSE: %.4f" % mse)

1 MSE : 0.571345
2 MSE : 2.150664
3 MSE : 1.130804
4 MSE : 0.018795
5 MSE : 0.570633
6 MSE : 3.272268
7 MSE : 0.087305
8 MSE : 0.034945
MSE: 0.9796
