In [3]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import sys
sys.path.append('../Kernel')

import MySVM
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error, median_absolute_error
from sklearn import preprocessing

from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR
from sklearn.decomposition import PCA


# Metric callables used to score predictions. The order matters: later cells
# index this list positionally (metrics[0] is r2_score, metrics[2] is
# mean_squared_error).
metrics = [r2_score, mean_absolute_error, mean_squared_error, median_absolute_error]



In [4]:
#load the dataset and generate the dataframe
def load_data(data_set, data_path = '../Datasets/'):
    """Read a headerless CSV file and return it without its first column.

    Parameters
    ----------
    data_set : str
        File name of the CSV file, relative to ``data_path``.
    data_path : str, optional
        Directory that holds the data files. Defaults to ``'../Datasets/'``,
        the location that used to be hard-coded. A trailing path separator
        is optional.

    Returns
    -------
    pandas.DataFrame
        The file contents minus the first column. Because the file is read
        with ``header=None`` the remaining columns keep their integer labels
        (1, 2, ...).
    """
    import os  # local import so this cell does not depend on editing the import cell

    raw = pd.read_csv(os.path.join(data_path, data_set), header = None)
    # NOTE(review): the dropped first column is presumably a row identifier -- confirm.
    return raw.drop(raw.columns[[0]], axis=1)

In [5]:
# Scale the training set into [0, 1] and apply the same fitted scaling to the test set.

def scale(train, test):
    """Min-max scale ``train`` and apply the same fitted scaling to ``test``.

    The scaler is fitted on ``train`` only, so no information from ``test``
    is used when the scaling is learned.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test)``.
    """
    min_max = preprocessing.MinMaxScaler()
    min_max.fit(train)
    return min_max.transform(train), min_max.transform(test)

In [6]:
# generate the training and test set 

def get_sets(df_array, rd_state):
    """Split a data frame into scaled train/test features and their labels.

    The target is the column labelled ``1``; the features are every column
    except the first one by position.
    NOTE(review): these two selections only agree when the first column is
    the one labelled 1, which is what load_data produces -- confirm for any
    other input.

    Parameters
    ----------
    df_array : pandas.DataFrame
        Frame holding the target and the feature columns.
    rd_state : int
        Seed forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(train_scale, test_scale, trainlabel, testlabel)`` for an 80/20 split.
    """
    first_column = df_array.columns[[0]]
    features = np.array(df_array.drop(first_column, axis=1))
    target = np.array(df_array[1])

    split = train_test_split(features, target, test_size=0.2, random_state=rd_state)
    train, test, trainlabel, testlabel = split

    # Scaling is fitted on the training part only and then applied to both parts.
    train_scale, test_scale = scale(train, test)
    return train_scale, test_scale, trainlabel, testlabel

In [7]:
def rfrcv(n_estimators, max_features):
    """Return the mean 5-fold cross-validated R^2 of a random forest regressor.

    Used as the objective for the Bayesian optimisation, which proposes
    float values for both hyper-parameters.

    Parameters
    ----------
    n_estimators : float
        Number of trees; truncated to an int.
    max_features : float
        Fraction of features considered per split; capped at 0.999.

    Reads the notebook-level ``train`` and ``trainlabel``.
    """
    forest = rfr(n_estimators=int(n_estimators),
                 max_features=min(max_features, 0.999))
    fold_scores = cross_val_score(forest, train, trainlabel, scoring='r2', cv=5)
    return fold_scores.mean()

In [8]:
def initial_sets(id_ = 0, rd_state = 7):
    """Load one of the known data sets and return its scaled train/test split.

    Parameters
    ----------
    id_ : int
        Index into the list of known files: 0 is 'ann28_v1.csv',
        1 is 'dft_P_a_v1.csv'.
    rd_state : int
        Seed forwarded to get_sets for the train/test split.

    Returns
    -------
    tuple
        ``(train, test, trainlabel, testlabel)`` as produced by get_sets.
    """
    available = ('ann28_v1.csv', 'dft_P_a_v1.csv')
    frame = load_data(available[id_])
    train, test, trainlabel, testlabel = get_sets(frame, rd_state)
    return train, test, trainlabel, testlabel


In [9]:
def bayes_op():
    """Tune a random forest regressor with Bayesian optimisation.

    Maximises rfrcv (mean cross-validated R^2) over ``n_estimators`` in
    (10, 250) and ``max_features`` in (0.1, 0.999), prints the best score and
    builds an unfitted regressor from the best parameters.

    Returns
    -------
    tuple
        ``(optimizer, estimator_rfr)``: the BayesianOptimization object after
        ``maximize()`` and the random forest configured with the best
        parameters found.
    """
    rf_bo = BayesianOptimization(rfrcv,
                                 {'n_estimators': (10, 250),
                                  'max_features': (0.1, 0.999)},
                                 verbose = 1)
    rf_bo.maximize()

    best = rf_bo.res['max']
    print('RFC: %f' % best['max_val'])

    # The regressor class is imported under the alias ``rfr`` and the optimiser
    # lives in the local variable above; the previous code referenced the
    # undefined names RandomForestRegressor and rfBO and raised NameError here.
    best_params = best['max_params']
    estimator_rfr = rfr(max_features = best_params['max_features'],
                        n_estimators = int(best_params['n_estimators']))
    return rf_bo, estimator_rfr

#    print('-'*53)

In [10]:
def grid_svrop(score_method = 'r2', greater = True, bound=[-10,10,-10,10], verbose = 1):
    """Grid-search C and gamma for an RBF SVR and build the best estimator.

    Runs MySVM.SVM_CV().trainsvr on the notebook-level ``train`` and
    ``trainlabel`` with a 21 x 21 grid, then picks the grid cell with the best
    score.

    Parameters
    ----------
    score_method : str
        Scoring name forwarded to ``trainsvr`` as ``method``.
    greater : bool
        If true the largest score is the best one, otherwise the smallest.
    bound : sequence of four numbers
        ``[Cmin, Cmax, rmin, rmax]`` forwarded to ``trainsvr``. It is only
        read, never modified.
    verbose : int
        Forwarded to ``trainsvr``.

    Returns
    -------
    tuple
        ``(df, best_value, estimator_svr)``: the score table returned by
        ``trainsvr``, the best score in it, and an unfitted ``SVR`` configured
        with the matching C and gamma.

    Raises
    ------
    ValueError
        If the score table holds no usable (non-NaN) score.
    """
    searcher = MySVM.SVM_CV()
    df = searcher.trainsvr(train, trainlabel, seed='rbf',
                           Cmin=bound[0], Cmax=bound[1], numC=21,
                           rmin=bound[2], rmax=bound[3], numr=21,
                           degree=3, method=score_method, rad_stat=2,
                           verbose=verbose)

    # One row per gamma value; every column other than 'gamma_range' is labelled
    # with the C value it was scored at.
    gamma_values = np.array(df['gamma_range'])
    score_table = df.drop('gamma_range', axis=1)
    c_labels = score_table.columns.values.tolist()
    scores = np.asarray(score_table.values, dtype=float)

    # NaN-aware so that a single failed grid cell does not hide the real optimum.
    best_value = np.nanmax(scores) if greater else np.nanmin(scores)

    # Transposing makes argwhere walk the table column by column, so ties are
    # resolved the same way as the former nested loop (first C column, then
    # first gamma row). Rows are addressed by position.
    # NOTE(review): assumes trainsvr returns a default 0..n-1 row index -- confirm.
    hits = np.argwhere(scores.T == best_value)
    if len(hits) == 0:
        raise ValueError('grid search produced no usable score')
    c_position, gamma_position = hits[0]

    C_best = float(c_labels[c_position])
    gamma_best = float(gamma_values[gamma_position])
    estimator_svr = SVR(C=C_best, gamma=gamma_best)
    return df, best_value, estimator_svr

In [11]:
def apply_pca(train, test, number = 6):
    """Project ``train`` and ``test`` onto principal components fitted on ``train``.

    Parameters
    ----------
    train, test : array-like
        Feature matrices; the PCA is fitted on ``train`` only.
    number : int
        Number of components to keep.

    Returns
    -------
    tuple
        ``(train_projected, test_projected)``.
    """
    projector = PCA(n_components=number)
    projector.fit(train)
    return projector.transform(train), projector.transform(test)

In [12]:
# 1. initiate the training and test set:
def feature_eng(id_ = 0, method = 'None', rd_state = 7):
    """Build the train/test split for a data set, optionally PCA-transformed.

    Parameters
    ----------
    id_ : int
        Data set index forwarded to initial_sets. It used to be ignored
        because the call below hard-coded ``id_ = 0``.
    method : str
        ``'pca'`` projects the features onto 6 principal components; any other
        value (including the default ``'None'``) leaves the features as they
        are.
    rd_state : int
        Seed forwarded to initial_sets for the train/test split.

    Returns
    -------
    tuple
        ``(train, test, trainlabel, testlabel)``.
    """
    train, test, trainlabel, testlabel = initial_sets(id_ = id_, rd_state = rd_state)
    if method == 'pca':
        train, test = apply_pca(train, test, number = 6)
    return train, test, trainlabel, testlabel
       

In [13]:
def test_result(estimator_svr):

    estimator = estimator_svr
    str_metics = str(metrics)
    estimator.fit(train,trainlabel)
    predict_train = estimator.predict(train)
    predict_test = estimator.predict(test)

    
    return predict_train, predict_test
    
    
    
#     for i in metrics:
#         print (str(i)[10:19], i(trainlabel, predict_train))
#     print ('###################let me divided results here#####################')    
#     for i in metrics:
#         print (str(i)[10:19], i(testlabel, predict_test))

In [16]:
# Per-run accumulators: every pass of the loop below appends one entry to each.
rd_string = []
model_string = []
train_cv_string = []
train_r2_string = []
train_mse_string = []
test_r2_string = []
test_mse_string = []

df_grid = []

for i in np.arange(10,30,1):
    # grid_svrop and test_result read train/test/trainlabel/testlabel as
    # notebook-level names, so the split has to be bound here before they run.
    train, test, trainlabel, testlabel = feature_eng(id_ = 0, rd_state = i)
    df_grid_result, train_cv, estimator_svr = grid_svrop(
        score_method = 'neg_mean_squared_error',
        greater = True,
        bound = [-10, 10, -10, 10],
        verbose = 0)
    predict_train, predict_test = test_result(estimator_svr)

    rd_string.append(i)
    model_string.append(estimator_svr)
    train_cv_string.append(train_cv)
    df_grid.append(df_grid_result)

    # r2_score and mean_squared_error are metrics[0] and metrics[2].
    train_r2_string.append(r2_score(trainlabel, predict_train))
    train_mse_string.append(mean_squared_error(trainlabel, predict_train))
    test_r2_string.append(r2_score(testlabel, predict_test))
    test_mse_string.append(mean_squared_error(testlabel, predict_test))

    print ('The current rd_state is ', i)

# NOTE(review): 'test_msw' looks like a typo for 'test_mse'; the key is kept
# unchanged so that anything reading this column keeps working.
df_result = pd.DataFrame({
    'rd_state': rd_string,
    'model': model_string,
    'train_r2': train_r2_string,
    'train_mse': train_mse_string,
    'test_r2': test_r2_string,
    'test_msw': test_mse_string,
    'train_cv': train_cv_string,
})

('The current rd_state is ', 10)
('The current rd_state is ', 11)
('The current rd_state is ', 12)
('The current rd_state is ', 13)
('The current rd_state is ', 14)
('The current rd_state is ', 15)
('The current rd_state is ', 16)
('The current rd_state is ', 17)
('The current rd_state is ', 18)
('The current rd_state is ', 19)
('The current rd_state is ', 20)
('The current rd_state is ', 21)
('The current rd_state is ', 22)
('The current rd_state is ', 23)
('The current rd_state is ', 24)
('The current rd_state is ', 25)
('The current rd_state is ', 26)
('The current rd_state is ', 27)
('The current rd_state is ', 28)
('The current rd_state is ', 29)


In [17]:
df_result

Unnamed: 0,model,rd_state,test_msw,test_r2,train_cv,train_mse,train_r2
0,"SVR(C=1024.0, cache_size=200, coef0=0.0, degre...",10,5022.95891,0.531319,-63.004836,10.578704,0.98995
1,"SVR(C=1024.0, cache_size=200, coef0=0.0, degre...",11,769.34178,0.592252,-3024.801772,1.026907,0.999761
2,"SVR(C=512.0, cache_size=200, coef0=0.0, degree...",12,3035.476558,0.646227,-1019.822799,438.221808,0.822536
3,"SVR(C=1024.0, cache_size=200, coef0=0.0, degre...",13,2839.127798,0.671356,-1042.664996,581.460181,0.757091
4,"SVR(C=1024.0, cache_size=200, coef0=0.0, degre...",14,232.127944,-1.626251,-2956.869099,5.350264,0.99881
5,"SVR(C=512.0, cache_size=200, coef0=0.0, degree...",15,916.294376,0.578606,-3262.818368,93.173343,0.977734
6,"SVR(C=1024.0, cache_size=200, coef0=0.0, degre...",16,4718.72393,0.329144,-759.126075,556.116642,0.785562
7,"SVR(C=512.0, cache_size=200, coef0=0.0, degree...",17,934.796042,0.071766,-3318.413937,484.871786,0.893173
8,"SVR(C=1024.0, cache_size=200, coef0=0.0, degre...",18,245.367974,-0.199167,-2379.294385,737.207261,0.834283
9,"SVR(C=1024.0, cache_size=200, coef0=0.0, degre...",19,364.492564,0.787911,-2545.802385,298.027368,0.931664


In [None]:
# Re-evaluate the model stored in row 2 of df_result. This rebinds the
# notebook-level trainlabel/testlabel to the split that check_result returns.
# NOTE(review): check_result is defined in the cell after this one, so that
# cell has to be run first on a fresh kernel.
predict_train, predict_test, trainlabel, testlabel = check_result(2)

# Residuals on the test split (prediction minus label).
predict_test - testlabel

In [None]:
def check_result(id_):
    """Refit the model stored in one row of df_result and print its metrics.

    Parameters
    ----------
    id_ : int
        Row label in the notebook-level ``df_result``.

    Returns
    -------
    tuple
        ``(predict_train, predict_test, trainlabel, testlabel)`` for the split
        that the row was produced with.
    """
    # Rebuild the split from the seed recorded in the same row. The previous
    # ``id_ + 1`` did not match the seeds stored in df_result (10..29), so the
    # model was evaluated on a different split from the one it was tuned on.
    rd_state = int(df_result['rd_state'][id_])
    train, test, trainlabel, testlabel = feature_eng(method = 'None', rd_state = rd_state)

    # NOTE: fit() refits the estimator object stored in df_result in place.
    model_1 = df_result['model'][id_].fit(train, trainlabel)
    predict_train = model_1.predict(train)
    predict_test = model_1.predict(test)

    # Use the function's own name instead of slicing its repr, and label each
    # value with the split it belongs to.
    for split_name, truth, predicted in (('train', trainlabel, predict_train),
                                         ('test', testlabel, predict_test)):
        for metric in metrics:
            print('%s %s: %s' % (split_name, metric.__name__, metric(truth, predicted)))

    return predict_train, predict_test, trainlabel, testlabel
#r2_score(testlabel, predict_test)

In [None]:
''' the past 

# 2. training, choose rfBO or svr
#rfBO, estimator_rfr = bayes_op()

C_best,gamma_best, estimator_svr = grid_svrop(score_method = 'neg_mean_squared_error', greater = True,
                                              bound = [-10, 10, -10, 10], verbose = 0)
estimator_svr = SVR(C=C_best, gamma=gamma_best)
# apply the random forest model

estimator_rfr = RandomForestRegressor(max_features = (rfBO.res['max']['max_params']['max_features']), \
                                  n_estimators= int(rfBO.res['max']['max_params']['n_estimators']))

for i in np.arange(1,13,1):
    
    pca = PCA(n_components=i).fit(train)
#pca.explained_variance_ratio_
    print np.sum(pca.explained_variance_ratio_) '''
    
'''    

In [None]:
'''
rd_state = 3
SVR(C=512.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.125,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
'''