In [1]:
import pandas as pd 
import numpy as np 
from scipy.stats import norm
from scipy import stats
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import matplotlib.pylab as plt
from sklearn.svm import SVR
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/preprocessed_data.csv')
df = df.astype(float)
df.dropna(inplace=True) 
df.reset_index(drop=True, inplace=True)
label = 'price'
y = df[label]
X = df.drop(columns=[label])

In [9]:
def ML_pipeline_kfold(X, y, random_state, n_folds): 
    
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state) 
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state) 
    
    CV_scores = []
    r2score=[]
    best_parameters = [] 

    for train_index, CV_index in kf.split(X_other, y_other): 
        
        X_train, X_CV = X_other.iloc[train_index.tolist()], X_other.iloc[CV_index.tolist()] 
        y_train, y_CV = y_other.iloc[train_index.tolist()], y_other.iloc[CV_index.tolist()] 

        parameters = [(i, j) for i in [1e-3, 1e-2, 1e-1] for j in [20, 40, 60]] 
        
        regs = []
        CV_score = []
        train_score = []

        for i, j in parameters: 
            reg = SVR(gamma=i, C=j, kernel='rbf')
            reg.fit(X_train, y_train) 
        
            train_score.append(mean_squared_error(y_train,reg.predict(X_train)))
            CV_score.append(mean_squared_error(y_CV,reg.predict(X_CV)))
            regs.append(reg)
        
        best_i = parameters[np.argmin(CV_score)][0] 
        best_j = parameters[np.argmin(CV_score)][1]
        best_parameters.append((best_i, best_j)) 
        
        reg = regs[np.argmin(CV_score)]
        CV_scores.append(np.min(CV_score))
        
        r2score.append(reg.score(X_test,y_test))
        
        best_para = best_parameters[np.argmax(r2score)] 
        
    return best_para, min(CV_score), max(r2score), reg, X_test, y_test

In [10]:
test_scores_list = [] 
for i in range(10): 
    random_state = 23*(i+1)
    best_para, CV_score, r2_score, model, X_test_res, y_test_res = ML_pipeline_kfold_random_forest(X, y, random_state, 5) 
    test_scores_list.append(r2_score) 
    print('random state = {}. Best gamma is {} & best C is {}. r2_score is {}'.format(random_state, best_para[0], best_para[1], test_scores_list[-1])) 
average_list = np.mean(test_scores_list) 
std_list = np.std(test_scores_list)
print('average:', np.around(average_list,3))
print('std:', np.around(std_list,3))
print('test accuracy score:', np.around(average_list,3), '+/-', np.around(std_list,3))

random state = 23. Best gamma is 0.1 & best C is 60. r2_score is 0.23325584679760092
random state = 46. Best gamma is 0.01 & best C is 60. r2_score is 0.025355414277116628
random state = 69. Best gamma is 0.1 & best C is 60. r2_score is 0.12812245974602054
random state = 92. Best gamma is 0.1 & best C is 60. r2_score is 0.03544061521102737
random state = 115. Best gamma is 0.1 & best C is 60. r2_score is 0.2534213233970548
random state = 138. Best gamma is 0.1 & best C is 60. r2_score is 0.23709414637607207
random state = 161. Best gamma is 0.1 & best C is 60. r2_score is 0.2616189796544022
random state = 184. Best gamma is 0.1 & best C is 60. r2_score is 0.2091404467804251
random state = 207. Best gamma is 0.1 & best C is 60. r2_score is 0.019958931985271433
random state = 230. Best gamma is 0.1 & best C is 60. r2_score is 0.24418454355811922
average: 0.165
std: 0.097
test accuracy score: 0.165 +/- 0.097
