In [3]:
import pandas as pd 
import numpy as np 
from scipy.stats import norm
from scipy import stats
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
import matplotlib.pylab as plt
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('../data/preprocessed_data.csv')
df = df.astype(float)
df.dropna(inplace=True) 
df.reset_index(drop=True, inplace=True)
label = 'price'
y = df[label]
X = df.drop(columns=[label])

In [5]:
def ML_pipeline_kfold(X, y, random_state, n_folds): 
    
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state) 
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state) 
    
    CV_scores = []
    r2score=[]
    best_parameters = [] 

    for train_index, CV_index in kf.split(X_other, y_other): 
        
        X_train, X_CV = X_other.iloc[train_index.tolist()], X_other.iloc[CV_index.tolist()] 
        y_train, y_CV = y_other.iloc[train_index.tolist()], y_other.iloc[CV_index.tolist()] 

        parameters = np.logspace(-6,6,num=21) 
        
        regs = []
        CV_score = []
        train_score = []

        for i in parameters: 
            reg = Ridge(alpha = i)
            reg.fit(X_train, y_train) 
        
            train_score.append(mean_squared_error(y_train,reg.predict(X_train)))
            CV_score.append(mean_squared_error(y_CV,reg.predict(X_CV)))
            regs.append(reg)
        
        best_i = parameters[np.argmin(CV_score)] 
        best_parameters.append(best_i) 
        
        reg = regs[np.argmin(CV_score)]
        CV_scores.append(np.min(CV_score))
        
        r2score.append(reg.score(X_test,y_test))
        
        best_para = best_parameters[np.argmax(r2score)] 
        
    return best_para, min(CV_score), max(r2score), reg

In [6]:
test_scores_list = [] 
for i in range(10): 
    random_state = 23*(i+1)
    best_para, CV_score, r2_score, model = ML_pipeline_kfold(X, y, random_state, 5) 
    test_scores_list.append(r2_score) 
    print('random state = {}. Best alpha is {}. r2_score is {}'.format(random_state, best_para, test_scores_list[-1])) 
average_list = np.mean(test_scores_list) 
std_list = np.std(test_scores_list)
print('average:', np.around(average_list,3))
print('std:', np.around(std_list,3))
print('test accuracy score:', np.around(average_list,3), '+/-', np.around(std_list,3))

random state = 23. Best alpha is 15.84893192461111. r2_score is 0.12160591975234647
random state = 46. Best alpha is 15.84893192461111. r2_score is 0.11407015648554941
random state = 69. Best alpha is 15.84893192461111. r2_score is 0.1027119857807598
random state = 92. Best alpha is 15.84893192461111. r2_score is 0.1275992023411331
random state = 115. Best alpha is 15.84893192461111. r2_score is 0.07924757777978442
random state = 138. Best alpha is 63.0957344480193. r2_score is 0.12193687926847274
random state = 161. Best alpha is 15.84893192461111. r2_score is 0.09198932362076473
random state = 184. Best alpha is 63.0957344480193. r2_score is 0.18143288593204754
random state = 207. Best alpha is 63.0957344480193. r2_score is 0.14132986559266825
random state = 230. Best alpha is 63.0957344480193. r2_score is 0.09837637386193343
average: 0.118
std: 0.027
test accuracy score: 0.118 +/- 0.027
