In [1]:
############# BASIC PACKAGES TO IMPORT ############
import os
import pandas as pd #To allow us to work with dataframes
import numpy as np #To allow us to make mathematical transformations
import matplotlib.mlab as mlab #To create plots
import matplotlib.pylab as plt #To create plots
%matplotlib inline 
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4 #width x height in inches
from mpl_toolkits import mplot3d
import category_encoders as ce #To encode our nominal and categorical variables
from sklearn import preprocessing, metrics #This module can be helpful when processing data
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, RandomizedSearchCV, train_test_split
from scipy.stats import uniform, chi2_contingency, chisquare
import pickle
import plotly.express as px
import pylab as py
import warnings

import plotly.express as px
#import scipy as sp #To play with scikit-learn.

In [2]:
############# Models to import #############
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


import xgboost as xgb
from xgboost.sklearn import XGBRegressor
#from xgboost import XGBClassifier
import sys 
!{sys.executable} -m pip install xgboost


#Lift pandas' display truncation so wide/long frames are shown in full.
#NOTE: None means "no limit"; displaying a very large frame will print every row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)
#updating xgboost and scipy to get rid of an error (9/18/19)
#RUN IN TERMINAL
# pip install --upgrade pip
# pip install --upgrade xgboost 
# pip install --upgrade scipy
# pip install --upgrade scikit-learn
# pip install --upgrade plotly
# pip install --upgrade pydotplus 
# pip install --upgrade graphviz
# Use 'brew' instead of pip for updates to get the right packages on your computer.


#I don't believe that we need this if we export the entire sklearn library. I will comment these out 
#until it's time to start training models.
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score



In [3]:
#Package to visualize decision trees
#from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from subprocess import call
from IPython.display import Image
import graphviz

In [4]:
########### IMPORT SAVED VARIABLES FROM PREVIOUS SCRIPTS (Data Pipeline 2 Pt.2 Feature Engineering) ###########
#Open up the pickle file. The `with` block closes the file handle even when
#unpickling raises, which the manual open()/close() pair did not guarantee.
#NOTE(review): pickle.load can execute arbitrary code -- only load pickles produced by this project.
with open('DataPipeline2_pt2.pckl','rb') as f:
    pickle_list = pickle.load(f)

#Layout of pickle_list:
#[0] = train_X2
#[1] = train_y2
#[2] = test_X2
#[3] = test_IDs

#Pull out the important objects in the pickle file
train_X2 = pickle_list[0]
train_y2 = pickle_list[1]
test_X2 = pickle_list[2]
test_IDs = pickle_list[3]

In [5]:
########### IMPORTANT FUNCTIONS INVOLVING TRAIN/DEV SETS ###########

######### SPLITSTANDARD ##########
def SplitStandard(train_X, train_y, test_X, train_split = 0.70, random_seed = 7, 
                  Standardizer = True, scaler = preprocessing.StandardScaler(), Normalizer = False, SandN = False):
    """Split the training data into train/dev sets and optionally scale them.

    The features and target are joined, split with ``train_test_split`` and
    separated again, so every target value stays with its feature row. Any
    transformer is fitted on the new train split only and then applied to the
    dev and test sets, which keeps dev/test information out of the fit.

    Parameters
    ----------
    train_X : DataFrame
        Training features.
    train_y : Series
        Training target. The code selects it back out by the column name
        'SalePrice', so the target must carry that name.
    test_X : DataFrame
        Test features, with the same columns as ``train_X``.
    train_split : float
        Passed to ``train_test_split`` as ``train_size``; the remainder
        becomes the dev set.
    random_seed : int
        Passed to ``train_test_split`` as ``random_state`` so that the same
        value reproduces the same split.
    Standardizer : bool
        Scale the columns with ``scaler``. Takes precedence over
        ``Normalizer`` and ``SandN``; because it defaults to True, those two
        flags only take effect when ``Standardizer=False`` is passed.
    scaler : sklearn transformer
        Column-wise scaler (e.g. StandardScaler, MinMaxScaler, RobustScaler).
        NOTE: the default instance is created once, when the function is
        defined, and is re-fitted in place on every call that uses it.
    Normalizer : bool
        Normalize each row with ``preprocessing.Normalizer()``. Only checked
        when ``Standardizer`` is False.
    SandN : bool
        Standardize the columns first, then normalize the rows. Only checked
        when ``Standardizer`` and ``Normalizer`` are both False.

    Returns
    -------
    tuple
        ``(new_train_X, new_train_y, dev_X, dev_y, test_X)``. Transformed
        feature sets are DataFrames carrying the original column names and
        the row index of their input, so they stay label-aligned with the
        returned targets. With every flag False the untransformed split is
        returned.
    """
    #Combine the train_X and train_y dataframes. This is done so that when we split 
    #them into their new train / dev sets, the target variables stay with the appropriate 
    #feature vectors.
    combo_df = pd.concat([train_X,train_y],axis = 1)
    train, dev = train_test_split(combo_df,train_size=train_split,random_state=random_seed)

    #Split our data back into feature dataframes (new_train_X, dev_X) and the target Series (new_train_y, dev_y)
    new_train_X = train.loc[:,train.columns != 'SalePrice']
    new_train_y = train.loc[:,'SalePrice']
    dev_X = dev.loc[:,dev.columns != 'SalePrice']
    dev_y = dev.loc[:,'SalePrice']
    columns = new_train_X.columns

    #Pick the transformers to apply, in order. The flag precedence
    #(Standardizer, then Normalizer, then SandN) matches the original if/elif chain.
    if Standardizer:
        steps = [scaler]
    elif Normalizer:
        steps = [preprocessing.Normalizer()]
    elif SandN:
        steps = [scaler, preprocessing.Normalizer()]
    else:
        #If we don't standardize or normalize, then we will just return the regular split dataframes
        return new_train_X, new_train_y, dev_X, dev_y, test_X

    def _as_frame(values, source):
        #Transformers return bare arrays; restore the column names and the row index of
        #the input so the frame still lines up with its target Series by label.
        return pd.DataFrame(values, columns=columns, index=getattr(source, 'index', None))

    #Fit every step on the train split only, then reuse the fitted step on dev and test.
    out_train, out_dev, out_test = new_train_X, dev_X, test_X
    for step in steps:
        out_train = step.fit_transform(out_train)
        out_dev = step.transform(out_dev)
        out_test = step.transform(out_test)

    return (_as_frame(out_train, new_train_X), new_train_y,
            _as_frame(out_dev, dev_X), dev_y,
            _as_frame(out_test, test_X))


In [6]:
############# STANDARDIZE AND SPLIT THE DATA #############
#Both calls use SplitStandard's default train_split and random_seed on the same inputs,
#so the standardized and the unscaled variants contain the same train/dev rows.

#Standardized Split Data
strain_X, strain_y, sdev_X, sdev_y, stest_X = SplitStandard(
    train_X=train_X2,
    train_y=train_y2,
    test_X=test_X2,
)

#Regular Split Data 
#Kept so that the same model can also be run on non-standardized data.
train_X3, train_y3, dev_X, dev_y, test_X3 = SplitStandard(
    train_X=train_X2,
    train_y=train_y2,
    test_X=test_X2,
    Standardizer=False,
)

In [7]:
########### IMPORTANT FUNCTIONS INVOLVING TRAIN/DEV SETS PT.2 ###########
def TrainDevTestErrors(model,savefileName=None,save=True,train_X=strain_X,train_y=strain_y,dev_X=sdev_X,dev_y=sdev_y,
                       test_X=stest_X,t_IDs=test_IDs,metric=metrics.mean_squared_log_error):
    """Fit ``model`` and report its error and explained variance on train and dev.

    Optionally predicts on the test set and saves the predictions through
    ``SaveFitModels``.

    NOTE: the data defaults (``strain_X``, ``sdev_X``, ...) are bound when this
    cell is executed. After re-running the split cell, re-run this cell too,
    otherwise the defaults keep pointing at the old objects.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. It is fitted in place.
    savefileName : str or None
        File name handed to ``SaveFitModels``. Required when ``save`` is True.
    save : bool
        Whether to predict on ``test_X`` and save the predictions.
    train_X, train_y : training features / target.
    dev_X, dev_y : dev features / target.
    test_X : test features.
    t_IDs : IDs for the test rows, forwarded to ``SaveFitModels``.
    metric : callable
        ``metric(y_true, y_pred)`` used as the error measure.

    Returns
    -------
    tuple
        ``(train_error, train_explained_var, dev_error, dev_explained_var)``.

    Raises
    ------
    ValueError
        If ``save`` is True but no ``savefileName`` was given.
    """
    #Fail before the (potentially slow) fit instead of after it: with the default
    #savefileName=None the save step cannot build a file path.
    if save and savefileName is None:
        raise ValueError("savefileName must be provided when save=True")

    #Fit the model to the training data 
    model_fit = model.fit(train_X,train_y)

    def _evaluate(features, target):
        #Predict on one data set and score the predictions against its target.
        predictions = model_fit.predict(features)
        return metric(target,predictions), metrics.explained_variance_score(target,predictions)

    #Compute the error and explained variance on the training set, then on the dev set.
    train_error, train_explained_var = _evaluate(train_X,train_y)
    dev_error, dev_explained_var = _evaluate(dev_X,dev_y)

    #If save = True, create predictions on the test set, and save the predictions using SaveFitModels() 
    if save:
        test_pred = model_fit.predict(test_X)
        SaveFitModels(test_pred,t_IDs,savefileName)

    return train_error, train_explained_var, dev_error, dev_explained_var






def XGBTrainDevComparisons(xgb_model, xgb_param, xgb_parameter_values, metric=metrics.mean_squared_log_error, 
                           train_X=strain_X, train_y=strain_y, dev_X=sdev_X, dev_y=sdev_y,exp_var=False):
    """Plot train and dev error of a model for each value of one parameter.

    For every value in ``xgb_parameter_values`` the parameter ``xgb_param`` is
    set on the model, the model is fitted on the training set, and predictions
    on the train and dev sets are scored. The errors are shown as two scatter
    plots; the explained variances are plotted as well when ``exp_var`` is True.

    NOTE: ``set_params`` is called on the ``xgb_model`` object that is passed
    in, so the caller's model is modified and is left holding the last value
    tried. The data defaults are bound when this cell is executed.

    Parameters
    ----------
    xgb_model : estimator
        The XGBRegressor (or any estimator with ``set_params``/``fit``/``predict``).
    xgb_param : str
        Name of the parameter that is varied.
    xgb_parameter_values : iterable
        Values to try for ``xgb_param`` (list, array or Series, e.g. the
        output of ``TDComp``). Values are taken in iteration order.
    metric : callable
        ``metric(y_true, y_pred)`` used as the error measure.
    train_X, train_y : training features / target.
    dev_X, dev_y : dev features / target.
    exp_var : bool
        Whether to also plot the explained variance on train and dev.

    Returns
    -------
    None
    """
    #Materialize the values once. Iterating (instead of indexing with [i]) also works
    #for a Series whose index is not 0..n-1.
    param_values = list(xgb_parameter_values)

    train_error_arr = []
    dev_error_arr = []
    #One (parameter value, explained variance, set name) row per fit and data set.
    #Collected in a list and converted once, rather than growing a DataFrame row by row.
    exp_var_rows = []

    #Loop through the different parameters to try.
    for value in param_values:
        #Update the model to have the parameter that we need, then fit it.
        new_model = xgb_model.set_params(**{xgb_param:value})
        new_model.fit(train_X,train_y)

        #Dealing with the training set.
        train_pred = new_model.predict(train_X)
        train_error_arr.append(metric(train_y,train_pred))
        exp_var_rows.append((value, metrics.explained_variance_score(train_y,train_pred), 'training'))

        #Dealing with the dev set.
        dev_pred = new_model.predict(dev_X)
        dev_error_arr.append(metric(dev_y,dev_pred))
        exp_var_rows.append((value, metrics.explained_variance_score(dev_y,dev_pred), 'dev'))

    exp_var_arr = pd.DataFrame(exp_var_rows, columns=[xgb_param,'Explained Variance','Type'])

    #Create the figures.
    rcParams['figure.figsize'] = 12, 4 #width x height in inches

    #Create the first plot - training errors              
    df1 = pd.DataFrame(data=list(zip(param_values,train_error_arr)),columns=[xgb_param,'Training Error'])
    fig1 = px.scatter(df1, x=df1.columns[0], y='Training Error', color='Training Error')

    #Create the second plot - dev errors
    df2 = pd.DataFrame(data=list(zip(param_values,dev_error_arr)),columns=[xgb_param,'Dev Error'])
    fig2 = px.scatter(df2, x=df2.columns[0], y='Dev Error', color='Dev Error')

    fig1.show()
    fig2.show()

    #If we want to show explained variance, we will plot this.
    if exp_var:
        #Create the third plot - explained variance score
        fig3 = px.scatter(exp_var_arr, x=exp_var_arr.columns[0], y='Explained Variance', color='Type')
        fig3.show()

    return None

In [8]:
########## OTHER IMPORTANT FUNCTIONS ###########
#Random Forest Regressor model fit function
def RFRmodelfitCV(alg, train_X, train_y, performCV=True, printFeatureImportance=True, cv_folds=5):
    """Fit ``alg`` on the training data and print a short model report.

    The report shows the mean squared log error and explained variance of the
    predictions on the training data itself. With ``performCV`` it also prints
    mean/std/min/max of the ``cross_val_score`` results (scoring
    'neg_mean_squared_log_error', ``cv_folds`` folds). With
    ``printFeatureImportance`` it draws a bar chart of the 30 largest
    ``alg.feature_importances_``. Returns None.
    """
    #Train on the full training set and predict that same data back.
    alg.fit(train_X,train_y)
    fitted_values = alg.predict(train_X)

    #Cross-validation runs before anything is printed.
    if performCV:
        fold_scores = cross_val_score(alg, train_X, train_y, cv = cv_folds, scoring='neg_mean_squared_log_error')

    #Model report on the training-set predictions.
    print("\nModel Report")
    training_msle = metrics.mean_squared_log_error(train_y, fitted_values)
    print("Mean Squared Log Error : %.4g" % training_msle)
    training_exp_var = metrics.explained_variance_score(train_y, fitted_values) #1.0 is the best value
    print("Explained Variance Score : %.4g" % training_exp_var)

    if performCV:
        #Summary statistics over the per-fold scores, in the order the message expects.
        fold_summary = tuple(stat(fold_scores) for stat in (np.mean, np.std, np.min, np.max))
        print("CV Scores \nMean : %.7g | Std : %.7g | Min : %.7g | Max : %.7g" % fold_summary)

    if not printFeatureImportance:
        return

    #Bar chart of the 30 most important features, largest first.
    importances = pd.Series(alg.feature_importances_,train_X.columns)
    top_features = importances.sort_values(ascending=False).iloc[:30]
    top_features.plot(kind='bar', title = 'Feature Importances')
    plt.ylabel('Feature Importance Score')



########### MODELFITXGB ###########
#Same as above function, however it works for XGBoost and can be used to 
#find the best n_estimators value at the start of the program      
def modelfitXGB(alg, train_X, train_y, useTrainCV=True, printFeatureImportance=True, cv_folds=3, early_stopping_rounds=50):
    """Fit an XGBoost sklearn-style model and print a short model report.

    Parameters
    ----------
    alg : XGBRegressor-like estimator
        Must expose ``get_xgb_params``, ``get_params``, ``set_params``, ``fit``,
        ``predict`` and ``get_booster``. It is modified and fitted in place.
    train_X : DataFrame
        Training features.
    train_y : Series
        Training target.
    useTrainCV : bool
        When True, run ``xgb.cv`` (rmse metric, early stopping) with the
        model's current parameters, using its current ``n_estimators`` as the
        maximum number of boosting rounds, and set ``n_estimators`` to the
        number of rounds in the returned result.
    printFeatureImportance : bool
        Draw a bar chart of the 30 highest feature scores of the fitted booster.
    cv_folds : int
        Number of folds for ``xgb.cv``.
    early_stopping_rounds : int
        Passed to ``xgb.cv``.

    Returns
    -------
    None. The report is printed and the chart is drawn as side effects.
    """
    #This will fit the train data to the xgboost model and cross-validate 
    #on the data until the error rate stops improving. This will find an 
    #appropriate value of n_estimators.
    if useTrainCV:
        xgb_params = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train_X.values,label=train_y.values)
        cvresult = xgb.cv(xgb_params,xgtrain,num_boost_round=alg.get_params()['n_estimators'],nfold=cv_folds,metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds,verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])
        #n_estimators is an integer; %d avoids the scientific notation %.4g produces from 10000 upwards.
        print("n_estimators: %d" % alg.get_params()['n_estimators'])

    #Fit Algorithm on the data.
    #No eval_metric is passed here: no eval_set is given, so there is nothing an evaluation
    #metric would be computed on, and recent XGBoost releases reject eval_metric in fit().
    alg.fit(train_X,train_y)

    #Predict training set
    train_predictions = alg.predict(train_X)

    #Print Model Report
    print("\nModel Report")
    print("Mean Squared Log Error : %.4g" % metrics.mean_squared_log_error(train_y, train_predictions))
    print("Explained Variance Score : %.4g" % metrics.explained_variance_score(train_y, train_predictions)) #1.0 is the best value

    #Print feature importances
    if printFeatureImportance:
        rcParams['figure.figsize'] = 12, 4 #width x height in inches
        feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False).head(30)
        feat_imp.plot(kind='bar',title = 'Feature Importances')
        #Same axis label as RFRmodelfitCV (was misspelled 'Feature Important Score').
        plt.ylabel('Feature Importance Score')

        

######### SAVEFITMODELS #########
#Save our predictions to the proper directory.
def SaveFitModels(pred, IDs, fileName, saveDirectory1 = '/Users/armenta/Kaggle/Housing Prices/Predictions 2/', 
                  saveDirectory2 ='/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS 2/'):
    """Write predictions and their IDs as a CSV file into two directories.

    Parameters
    ----------
    pred : 1-D array-like
        Predicted values; written to the 'SalePrice' column.
    IDs : 1-D array-like or Series
        Row identifiers; written to the 'Id' column. Paired with ``pred`` by
        position, so the index of a Series is ignored.
    fileName : str
        Name of the CSV file created inside each directory.
    saveDirectory1, saveDirectory2 : str
        Target directories. Both must already exist.
        NOTE(review): the defaults are absolute paths on one machine; pass
        explicit directories when running elsewhere.

    Raises
    ------
    ValueError
        If ``fileName`` is None, or if ``pred`` and ``IDs`` differ in length.

    NOTE(review): the DataFrame index is written as an unnamed first column,
    as before. If the consumer expects only 'Id' and 'SalePrice', confirm
    whether ``index=False`` is wanted.
    """
    if fileName is None:
        raise ValueError("fileName must be provided to save the predictions")

    #Pair predictions and IDs by position. The previous pd.concat(axis=1) aligned the two
    #by index label, which pads with NaN and misaligns rows when IDs has a non-default index.
    submission = pd.DataFrame({'Id': pd.Series(IDs).values,
                               'SalePrice': pd.Series(pred).values},
                              columns=['Id','SalePrice'])

    #Save the outputs. os.path.join copes with directories given with or without a trailing separator.
    for directory in (saveDirectory1, saveDirectory2):
        submission.to_csv(path_or_buf = os.path.join(directory, fileName))
    
    

   
    

def TDComp(model_results,column,number=10):
    """Collect the values of one column for the best-ranked search results.

    Takes the results table of a grid or randomized parameter search and
    returns the entries of ``column`` for every row whose ``rank_test_score``
    is at most ``number``. The output is meant to be passed as
    ``xgb_parameter_values`` to XGBTrainDevComparisons.

    Parameters
    ----------
    model_results : DataFrame
        Search results; must contain a 'rank_test_score' column.
    column : str
        Column to read, usually 'param_' + parameter name.
    number : int
        Highest rank to include. Rows are kept in the order they appear in
        ``model_results`` (they are not sorted by rank).

    Returns
    -------
    Series
        The selected values, re-indexed from 0.

    Example
    -------
    top_n_est = TDComp(model_results_dart1_2, 'param_n_estimators', number=10)
    """
    #Rows that made it into the requested top ranks.
    is_top_ranked = model_results['rank_test_score'] <= number
    selected = model_results.loc[is_top_ranked, column]

    #Drop the original row labels so the result is indexed 0..k-1.
    top_values = pd.Series(selected).reset_index(drop=True)
    return top_values



def Standardizer(train_X = train_X2, test_X = test_X2, StandardScaler = preprocessing.StandardScaler(), Standardizer = True, Normalizer = False, SandN = False):
    """Scale the full training set and the test set (no train/dev split).

    Every transformer is fitted on ``train_X`` only and then applied to
    ``test_X``, so no test information enters the fit.

    Parameters
    ----------
    train_X : DataFrame
        Training features.
    test_X : DataFrame
        Test features, with the same columns as ``train_X``.
    StandardScaler : sklearn transformer
        Column-wise scaler instance (e.g. StandardScaler, MinMaxScaler,
        RobustScaler). NOTE: the default instance is created once, when the
        function is defined, and is re-fitted in place on every call using it.
    Standardizer : bool
        Scale the columns with ``StandardScaler``. Takes precedence over
        ``Normalizer`` and ``SandN``; because it defaults to True, those two
        flags only take effect when ``Standardizer=False`` is passed.
        (Inside the function this name refers to the flag, not the function.)
    Normalizer : bool
        Normalize each row with ``preprocessing.Normalizer()``. Only checked
        when ``Standardizer`` is False.
    SandN : bool
        Standardize the columns first, then normalize the rows. Only checked
        when ``Standardizer`` and ``Normalizer`` are both False.

    Returns
    -------
    tuple
        ``(train_X, test_X)`` as DataFrames with the original column names and
        the row index of their input. With every flag False the inputs are
        returned unchanged, so callers can always unpack two values.

    NOTE: the data defaults (``train_X2``, ``test_X2``) are bound when this
    cell is executed.
    """
    #Save the column names so that we can convert the arrays to dataframes
    columns = train_X.columns

    #Pick the transformers to apply, in order. The flag precedence
    #(Standardizer, then Normalizer, then SandN) matches the original if/elif chain.
    if Standardizer:
        steps = [StandardScaler]
    elif Normalizer:
        steps = [preprocessing.Normalizer()]
    elif SandN:
        steps = [StandardScaler, preprocessing.Normalizer()]
    else:
        #Nothing requested: hand the data back untouched instead of implicitly returning None.
        return train_X, test_X

    #Fit every step on the training data only, then reuse the fitted step on the test data.
    out_train, out_test = train_X, test_X
    for step in steps:
        out_train = step.fit_transform(out_train)
        out_test = step.transform(out_test)

    #Transformers return bare arrays; restore the column names and the row index of the inputs.
    out_train = pd.DataFrame(out_train, columns=columns, index=getattr(train_X, 'index', None))
    out_test = pd.DataFrame(out_test, columns=columns, index=getattr(test_X, 'index', None))
    return out_train, out_test
    


In [9]:
############# STANDARDIZE THE DATA (NO SPLITS) #############
#All three calls use Standardizer's default train_X / test_X.
#Standardizer=False has to be passed explicitly for the other two modes, 
#because the Standardizer flag is checked first and defaults to True.

#Standardized Data
strain_X2, stest_X2 = Standardizer()

#Normalized Data
ntrain_X2, ntest_X2 = Standardizer(
    Standardizer=False,
    Normalizer=True,
)

#Standardized and Normalized Data
sntrain_X2, sntest_X2 = Standardizer(
    Standardizer=False,
    SandN=True,
)

In [10]:
########## OTHER IMPORTANT FUNCTIONS pt.2 ###########
################### XGBRModelTune Function ###################
def XGBRModelTune(xgb_alg, xgb_param, xgb_param_vals, train_X=strain_X2, train_y=train_y2, test_X=stest_X2, 
                  cv_num=3, scoring='neg_mean_squared_log_error',Randomized = False, n_iter = 10, 
                  plot2d = True, modelfit = False):
    """Tune a single XGBoost parameter with a grid or randomized search.

    One parameter is searched per call. ``objective`` and ``booster`` have so
    few possible values that they are meant to be tested manually rather than
    through this function.

    Parameters
    ----------
    xgb_alg : XGBRegressor
        The estimator with its starting parameters.
    xgb_param : str
        Name of the parameter being tuned. Typical candidates:
          * learning_rate - the learning rate of the algorithm.
          * n_estimators - the number of trees in the ensemble.
          * max_depth - maximum depth allowed for an individual tree.
          * min_child_weight - minimum sum of weights required in a child node.
          * gamma - minimum loss reduction required for a node to split.
          * subsample - fraction of samples used for each tree.
          * colsample_bytree - fraction of columns used for each tree.
          * reg_lambda - L2 regularization term on weights.
          * reg_alpha - L1 regularization term on weights.
    xgb_param_vals : sequence or distribution
        Values to search. For a randomized search over a list, keep the list
        at least as long as ``n_iter``.
    train_X, train_y : training features / target.
    test_X : test features. Currently unused by this function.
    cv_num : int
        Number of cross-validation folds.
    scoring : str
        Scoring name handed to the search object.
    Randomized : bool
        Use RandomizedSearchCV instead of GridSearchCV.
        NOTE(review): no random_state is set, so randomized runs are not
        reproducible from one call to the next.
    n_iter : int
        Number of parameter settings sampled; only used when ``Randomized``.
    plot2d : bool
        Show a scatter plot of mean_test_score against the parameter values.
    modelfit : bool
        Set the best value on ``xgb_alg`` (modifying the caller's estimator)
        and run ``modelfitXGB`` to show feature importances.

    Returns
    -------
    list
        ``[search_model, results_dataframe, best_params_, best_score_]``.

    Side effects: FutureWarnings are silenced globally, the best parameter and
    score are printed, and the data defaults are bound when this cell is
    executed.
    """
    import inspect

    #This prevents us from getting warnings that are unnecessary and don't add to anything.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    #The two search types differ only in their class and in how the candidates are passed.
    if Randomized:
        search_cls = RandomizedSearchCV
        search_kwargs = {'param_distributions': {xgb_param:xgb_param_vals}, 'n_iter': n_iter}
    else:
        search_cls = GridSearchCV
        search_kwargs = {'param_grid': {xgb_param:xgb_param_vals}}

    #`iid` was removed from scikit-learn's search classes; pass iid=False only where it is
    #still accepted, so old installs keep the original scoring and new ones do not raise.
    if 'iid' in inspect.signature(search_cls.__init__).parameters:
        search_kwargs['iid'] = False

    #Create the search object and fit it on the training data.
    search_model = search_cls(estimator = xgb_alg,scoring = scoring,n_jobs = -1,cv = cv_num,**search_kwargs)
    search_model.fit(train_X,train_y)

    #These variables will be returned along with the model.
    search_results = pd.DataFrame(search_model.cv_results_) #The results of the search
    best_param_val = search_model.best_params_ #The best parameter value
    best_score_val = search_model.best_score_ #The best score associated with the best parameter value

    #Store the returned values in a single list 
    return_values = [search_model,search_results,best_param_val,best_score_val]
    print(best_param_val, best_score_val)

    #Create a 2d plot of mean_test_score (y) vs parameter values (x)
    if plot2d:
        rcParams['figure.figsize'] = 12, 4 #width x height in inches
        param_name = 'param_'+ xgb_param
        fig = px.scatter(search_results,x=param_name,y='mean_test_score',color='mean_test_score')
        fig.show()

    #Create a bar plot showing the weights of the most important features so far. 
    if modelfit:
        p_dict = {xgb_param:best_param_val[xgb_param]}
        xgb_alg.set_params(**p_dict)
        modelfitXGB(xgb_alg,train_X,train_y,cv_folds=cv_num)

    return return_values 
    
    
    
def TrainTestErrors(model,savefileName=None,save=True,train_X=strain_X2,train_y=train_y2,test_X=stest_X2,
                    t_IDs=test_IDs,metric=metrics.mean_squared_log_error):
    #################################################### FUNCTION DESCRIPTION ##############################################
    #Fits `model` on the training data and scores it on that SAME training data, so both returned
    #numbers are in-sample (no held-out/dev set is evaluated here).
    #When `save` is truthy, the fitted model also predicts on test_X and those predictions are handed
    #to SaveFitModels() together with t_IDs and savefileName.
    #NOTE: the default datasets (strain_X2, train_y2, stest_X2, test_IDs) are bound when this `def` is executed;
    #re-assigning those notebook variables later does not change the defaults until this cell is re-run.
    #################################################### VARIABLE DEFINITIONS ##############################################
    #***model = estimator with fit(X,y) and predict(X); fit() must return the fitted estimator.
    #***savefileName = file name forwarded unchanged to SaveFitModels() (including the default None).
    #***save = if truthy, predict on test_X and pass the predictions to SaveFitModels().
    #***train_X = training features.
    #***train_y = training target.
    #***test_X = test features; only used when save is truthy.
    #***t_IDs = IDs forwarded to SaveFitModels() alongside the test predictions.
    #***metric = callable(y_true, y_pred) used to compute the training error.
    #################################################### RETURNS ###########################################################
    #***(train_error, train_explained_var) = metric(train_y, predictions on train_X) and
    #   metrics.explained_variance_score(train_y, predictions on train_X).

    #Fit once, then reuse the fitted estimator for every prediction below.
    fitted = model.fit(train_X,train_y)
    fitted_train = fitted.predict(train_X)

    #Score the training predictions first, so a failing metric stops us before anything is saved.
    scores = metric(train_y,fitted_train), metrics.explained_variance_score(train_y,fitted_train)

    if not save:
        return scores

    #Test-set predictions are only produced when they are going to be saved.
    SaveFitModels(fitted.predict(test_X),t_IDs,savefileName)
    return scores

In [12]:
############# RIDGE REGRESSION ############
###### NOTE: THIS MODEL WAS USED BEFORE I MADE STANDARDSCALER ########
######## Ridge Regression with GridSearchCV ########
# alpha_val = [0.001,0.005,0.01,0.025,0.05,0.075,0.1,0.25,0.5,0.75,1,2]
# param_test1 = {'alpha':alpha_val}
# RidgeGridSearch1 = GridSearchCV(estimator = Ridge(normalize=True),param_grid=param_test1,
#                                         scoring='neg_mean_squared_error',n_jobs=-1,iid=False,cv=5)
# RidgeGridSearch1.fit(train_X2,train_y2)
# Ridge1_Results = pd.DataFrame(RidgeGridSearch1.cv_results_)
# Ridge1_Results.loc[:,('params','mean_test_score','std_test_score')]
# RidgeGridSearch1.best_params_, RidgeGridSearch1.best_score_

######## Ridge Regression with different alpha values, done by hand without the grid search
######## alpha = 0.001 #########
# ridge_reg1 = Ridge(alpha=0.001)
# ridge_model1 = ridge_reg1.fit(train_X2,train_y2)
# ridge_predictions1 = ridge_model1.predict(test_X2)
# SaveFitModels(pred=ridge_predictions1, IDs=test_IDs, fileName = 'Ridge1_10222019.csv')


######## alpha = 0.001, normalize = True #########
# ridge_reg2 = Ridge(alpha=0.001,normalize=True)
# ridge_model2 = ridge_reg2.fit(train_X2,train_y2)
# ridge_predictions2 = ridge_model2.predict(test_X2)
# SaveFitModels(pred=ridge_predictions2, IDs=test_IDs, fileName = 'Ridge2_10222019.csv')

######## alpha = 0.025, normalize = True ########
# ridge_reg3 = Ridge(alpha=0.025,normalize=True)
# ridge_model3 = ridge_reg3.fit(train_X2,train_y2)
# ridge_predictions3 = ridge_model3.predict(test_X2)
# SaveFitModels(pred=ridge_predictions3, IDs=test_IDs, fileName = 'Ridge3_10222019.csv')


In [14]:
############ RANDOM FOREST REGRESSOR ############
###### NOTE: THIS MODEL WAS USED BEFORE I MADE STANDARDSCALER ########
#rcParams['figure.figsize'] = 12, 4 #width x height in inches

######## The Beginner Model (NO TUNING)#########
# rf_0 = RandomForestRegressor(random_state=5)
# # RFRmodelfitCV(rf_0,train_X2,train_y2)
# rf_0_model = rf_0.fit(train_X2,train_y2)
# rf_0_pred = rf_0_model.predict(test_X2)
# SaveFitModels(pred=rf_0_pred,IDs=test_IDs,fileName='RandomForestRegressor0_10222019.csv')


############## TUNE N_ESTIMATORS, MAX_DEPTH, MIN_SAMPLES_LEAF #############
###########################################################################
####### ALREADY RAN ONCE, NOW LETS DO GRIDSEARCHCV WITH THESE VALUES ######
###########################################################################
# # r_exp1 = -4*np.random.rand(100)
# # learn_rate1 = 10**r_exp1
# n_est1 = range(20,151,10)
# max_depth1 = range(3,11,1)
# msl1 = range(10,101,10)
# param_test1 = {'n_estimators':n_est1,'max_depth':max_depth1,'min_samples_leaf':msl1}
# rfr_search1 = RandomizedSearchCV(estimator = RandomForestRegressor(random_state=5,min_samples_split=14,max_features='sqrt'),
#                               param_distributions = param_test1,n_iter=60,scoring='neg_mean_squared_log_error',
#                                  n_jobs=-1,iid=False,cv=5)
# rfr_search1.fit(train_X2,train_y2)
# rfr_results1 = pd.DataFrame(rfr_search1.cv_results_)
# #rfr_results1.loc[:,('params','mean_test_score','std_test_score')]
# rfr1_plot3D = rfr_results1.loc[:,('param_n_estimators','param_min_samples_leaf','param_max_depth','mean_test_score')]
# rfr_search1.best_params_,rfr_search1.best_score_
###############################################################################
####### ALREADY RAN ONCE, NOW LETS DO GRIDSEARCHCV WITH THESE VALUES ##########
###############################################################################


########### Save these variables so that we do not need to run the randomized search
# #pickle_list2 = [rfr1_plot3D,rfr_search1.best_params_,rfr_search1.best_score_]
# #f = open('/Users/armenta/Kaggle/Housing Prices/Data Pipeline 2 Saved Variables/DP2_rfr1.pckl','wb')
# #pickle.dump(pickle_list2,f)
# #f.close()


########### Open up these variables to look at them in a scatter plot
# f = open('/Users/armenta/Kaggle/Housing Prices/Data Pipeline 2 Saved Variables/DP2_rfr1.pckl','rb')
# pickle_list2 = pickle.load(f)
# f.close()
# rfr1_plot3D = pickle_list2[0]
# rfr1_best_params_ = pickle_list2[1]
# rfr1_best_score_ = pickle_list2[2]


########## MATPLOTLIB 3D SCATTER ########
#########################################
# fig1 = plt.figure()
# ax1 = fig1.add_subplot(111,projection='3d')
# zdata1 = rfr1_plot3D.param_n_estimators
# ydata1 = rfr1_plot3D.param_min_samples_leaf
# xdata1 = rfr1_plot3D.param_max_depth
# cdata1 = rfr1_plot3D.mean_test_score
# plt1 = ax1.scatter(xdata1,ydata1,zdata1,c=cdata1,s=50,depthshade=True)
# fig1.colorbar(plt1)


########## PLOTLY.EXPRESS 3D SCATTER #########
##############################################
# fig = px.scatter_3d(rfr1_plot3D,x='param_n_estimators',y='param_min_samples_leaf',
#                     z='param_max_depth',color='mean_test_score')
# fig.show()


########### GridSearchCV w/ n_estimators, max_depth, min_samples_leaf ###########
##################################################################################
# n_est2 = range(80,201,5)
# max_depth2 = range(8,15,1)
# msl_2 = range(5,16,1)
# param_test2 = {'n_estimators':n_est2,'max_depth':max_depth2,'min_samples_leaf':msl_2}
# rfr_gsearch1 = GridSearchCV(estimator = RandomForestRegressor(random_state=5,min_samples_split=14,max_features='sqrt'),
#                            param_grid = param_test2, scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# rfr_gsearch1.fit(train_X2,train_y2)
#
###################Pull out the relevant variables that will be graphed.
# gs_results1 = pd.DataFrame(rfr_gsearch1.cv_results_)
# rfr_results2 = gs_results1.loc[:,('param_n_estimators','param_min_samples_leaf','param_max_depth','mean_test_score')]
# #gs_results1.loc[:,('params','mean_test_score','std_test_score')]
# rfr_gsearch1.best_params_, rfr_gsearch1.best_score_
#
################## Plot the variables in a 3d scatter plot 
# fig = px.scatter_3d(rfr_results2,x='param_n_estimators',y='param_min_samples_leaf',
#                    z='param_max_depth',color='mean_test_score')
# fig.show()


########### GridSearchCV w/ n_estimators, min_samples_leaf, min_samples_split ####
##################################################################################
# n_est3 = range(175,251,5)
# msl_3 = range(2,8,1)
# mss_3 = range(5,21,1)
# param_test3 = {'n_estimators':n_est3,'min_samples_leaf':msl_3,'min_samples_split':mss_3}
# rfr_gsearch2 = GridSearchCV(estimator = RandomForestRegressor(random_state=5,max_depth=13,max_features='sqrt'),
#                            param_grid = param_test3, scoring = 'neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# rfr_gsearch2.fit(train_X2,train_y2)
#
#gs_results2 = pd.DataFrame(rfr_gsearch2.cv_results_)
#rfr_results3 = gs_results2.loc[:,('param_n_estimators','param_min_samples_leaf','param_min_samples_split','mean_test_score')]
#rfr_gsearch2.best_params_, rfr_gsearch2.best_score_
#
# fig = px.scatter_3d(rfr_results3,x='param_n_estimators',y='param_min_samples_leaf',
#                     z='param_min_samples_split',color='mean_test_score')
# fig.show()


################ GridSearchCV w/ min_samples_split, max_features #################
##################################################################################
# mss_4 = range(3,11,1)
# mf_4 = range(6,31,2)
# param_test4 = {'min_samples_split':mss_4,'max_features':mf_4}
# rfr_gsearch3 = GridSearchCV(estimator = RandomForestRegressor(random_state=5,max_depth=13,n_estimators=225,min_samples_split=2),
#                            param_grid = param_test4, scoring = 'neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# rfr_gsearch3.fit(train_X2,train_y2)
#
# gs_results3 = pd.DataFrame(rfr_gsearch3.cv_results_)
# rfr_results4 = gs_results3.loc[:,('param_min_samples_split','param_max_features','mean_test_score')]
# rfr_gsearch3.best_params_, rfr_gsearch3.best_score_
#
# fig = px.scatter(rfr_results4,x='param_min_samples_split',y='param_max_features',
#                  color='mean_test_score')
# fig.show()


#################### TUNED RANDOMFORESTREGRESSOR MODEL #######################
#rfr_tuned = RandomForestRegressor(random_state=5,max_depth=13,n_estimators=225,min_samples_leaf=2,
#                                 min_samples_split=4,max_features=26)
# #modelfitCV(rfr_tuned,train_X2,train_y2)
# rfr_model4 = rfr_tuned.fit(train_X2,train_y2)
# rfr_pred4 = rfr_model4.predict(test_X2)
# SaveFitModels(pred=rfr_pred4,IDs=test_IDs,fileName='RandomForestRegressor4_10232019.csv')


############### VISUALIZING ONE OF THE TREES IN A RANDOMFOREST ##############
#NOTE: A RANDOM FOREST IS AN ENSEMBLE METHOD: NO SINGLE ESTIMATOR IS USED ON ITS OWN;
#ALL OF THEM ARE COMBINED TO CREATE A BETTER MODEL THAN ANY SINGLE TREE.
# rfr_tuned = RandomForestRegressor(random_state=5,max_depth=13,n_estimators=225,min_samples_leaf=2,
#                                  min_samples_split=4,max_features=26)
# rfr_tuned.fit(train_X2,train_y2)
# estimator = rfr_tuned.estimators_[1] #There are 225 trees in the random forest.
#
# export_graphviz(estimator,out_file='rfr_tree1.dot',feature_names=train_X2.columns,
#                 class_names='SalePrice',rounded=True,proportion=False,precision=2,filled=True)
#
# call(['dot','-Tpng', 'rfr_tree1.dot', '-o', 'rfr_tree1.png'])
#
# Image(filename = 'rfr_tree1.png')

In [26]:
############ XGBRegressor (Standardized Train/Dev Sets Split) #############
#****NOTE: I changed the inputs of the XGBRModelTune to be different datasets, 
# so change the inputs of the functions if you want to rerun this code.


# warnings.simplefilter(action='ignore', category=FutureWarning)
# rcParams['figure.figsize'] = 12, 4 #width x height in inches

########################### PART I ############################

########## Round I - n_estimators #########
# xgb1 = XGBRegressor(learning_rate=0.1, n_estimators=1000, max_depth=6, min_child_weight=4, subsample=0.6, 
#                    colsample_bytree=0.109, objective='reg:squarederror',seed=20)
#modelfitXGB(xgb1,strain_X,strain_y)

########## Example of comparing train and dev sets #########
# xgb1_model = xgb1.fit(strain_X,strain_y)
# train_p1 = xgb1_model.predict(strain_X)
# train_error = metrics.mean_squared_log_error(strain_y,train_p1)
# dev_p1 = xgb1_model.predict(sdev_X)
# dev_error = metrics.mean_squared_log_error(sdev_y,dev_p1)
# print(train_error, dev_error)
# xgb1_predict = xgb1_model.predict(stest_X)
# SaveFitModels(pred=xgb1_predict,IDs=test_IDs,fileName='xgb1_11062019.csv')

########## Round II - max_depth ##########
# #Instantiate the new model with updated n_estimators value
# xgb2 = XGBRegressor(learning_rate=0.1, n_estimators=156, min_child_weight=4, subsample=0.6, colsample_bytree=0.109,
#                    objective='reg:squarederror',seed=20)
# #Create the range that the grid search will be performed over 
# max_depth_range = range(2,16,1)
# model2, model_results2, best_max_depth, best_score2 = XGBRModelTune(xgb_alg=xgb2, xgb_param = 'max_depth',
#                                                                     xgb_param_vals = max_depth_range, modelfit = False)

######### Round III - min_child_weight ##########
# #Instantiate the new model with updated max_depth
# xgb3 = XGBRegressor(learning_rate=0.1, n_estimators=156, max_depth=4, subsample=0.6, colsample_bytree=0.109, 
#                    objective ='reg:squarederror',seed=20)
# #Create the range that the grid search will be performed over
# min_child_weight_range = range(1,11,1)
# model3, model_results3, best_min_child_weight, best_score3 = XGBRModelTune(xgb3,'min_child_weight',min_child_weight_range,
#                                                                           modelfit = False)

######### Round IV - gamma #########
# #Instantiate the new model with updated min_child_weight
# xgb4 = XGBRegressor(learning_rate=0.1,n_estimators=156,max_depth=4,min_child_weight=4,subsample=0.6,
#                     colsample_bytree=0.109,objective='reg:squarederror',seed=20)
# #Create the range that the grid search will be performed over 
# gamma_range = range(0,21,1)
# model4, model_results4, best_gamma, best_score4 = XGBRModelTune(xgb4,'gamma',gamma_range, modelfit=False)

######### Round IV - gamma pt.2 #########
# #Copy and pasted the one above because its the exact same parameters.
# xgb4 = XGBRegressor(learning_rate=0.1,n_estimators=156,max_depth=4,min_child_weight=4,subsample=0.6,
#                     colsample_bytree=0.109,objective='reg:squarederror',seed=20)
# exp_gamma = -2*np.random.rand(60) #Randomly sample the exponent for gammas range
# gamma_range = 10**exp_gamma #Randomly picks values between 0.01 -> 1
# n_iterations = len(gamma_range2) - 10
#
# model4, model_results4, best_gamma, best_score = XGBRModelTune(xgb4,'gamma',gamma_range,Randomized=True,
#                                                                  n_iter = n_iterations,modelfit=False)

######### Round V - subsample #########
# xgb5 = XGBRegressor(learning_rate=0.1,n_estimators=156,max_depth=4,min_child_weight=4,colsample_bytree=0.109,
#                     objective='reg:squarederror',seed=20)
# subsample_range = np.arange(0.05,1.05,0.05)
# model5, model_results5, best_subsample, best_score5 = XGBRModelTune(xgb5,'subsample',subsample_range,modelfit=False)

######## Round VI - colsample_bytree ########
# xgb6 = XGBRegressor(learning_rate=0.1,n_estimators=156,max_depth=4,min_child_weight=4,objective='reg:squarederror',seed=20)
# colsample_bytree_range = np.arange(0.10,1.05,0.05)
# model6, model_results6, best_colsample_bytree, best_score6 = XGBRModelTune(xgb6,'colsample_bytree',colsample_bytree_range,
#                                                                           modelfit=False)

######## Round VII - n_estimators *BUST* *NOT USABLE DATA* ########
# xgb7 = XGBRegressor(learning_rate=0.1,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=0.25,
#                     objective='reg:squarederror',seed=20)
# n_estimators_range = range(100,305,5)
# model7, model_results7, best_n_estimators, best_score7 = XGBRModelTune(xgb7,'n_estimators',n_estimators_range,modelfit=False)
#
######## Round VII - n_estimators pt.2 #######
# xgb7 = XGBRegressor(learning_rate=0.1,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=0.25,
#                     objective='reg:squarederror',seed=20)
# n_estimators_range = range(285,301,1)
# model7, model_results7, best_n_estimators, best_score7 = XGBRModelTune(xgb7,'n_estimators',n_estimators_range,
#                                                                        modelfit=False)

######## Round VIII - learning_rate #########
# xgb8 = XGBRegressor(n_estimators=500,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=0.25,
#                    objective='reg:squarederror',seed=20)
# exp_LR = -3*np.random.rand(100)
# learning_rate_range = 10**exp_LR
# model8, model_results8, best_learning_rate, best_score8 = XGBRModelTune(xgb8,'learning_rate',learning_rate_range,
#                                                                        Randomized=True, n_iter=50, modelfit=False)

######## Round VII REDUX - n_estimators ############
# xgb7_redux = XGBRegressor(learning_rate = best_learning_rate['learning_rate'],max_depth=4,min_child_weight=4,
#                           subsample=1,colsample_bytree=0.25,objective='reg:squarederror',seed=20)
# n_estimators_range = range(400,605,5)
# model7, model_results7, best_n_estimators, best_score7 = XGBRModelTune(xgb7_redux, 'n_estimators',n_estimators_range,
#                                                                       modelfit=False)

######## SAVE THE TUNED VARIABLES FROM PART I ########
# pickle_dict1 = {'learning_rate':best_learning_rate['learning_rate'],'n_estimators':595,'max_depth':4,'min_child_weight':4,
#                'subsample':1,'colsample_bytree':0.25}
# f = open('/Users/armenta/Kaggle/Housing Prices/Data Pipeline 2 Saved Variables/DP2_xgb_tuned1_vars.pckl','wb')
# pickle.dump(pickle_dict1,f)
# f.close()

######## OPEN THE TUNED VARIABLES FROM PART I ########
# f = open('/Users/armenta/Kaggle/Housing Prices/Data Pipeline 2 Saved Variables/DP2_xgb_tuned1_vars.pckl','rb')
# pickle_dict1 = pickle.load(f)
# f.close()
# best_lr = pickle_dict1['learning_rate']
# n_est = pickle_dict1['n_estimators']
# max_d = pickle_dict1['max_depth']
# min_cw = pickle_dict1['min_child_weight']
# subsamp = pickle_dict1['subsample']
# colsamp_bytree = pickle_dict1['colsample_bytree']

######## LOOKING AT TRAIN, DEV, AND TEST ERROR #########
# xgb_tuned1 = XGBRegressor(learning_rate = best_lr,n_estimators=n_est,max_depth=max_d,min_child_weight=min_cw,
#                           subsample=subsamp,colsample_bytree=colsamp_bytree,objective='reg:squarederror',seed=20)
#modelfitXGB(xgb_tuned1,strain_X,strain_y) # I ran this to look at the feature importances of this first tuned model.
# xgb_tunedmodel1 = xgb_tuned1.fit(strain_X,strain_y)
# train_pred1 = xgb_tunedmodel1.predict(strain_X)
# train_error = metrics.mean_squared_log_error(strain_y,train_pred1)
# dev_pred1 = xgb_tunedmodel1.predict(sdev_X)
# dev_error = metrics.mean_squared_log_error(sdev_y,dev_pred1)
# print(train_error, dev_error)
# test_pred1 = xgb_tunedmodel1.predict(stest_X)
# SaveFitModels(test_pred1,test_IDs,'xgb_tuned1_11072019.csv')





########################### PART II ############################

########## Round I - reg_lambda #########
# xgb1 = XGBRegressor(learning_rate=best_lr, n_estimators=n_est, max_depth=max_d, min_child_weight=min_cw, 
#                     subsample=subsamp, colsample_bytree=colsamp_bytree, objective='reg:squarederror',seed=20)
# lambda_range = range(1,21,1)
# model1, model_results1, best_lambda, best_score1 = XGBRModelTune(xgb1, 'reg_lambda',lambda_range)
#
# # Testing values between 0.0001 -> 1
########## Round I - reg_lambda pt.2 #########
# xgb1 = XGBRegressor(learning_rate=best_lr, n_estimators=n_est, max_depth=max_d, min_child_weight=min_cw, 
#                      subsample=subsamp, colsample_bytree=colsamp_bytree, objective='reg:squarederror',seed=20)
# lambda_exp = -4*np.random.rand(100) #Going to try out much smaller values.
# lambda_range = 10**lambda_exp
# model1_2, model_results1_2, best_lambda, best_score1_2 = XGBRModelTune(xgb1, 'reg_lambda', lambda_range, 
#                                                                        Randomized = True,n_iter = 70, 
#                                                                        modelfit = False)

########## Round II - reg_alpha ##########
# xgb2 = XGBRegressor(learning_rate=best_lr, n_estimators=n_est, max_depth=max_d, min_child_weight=min_cw, 
#                      subsample=subsamp, colsample_bytree=colsamp_bytree, reg_lambda = 6, objective='reg:squarederror',
#                      seed=20)
# alpha_range = [0.01,0.05,0.1,0.25,0.5,0.75,1,5,10,15,20]
# model2, model_results2, best_alpha, best_score2 = XGBRModelTune(xgb2, 'reg_alpha',alpha_range,modelfit=False)
#
# # Play around with alpha a little more.
# xgb2 = XGBRegressor(learning_rate=best_lr, n_estimators=n_est, max_depth=max_d, min_child_weight=min_cw, 
#                       subsample=subsamp, colsample_bytree=colsamp_bytree, reg_lambda = 6, objective='reg:squarederror',
#                       seed=20)
# alpha_range = range(15,41,1)
# model2, model_results2, best_alpha, best_score2 = XGBRModelTune(xgb2,'reg_alpha',alpha_range)

########## Check Train, Dev, and Test Errors ##########
# xgb_tuned2 = XGBRegressor(learning_rate=best_lr, n_estimators=n_est, max_depth=max_d, min_child_weight=min_cw, 
#                       subsample=subsamp, colsample_bytree=colsamp_bytree, reg_lambda = 6, reg_alpha=17,
#                       objective='reg:squarederror',seed=20)
# train_tuned_error2, train_exp_var2, dev_tuned_error2, dev_exp_var2 = TrainDevTestErrors(xgb_tuned2,'xgb_tuned2_11092019.csv')
# train_tuned_error2, dev_tuned_error2





########################## PART III ############################
# I will use booster = 'dart', and re-tune the parameters to see if it works better.

############ Round I - n_estimators ###########
# xgb_dart1 = XGBRegressor(learning_rate=best_lr, max_depth=max_d, min_child_weight=min_cw, subsample=subsamp, 
#                          colsample_bytree=colsamp_bytree, reg_lambda = 6, reg_alpha=17,objective='reg:squarederror',
#                          booster='dart',seed=20)
# n_est_range = range(250,605,5)
# model_dart1, model_results_dart1, best_n_est, best_score_dart1 = XGBRModelTune(xgb_dart1,'n_estimators',n_est_range,modelfit=False)
#
############ Round I - n_estimators pt.2 ###########
# xgb_dart1 = XGBRegressor(learning_rate=best_lr, max_depth=max_d, min_child_weight=min_cw, subsample=subsamp, 
#                          colsample_bytree=colsamp_bytree, reg_lambda = 6, reg_alpha=17,objective='reg:squarederror',
#                          booster='dart',seed=20)
# n_est_range2 = range(585,651,1)
# model_dart1_2, model_results_dart1_2, best_n_est2, best_score_dart1_2 = XGBRModelTune(xgb_dart1,'n_estimators',n_est_range2,modelfit=False)
#
############ Check the top 10 n_est values on train and dev sets ############
#These are the top 10 n_estimators values.
#Turn this into a small function.
# top_n_est = pd.Series(model_results_dart1_2.loc[model_results_dart1_2.rank_test_score<=10,'param_n_estimators']).reset_index(drop=True)
# XGBTrainDevComparisons(xgb_dart1,'n_estimators',top_n_est)

############ Round II - max_depth ###########
# xgb_dart2 = XGBRegressor(learning_rate=best_lr, n_estimators=650, min_child_weight=4,subsample=1,colsample_bytree=0.25,
#                         reg_lambda=6, reg_alpha=17,objective='reg:squarederror',booster='dart',seed=20)
# max_depth_range = range(3,11,1)
# model_dart2, model_results_dart2, best_max_d2, best_score_dart2 = XGBRModelTune(xgb_dart2,'max_depth',max_depth_range,modelfit=False)
#
# top_max_d = TDComp(model_results_dart2,'param_max_depth',8)
# XGBTrainDevComparisons(xgb_dart2,'max_depth',top_max_d)

############ Round III - min_child_weight ##########
# xgb_dart3 = XGBRegressor(learning_rate=best_lr, n_estimators=650, max_depth=7, subsample=1, colsample_bytree=0.25,
#                         reg_lambda=6, reg_alpha=17, objective='reg:squarederror',booster='dart',seed=20)
# min_cw_range = range(1,16,1)
# model_dart3, model_results_dart3, best_mincw, best_score_dart3 = XGBRModelTune(xgb_dart3, 'min_child_weight',min_cw_range,modelfit=False)
#
# top_mcw = TDComp(model_results_dart3,'param_min_child_weight',7)
# XGBTrainDevComparisons(xgb_dart3,'min_child_weight',top_mcw)

############ Round IV - gamma ##########
# xgb_dart4 = XGBRegressor(learning_rate=best_lr, n_estimators=650, max_depth=7, min_child_weight=14, subsample=1, 
#                          colsample_bytree=0.25,reg_lambda=6, reg_alpha=17, objective='reg:squarederror',
#                          booster='dart',seed=20)
# gamma_range = np.arange(0.0,1.05,0.05)
# model_dart4, model_results_dart4, best_gamma, best_score_dart4 = XGBRModelTune(xgb_dart4, 'gamma', gamma_range, modelfit=False)
#
# top_gamma = TDComp(model_results_dart4,'param_gamma',5)
# XGBTrainDevComparisons(xgb_dart4,'gamma',top_gamma)

############ Round V - subsample ##########
# xgb_dart5 = XGBRegressor(learning_rate=best_lr, n_estimators=650, max_depth=7, min_child_weight=14, colsample_bytree=0.25,
#                          reg_lambda=6, reg_alpha=17, objective='reg:squarederror',booster='dart',seed=20)
# subsample_range = np.arange(0.30,1.05,0.05)
# model_dart5, model_results_dart5, best_subsample, best_score_dart5 = XGBRModelTune(xgb_dart5, 'subsample', subsample_range, modelfit=False)
#
# top_subsample = TDComp(model_results_dart5,'param_subsample',10)
# XGBTrainDevComparisons(xgb_dart5,'subsample',top_subsample)

############ CHECK HOW WE DO ON THE TEST SET NOW ############
# xgb_dart_tuned1 = XGBRegressor(learning_rate=best_lr, n_estimators=650, max_depth=7, min_child_weight=14, subsample=0.85,
#                                colsample_bytree=0.25,reg_lambda=6, reg_alpha=17, objective='reg:squarederror',booster='dart',seed=20)
# train_dart_error1, train_dart_exp_var1, dev_dart_error1, dev_dart_exp_var1 = TrainDevTestErrors(xgb_dart_tuned1,'xgb_dart_tuned1_11132019.csv')
#train_dart_error1, dev_dart_error1

############ Round VI - colsample_bytree ###########
# xgb_dart6 = XGBRegressor(learning_rate=best_lr, n_estimators=650, max_depth=7, min_child_weight=14, subsample=0.85,
#                          reg_lambda=6, reg_alpha=17, objective='reg:squarederror',booster='dart',seed=20)
# colsamp_range = np.arange(0.10,1.05,0.05)
# model_dart6, model_results_dart6, best_colsamp, best_score_dart6 = XGBRModelTune(xgb_dart6,'colsample_bytree',colsamp_range,modelfit=False)
#top_colsamp = TDComp(model_results_dart6,'param_colsample_bytree',10)
#XGBTrainDevComparisons(xgb_dart6,'colsample_bytree',top_colsamp)

############ Round VII - lambda ###########
# xgb_dart7 = XGBRegressor(learning_rate=best_lr, n_estimators=650, max_depth=7, min_child_weight=14, subsample=0.85,
#                          colsample_bytree=0.2,reg_alpha=17, objective='reg:squarederror',booster='dart',seed=20)
# lambda_range = range(1,21,1)
# model_dart7, model_results_dart7, best_lambda, best_score_dart7 = XGBRModelTune(xgb_dart7,'reg_lambda',lambda_range,modelfit=False)
# top_lambda = TDComp(model_results_dart7,'param_reg_lambda',10)
# XGBTrainDevComparisons(xgb_dart7,'reg_lambda',top_lambda)

############ Round VIII - alpha ###########
# xgb_dart8 = XGBRegressor(learning_rate=best_lr, n_estimators=650, max_depth=7, min_child_weight=14, subsample=0.85,
#                          colsample_bytree=0.2, reg_lambda=11, objective='reg:squarederror',booster='dart',seed=20)
# alpha_range = range(1,11,1)
# model_dart8, model_results_dart8, best_alpha, best_score_dart8 = XGBRModelTune(xgb_dart8,'reg_alpha',alpha_range,modelfit=False)
# top_alpha = TDComp(model_results_dart8,'param_reg_alpha',5)
# XGBTrainDevComparisons(xgb_dart8,'reg_alpha',top_alpha)

############ Round IX - learning_rate ##########
# xgb_dart9 = XGBRegressor(n_estimators=650, max_depth=7, min_child_weight=14, subsample=0.85, colsample_bytree=0.2, 
#                          reg_lambda=11, reg_alpha=4, objective='reg:squarederror',booster='dart',seed=20)
# lr_exp = -3*np.random.rand(100)
# learning_rate_range = 10**lr_exp
# model_dart9, model_results_dart9, best_learnrate, best_score_dart9 = XGBRModelTune(xgb_dart9,'learning_rate',learning_rate_range,
#                                                                                    Randomized=True,n_iter=60,modelfit=False)
# top_learning_rate = TDComp(model_results_dart9,'param_learning_rate',20)
# XGBTrainDevComparisons(xgb_dart9,'learning_rate',top_learning_rate)

############ BEST PARAMETERS ############
# lr_dart = 0.02467365
# n_est_dart = 650
# max_d_dart = 7
# min_cw_dart = 14
# subsample_dart = 0.85
# colsamp_dart = 0.2
# lambda_dart = 11
# alpha_dart = 4

############ FINAL TUNED DART MODEL ############
# xgb_dart_tuned2 = XGBRegressor(learning_rate=lr_dart, n_estimators=n_est_dart, max_depth=max_d_dart, min_child_weight=min_cw_dart,
#                               subsample=subsample_dart, colsample_bytree=colsamp_dart, reg_lambda=lambda_dart, reg_alpha=alpha_dart,
#                               objective='reg:squarederror', booster='dart',seed=20)
# modelfitXGB(xgb_dart_tuned1, strain_X, strain_y)
#train_err_dart2, train_exp_var_dart2, dev_err_dart2, dev_exp_var_dart2 = TrainDevTestErrors(xgb_dart_tuned2,'xgb_dart_tuned2_11132019.csv')
#train_err_dart2, dev_err_dart2

############## PLAYING AROUND ###############
#Check the same model on unstandardized data
#This performed better than the regular standardized data set. I wonder how it will perform on the unsplit data?
# xgb_dart_tuned2 = XGBRegressor(learning_rate=lr_dart, n_estimators=n_est_dart, max_depth=max_d_dart, min_child_weight=min_cw_dart,
#                               subsample=subsample_dart, colsample_bytree=colsamp_dart, reg_lambda=lambda_dart, reg_alpha=alpha_dart,
#                               objective='reg:squarederror', booster='dart',seed=20)
# t_err_NS, t_ev_NS, d_err_NS, d_ev_NS = TrainDevTestErrors(xgb_dart_tuned2,'xgb_dart_tuned2_NS_11132019.csv',train_X=train_X3,
#                                                           train_y=train_y3,dev_X=dev_X,dev_y=dev_y,test_X=test_X3)
# t_err_NS, d_err_NS
#
# t_err_NS2, t_ev_NS2, d_err_NS2, d_ev_NS2 = TrainDevTestErrors(xgb_dart_tuned2,'xgb_dart_tuned2_NS2_11132019.csv',train_X=train_X2,
#                                                                train_y=train_y2,test_X=test_X2)
# t_err_NS2

In [23]:
############ XGBRegressor Pt.2 (strain_X2,train_y2,stest_X2) ############
#Ignore FutureWarning messages and set the default figure size used by the plots in this section.
warnings.simplefilter('ignore', FutureWarning)
rcParams['figure.figsize'] = (12, 4) #(width, height) in inches

####################### PART I ######################

############ Round I - n_est ###########
# xgb1 = XGBRegressor(learning_rate=0.1, max_depth=5, min_child_weight=3, subsample=0.6, colsample_bytree=0.109, 
#                          objective='reg:squarederror',seed=7)
# n_est_range = range(100,305,5)
# model1, model_results1, best_n_est, best_score1 = XGBRModelTune(xgb1,'n_estimators',n_est_range)

############ Round II - max_depth ##########
# xgb2 = XGBRegressor(learning_rate=0.1,n_estimators=240,min_child_weight=3,subsample=0.6,colsample_bytree=0.109,
#                    objective='reg:squarederror',seed=7)
# max_d_range = range(3,13,1)
#model2, model_results2, best_max_d, best_score2 = XGBRModelTune(xgb2,'max_depth',max_d_range)

############ Round III - min_child_weight #########
# xgb3 = XGBRegressor(learning_rate=0.1,n_estimators=240,max_depth=4,subsample=0.6,colsample_bytree=0.109,
#                     objective='reg:squarederror',seed=7)
# mcw_range = range(1,11,1)
# model3, model_results3, best_mcw, best_score3 = XGBRModelTune(xgb3,'min_child_weight',mcw_range)

############ Round IV - gamma ###########
# xgb4 = XGBRegressor(learning_rate=0.1,n_estimators=240,max_depth=4,min_child_weight=4,subsample=0.6,
#                     colsample_bytree=0.109,objective='reg:squarederror',seed=7)
# #gamma_range1 = [0,1,5,10,20,50,100]
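# gamma_exp = -5**np.random.rand(100) #NOTE: this evaluates as -(5**x), giving exponents in (-5,-1]; the other rounds multiply instead, e.g. -3*np.random.rand(100)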
# gamma_range2 = 10**gamma_exp
# model4, model_results4, best_gamma, best_score4 = XGBRModelTune(xgb4,'gamma',gamma_range2,Randomized=True,n_iter=50)

############ Round V - subsample ##########
# xgb5 = XGBRegressor(learning_rate=0.1,n_estimators=240,max_depth=4,min_child_weight=4,colsample_bytree=0.109,
#                    objective='reg:squarederror',seed=7)
# subsample_range = np.arange(0.1,1.05,0.05)
# model5, model_results5, best_subsample, best_score5 = XGBRModelTune(xgb5,'subsample',subsample_range)

############ Round VI - colsample_bytree ###########
# xgb6 = XGBRegressor(learning_rate=0.1,n_estimators=240,max_depth=4,min_child_weight=4,subsample=1,
#                     objective='reg:squarederror',seed=7)
# colsamp_range = np.arange(0.05,1.05,0.05)
# model6, model_results6, best_colsamp, best_score6 = XGBRModelTune(xgb6,'colsample_bytree',colsamp_range)

############ Round VII - lambda ##########
# xgb7 = XGBRegressor(learning_rate=0.1,n_estimators=240,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=1,
#                     objective='reg:squarederror',seed=7)
# lambda_range = range(1,11,1)
# # lamb_exp = -3*np.random.rand(100)
# # lambda_range2 = 10**lamb_exp
# # model7, model_results7, best_lambda, best_score7 = XGBRModelTune(xgb7,'reg_lambda',lambda_range2,Randomized=True,n_iter=60)
# model7_2, model_results7_2, best_lambda2, best_score7_2 = XGBRModelTune(xgb7,'reg_lambda',lambda_range)
# # lamb = 0.2897271836971993

############ Round VIII - alpha ##########
# xgb8 = XGBRegressor(learning_rate=0.1,n_estimators=240,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=1,
#                      objective='reg:squarederror',seed=7)
# alpha_range = [0.01,0.05,0.1,0.25,0.5,0.75,1,5,10,15,20]
# alpha_range2 = range(5,16,1)
# model8, model_results8, best_alpha, best_score8 = XGBRModelTune(xgb8,'reg_alpha',alpha_range2)

############ Round IX - learning_rate ###########
# xgb9 = XGBRegressor(n_estimators=400,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=1,reg_alpha=11,
#                      objective='reg:squarederror',seed=7)
# lr_exp = -3*np.random.rand(100)
# lr_range = 10**lr_exp
# model9, model_results9, best_lr, best_score9 = XGBRModelTune(xgb9,'learning_rate',lr_range,Randomized=True,n_iter=70)
# #lr_val = 0.057559203317644844 #for n_estimators = 240
# lr_val = 0.04097649273411524 #for n_estimators = 400

############ SAVE TUNED MODEL ############
# lr_val = 0.04097649273411524 #for n_estimators = 400
# xgb_tuned = XGBRegressor(learning_rate=lr_val,n_estimators=400,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=1,
#                          reg_alpha=11,objective='reg:squarederror',seed=7)
# modelfitXGB(xgb_tuned,strain_X2,train_y2)
# train_err1, train_exp_var1 = TrainTestErrors(xgb_tuned, 'xgb_tuned_strain_X2_11142019.csv')
# train_err1
# train_err2, train_exp_var2 = TrainTestErrors(xgb_tuned,'xgb_tuned_train_X2_11142019.csv',
#                                              train_X=train_X2,train_y=train_y2,test_X=test_X2)
# train_err2




######################## PART II #####################

########## Round I - learning_rate ##############
# xgbII_1 = XGBRegressor(n_estimators=1000,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=1,
#                           reg_alpha=11,objective='reg:squarederror',seed=7)
# lr_exp = -3*np.random.rand(100)
# lr_range = 10**lr_exp
# modelII_1, model_resultsII_1, best_lrII, best_scoreII_1 = XGBRModelTune(xgbII_1,'learning_rate',lr_range,Randomized=True,n_iter=70)

# Learning rate used by the (commented-out) PART II and PART III models below, which pass
# learning_rate=best_lr. Hard-coded so this cell does not need to re-run a search.
# NOTE(review): presumably the best value from the Round I randomized search above,
# although that search stores its result in best_lrII -- TODO confirm.
best_lr = 0.02156731032878086
########## Round II - n_estimators ##############
# xgbII_2 = XGBRegressor(learning_rate=best_lr,max_depth=4,min_child_weight=4,subsample=1,colsample_bytree=1,
#                             reg_alpha=11,objective='reg:squarederror',seed=7)
# n_est_range = range(300,1010,10)
# n_est_range2 = range(1000,1510,10)
# modelII_2, model_resultsII_2, best_n_est, best_scoreII_2 = XGBRModelTune(xgbII_2,'n_estimators',n_est_range2)

########## Round III - max_depth ##############
# xgbII_3 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,min_child_weight=4,subsample=1,colsample_bytree=1,
#                        reg_alpha=11,objective='reg:squarederror',seed=7)
# max_d_range = range(3,10,1)
# modelII_3, model_resultsII_3, best_max_d, best_scoreII_3 = XGBRModelTune(xgbII_3,'max_depth',max_d_range)
# model_resultsII_3

########## TEST OUT MULTIPLE MAX_DEPTH VALUES ON THE TEST SET #########
# xgbII_MD4 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,min_child_weight=4,max_depth=4,subsample=1,
#                          colsample_bytree=1,reg_alpha=11,objective='reg:squarederror',seed=7)
# train_err3, train_exp_var3 = TrainTestErrors(xgbII_MD4,'xgbII_MD4_11142019.csv')
# train_err3
# xgbII_MD6 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,min_child_weight=4,max_depth=6,subsample=1,
#                          colsample_bytree=1,reg_alpha=11,objective='reg:squarederror',seed=7)
# train_err4, train_exp_var4 = TrainTestErrors(xgbII_MD6,'xgbII_MD6_11142019.csv')
# train_err4
# xgbII_MD8 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,min_child_weight=4,max_depth=8,subsample=1,
#                           colsample_bytree=1,reg_alpha=11,objective='reg:squarederror',seed=7)
# train_err5, train_exp_var5 = TrainTestErrors(xgbII_MD8, 'xgbII_MD8_11142019.csv')
# train_err5

########## Round IV - reg_lambda ##########
# xgbII_4 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,subsample=1,colsample_bytree=1,
#                         reg_alpha=11,objective='reg:squarederror',seed=7)
# lambda_range = range(1,22,1)
# modelII_4, model_resultsII_4, best_lambda, best_scoreII_4 = XGBRModelTune(xgbII_4,'reg_lambda',lambda_range)

########## CHECK OUT THE NEW MODEL ON TEST SET ###########
# xgbII_L6 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,min_child_weight=4,max_depth=4,subsample=1,
#                           colsample_bytree=1,reg_alpha=11,reg_lambda=6,objective='reg:squarederror',seed=7)
# train_err6, train_exp_var6 = TrainTestErrors(xgbII_L6,'xgbII_L6_11142019.csv')
# train_err6
# xgbII_L12 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,min_child_weight=4,max_depth=4,subsample=1,
#                            colsample_bytree=1,reg_alpha=11,reg_lambda=12,objective='reg:squarederror',seed=7)
# train_err7, train_exp_var7 = TrainTestErrors(xgbII_L12,'xgbII_L12_11142019.csv')
# train_err7

########## Round V - colsample_bytree #########
# xgbII_5 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,subsample=1,
#                          reg_lambda=12,reg_alpha=11,objective='reg:squarederror',seed=7)
# colsamp_range = np.arange(0.01,1.01,0.01)
# modelII_5, model_resultsII_5, best_colsamp, best_scoreII_5 = XGBRModelTune(xgbII_5,'colsample_bytree',colsamp_range)

########## Round VI - subsample ##########
# xgbII_6 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,colsample_bytree=0.29,
#                        reg_lambda=12,reg_alpha=11,objective='reg:squarederror',seed=7)
# subsample_range = np.arange(0.1,1.01,0.01)
# modelII_6, model_resultsII_6, best_subsample, best_scoreII_6 = XGBRModelTune(xgbII_6,'subsample',subsample_range)
# model_resultsII_6

########### COMPARE SUBSAMPLE VALUES ON TEST SET ##########
# xgbII_SS1 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,colsample_bytree=0.29,
#                          subsample=1,reg_lambda=12,reg_alpha=11,objective='reg:squarederror',seed=7)
# train_err8, train_exp_var8 = TrainTestErrors(xgbII_SS1,'xgbII_SS1_11142019.csv')
# train_err8
# xgbII_SS66 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,colsample_bytree=0.29,
#                           subsample=0.66,reg_lambda=12,reg_alpha=11,objective='reg:squarederror',seed=7)
# train_err9, train_exp_var9 = TrainTestErrors(xgbII_SS66,'xgbII_SS66_11142019.csv')
# train_err9
# xgbII_SS50 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,colsample_bytree=0.29,
#                            subsample=0.5,reg_lambda=12,reg_alpha=11,objective='reg:squarederror',seed=7)
# train_err10, train_exp_var10 = TrainTestErrors(xgbII_SS50,'xgbII_SS50_11142019.csv')
# train_err10

########## Round VII - reg_alpha ###########
# xgbII_7 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,colsample_bytree=0.29,
#                         subsample=0.66, reg_lambda=12,objective='reg:squarederror',seed=7)
# alpha_range = range(0,26,1)
# modelII_7, model_resultsII_7, best_alpha, best_scoreII_7 = XGBRModelTune(xgbII_7,'reg_alpha',alpha_range)
# model_resultsII_7

########## COMPARE ALPHA VALUES ON TEST SET ###########
# xgbII_A8 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,colsample_bytree=0.29,
#                         subsample=0.66, reg_alpha=8, reg_lambda=12,objective='reg:squarederror',seed=7)
# train_err11, train_exp_var11 = TrainTestErrors(xgbII_A8,'xgbII_A8_11162019.csv')
# train_err11
# xgbII_A5 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,colsample_bytree=0.29,
#                          subsample=0.66, reg_alpha=5, reg_lambda=12,objective='reg:squarederror',seed=7)
# train_err12, train_exp_var12 = TrainTestErrors(xgbII_A5, 'xgbII_A5_11162019.csv')
# train_err12
# xgbII_A13 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,min_child_weight=4,colsample_bytree=0.29,
#                          subsample=0.66, reg_alpha=13, reg_lambda=12,objective='reg:squarederror',seed=7)
# train_err13, train_exp_var13 = TrainTestErrors(xgbII_A13, 'xgbII_A13_11162019.csv')
# train_err13

########## Round VIII - min_child_weights ##########
# xgbII_8 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,colsample_bytree=0.29,
#                          subsample=0.66, reg_lambda=12,reg_alpha=11, objective='reg:squarederror',seed=7)
# mcw_range = range(1,16,1)
# modelII_8, model_resultsII_8, best_mcw, best_scoreII_8 = XGBRModelTune(xgbII_8,'min_child_weight',mcw_range)
# model_resultsII_8

########## COMPARE MCW VALUES ON TEST SET ##########
# xgbII_MCW1 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,colsample_bytree=0.29,min_child_weight=1,
#                           subsample=0.66, reg_lambda=12,reg_alpha=11, objective='reg:squarederror',seed=7)
# train_err14, train_exp_var14 = TrainTestErrors(xgbII_MCW1, 'xgbII_MCW1_11162019.csv')
# train_err14
# xgbII_MCW5 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,colsample_bytree=0.29,min_child_weight=5,
#                           subsample=0.66, reg_lambda=12,reg_alpha=11, objective='reg:squarederror',seed=7)
# train_err15, train_exp_var15 = TrainTestErrors(xgbII_MCW5, 'xgbII_MCW5_11162019.csv')
# train_err15
# xgbII_MCW8 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,colsample_bytree=0.29,min_child_weight=8,
#                            subsample=0.66, reg_lambda=12,reg_alpha=11, objective='reg:squarederror',seed=7)
# train_err16, train_exp_var16 = TrainTestErrors(xgbII_MCW8, 'xgbII_MCW8_11162019.csv')
# train_err16
# xgbII_MCW3 =  XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,colsample_bytree=0.29,min_child_weight=3,
#                             subsample=0.66, reg_lambda=12,reg_alpha=11, objective='reg:squarederror',seed=7)
# train_err17, train_exp_var17 = TrainTestErrors(xgbII_MCW3, 'xgbII_MCW3_11162019.csv')
# train_err17
# xgbII_MCW6 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,colsample_bytree=0.29,min_child_weight=6,
#                              subsample=0.66, reg_lambda=12,reg_alpha=11, objective='reg:squarederror',seed=7)
# train_err18, train_exp_var18 = TrainTestErrors(xgbII_MCW6, 'xgbII_MCW6_11162019.csv')
# train_err18





######################### PART III - dart ######################

########### Round I - n_estimators #############
# xgbII_dart1 = XGBRegressor(learning_rate=best_lr,n_estimators=1280,max_depth=6,colsample_bytree=0.29,min_child_weight=5,
#                           subsample=0.66, reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# n_est_range = range(400,1005,5)
# n_est_range2 = range(850,1305,5)
# modelII_dart1, model_resultsII_dart1, best_n_est, best_scoreII_dart1 = XGBRModelTune(xgbII_dart1,'n_estimators',n_est_range2)

########### Round II - max_depth ################
# xgbII_dart2 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,colsample_bytree=0.29,min_child_weight=5,
#                            subsample=0.66, reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# max_d_range = range(3,11,1)
# modelII_dart2, model_resultsII_dart2, best_max_d, best_scoreII_dart2 = XGBRModelTune(xgbII_dart2,'max_depth',max_d_range)
# model_resultsII_dart2

########### CHECK MODEL ON TEST SET ###########
# xgbII_dartMD5 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=5,colsample_bytree=0.29,min_child_weight=5,
#                              subsample=0.66, reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err19, train_exp_var19 = TrainTestErrors(xgbII_dartMD5, 'xgbII_dartMD5_11172019.csv')
# train_err19
# xgbII_dartMD6 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=6,colsample_bytree=0.29,min_child_weight=5,
#                               subsample=0.66, reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err20, train_exp_var20 = TrainTestErrors(xgbII_dartMD6, 'xgbII_dartMD6_11172019.csv')
# train_err20
# xgbII_dartMD4 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=4,colsample_bytree=0.29,min_child_weight=5,
#                                subsample=0.66, reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err21, train_exp_var21 = TrainTestErrors(xgbII_dartMD4, 'xgbII_dartMD4_11172019.csv')
# train_err21

########### Round III - min_child_weight ##############
# xgbII_dart3 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,colsample_bytree=0.29,subsample=0.66, 
#                            reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# mcw_range = range(1,11,1)
# modelII_dart3, model_resultsII_dart3, best_mcw, best_scoreII_dart3 = XGBRModelTune(xgbII_dart3, 'min_child_weight', mcw_range)
# model_resultsII_dart3

########### CHECK MODEL ON TEST SET ###########
# xgbII_dartMCW1 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=4,colsample_bytree=0.29,min_child_weight=1,
#                               subsample=0.66, reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err22, train_exp_var22 = TrainTestErrors(xgbII_dartMCW1, 'xgbII_dartMCW1_11182019.csv')
# train_err22
# xgbII_dartMCW3 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=4,colsample_bytree=0.29,min_child_weight=3,
#                               subsample=0.66, reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err23, train_exp_var23 = TrainTestErrors(xgbII_dartMCW3, 'xgbII_dartMCW3_11182019.csv')
# train_err23
# xgbII_dartMCW2 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=4,colsample_bytree=0.29,min_child_weight=2,
#                               subsample=0.66, reg_lambda=12,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err24, train_exp_var24 = TrainTestErrors(xgbII_dartMCW2, 'xgbII_dartMCW2_11182019.csv')
# train_err24

########### Round IV - lambda ###########
# xgbII_dart4 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,colsample_bytree=0.29,subsample=0.66,min_child_weight=1, 
#                            reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# lambda_range = range(0,16,1)
# lambda_range2 = range(15,26,1)
# modelII_dart4, model_resultsII_dart4, best_lambda, best_scoreII_dart4 = XGBRModelTune(xgbII_dart4, 'reg_lambda', lambda_range2)
# model_resultsII_dart4

########### CHECK MODEL ON TEST SET ###########
# xgbII_dartL2 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=4,colsample_bytree=0.29,min_child_weight=1,
#                                subsample=0.66, reg_lambda=2,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err25, train_exp_var25 = TrainTestErrors(xgbII_dartL2, 'xgbII_dartL2_11182019.csv')
# train_err25
# xgbII_dartL8 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=4,colsample_bytree=0.29,min_child_weight=1,
#                                subsample=0.66, reg_lambda=8,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err26, train_exp_var26 = TrainTestErrors(xgbII_dartL8, 'xgbII_dartL8_11182019.csv')
# train_err26
# xgbII_dartL14 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=4,colsample_bytree=0.29,min_child_weight=1,
#                                subsample=0.66, reg_lambda=14,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err27, train_exp_var27 = TrainTestErrors(xgbII_dartL14, 'xgbII_dartL14_11182019.csv')
# train_err27
# xgbII_dartL17 = XGBRegressor(learning_rate=best_lr,n_estimators=1030,max_depth=4,colsample_bytree=0.29,min_child_weight=1,
#                                 subsample=0.66, reg_lambda=17,reg_alpha=11, booster='dart',objective='reg:squarederror',seed=7)
# train_err28, train_exp_var28 = TrainTestErrors(xgbII_dartL17, 'xgbII_dartL17_11182019.csv')
# train_err28


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_lambda,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,21.934918,0.126686,0.021812,0.000652,15,{'reg_lambda': 15},-0.014262,-0.020268,-0.014986,-0.016506,0.002677,1
1,24.52633,1.763654,0.028285,0.005643,16,{'reg_lambda': 16},-0.014388,-0.020473,-0.015018,-0.016626,0.002732,4
2,26.456749,0.72029,0.028786,0.004045,17,{'reg_lambda': 17},-0.014265,-0.020254,-0.015479,-0.016666,0.002585,9
3,27.306814,0.270677,0.024281,0.001487,18,{'reg_lambda': 18},-0.014181,-0.020237,-0.015551,-0.016656,0.002593,8
4,26.357809,0.092393,0.023848,0.000196,19,{'reg_lambda': 19},-0.014608,-0.019852,-0.015502,-0.016654,0.002291,6
5,26.227064,0.340867,0.021986,0.000652,20,{'reg_lambda': 20},-0.014452,-0.019817,-0.015463,-0.016577,0.002328,2
6,26.458188,0.205551,0.02508,0.004626,21,{'reg_lambda': 21},-0.01429,-0.020034,-0.015459,-0.016595,0.002479,3
7,27.964081,0.621432,0.032292,0.004307,22,{'reg_lambda': 22},-0.014091,-0.020045,-0.015832,-0.016656,0.002499,7
8,33.301161,0.192749,0.032615,0.006115,23,{'reg_lambda': 23},-0.014242,-0.020479,-0.015687,-0.016803,0.002665,11
9,35.236221,0.973561,0.049463,0.016523,24,{'reg_lambda': 24},-0.014153,-0.020225,-0.015887,-0.016755,0.002554,10


In [None]:
############ XGBRegressor Pt.3 (train_X2,train_y2,test_X2) ############

In [16]:
############ K-NEAREST NEIGHBOR REGRESSION ###############


In [29]:
########## CODE THAT IS NOT BEING USED ############


########### TRYING TO FIGURE OUT WHY RIDGE REGRESSION DOESN'T ACCEPT THE DATA & SCORE WITH NEG_MEAN_LOG_ERROR ###########
# list_valcounts = []
# for cols in train_X2.columns:
#     list_valcounts.append(train_X2[cols].value_counts())

#list_valcounts[0]
#train_X2.lt(0).sum()
#train_X2.columns
#train_y2.lt(0).sum()
# np.sum(train_y2 > 0)
# len(train_y2)



####### THIS PIECE OF CODE ONLY WORKS FOR A SINGLE DECISION TREE
# dotData = StringIO()
# rfr_tuned.fit(train_X2,train_y2)
# export_graphviz(rfr_tuned,out_file=dotData,filled=True,
#                 rounded=True,special_characters=True)

# graph_rfr4 = pydotplus.graph_from_dot_data(dotData.getvalue())
# Image(graph.create_png())

########### I am going to play around a little with the XGB package

# #Parameter dictionary for XGB
# param_dict = {'learning_rate':0.1,'n_estimators':500,'max_depth':5,'min_child_weight':1,
#               'gamma':0.1,'subsample':0.6,'colsample_bytree':0.6,'reg_lambda':0.1,'reg_alpha':0.1}
# #The model 
# basic_xgb = XGBRegressor(param_dict)
# #Use a DMatrix so we can do the Cross-Validation
# Dtrain = xgb.DMatrix(train_X2.values,label=train_y2.values)

# #Don't bother with this because this only conducts a single cross validation test 
# #on a single set of variables. This may be optimal when you are looking for the value
# #of n_estimators but is not that valuable in general.
# #Cross validation test.
# cv_test = xgb.cv(param_dict,Dtrain,num_boost_round=basic_xgb.get_params()['n_estimators'],nfold=10,
#                  metrics='rmse',early_stopping_rounds=50)


# A function to help tune XGBoost models.
# def XGBRModelTune(train_X,train_y,test_X,learning_rate=0.1,n_estimators=500,max_depth=5,min_child_weight=1,gamma=0.1,
#                  subsample=0.6,colsample_bytree=0.6,objective='reg:squarederror',booster='gbtree',reg_lambda=0.1,reg_alpha=0,
#                  cv_num=5,Randomized=False,n_jobs=0,order=range(0,9,1),num_rounds=3):
#     #################################################### FUNCTION DESCRIPTION ##############################################
#     #################################################### BASIC INTRODUCTION ################################################
#     #This function is used to tune hyperparameters for XGBoost models. There are a total of approximately
#     #11 parameters to change in XGBoost, but only 9 of them are tuned in this function. 
#     #The only 2 that are not being tuned: objective and booster. You can change these in the definition of the function,
#     #but they will not be tuned in the function itself.
#     #################################################### VARIABLE DEFINITIONS ##############################################
#     #***train_X = the training data (features only).  
#     #***train_y = the training data (target only).
#     #***test_X = the testing data (features only).
    
#     #***learning_rate = the learning rate of the XGBRegressor algorithm.
#     #***n_estimators = the number of trees to use in this ensemble model. 
#     #***max_depth = maximum depth allowed for an individual tree.
#     #***min_child_weight = minimum sum of instance weights required in a child node; basically a variable that describes the number of 
#     # observations that must be present in each child node. The higher the value, the more observations that are required in each node.
#     #***gamma = A value that defines the minimum positive reduction in the loss function that must occur for a node to split.
#     #***subsample = A value that denotes the fraction of training samples (rows) randomly drawn to grow each tree.
#     #***colsample_bytree = A value that determines the % of columns to be used for each tree.
#     #***objective = The loss function to be minimized.
#     #***booster = The type of model that we run at each iteration. Can choose gbtree (tree-based models), gblinear (linear models),
#     # or dart, which is similar to gbtree but applies the dropout technique from deep neural networks.
#     #***reg_lambda = L2 regularization term on weights. Used to handle the main regularization part of XGBoost.
#     #***reg_alpha = L1 regularization term on weights.
#     #***cv_num = The number of cross-validation folds that will be used in the parameter search process.
#     #***Randomized = A boolean value that decides if the first search you do for parameter searches is randomized or not.
#     #***n_iter = A number that is only used if Randomized is true. It determines how many parameter settings 
#     # RandomizedSearchCV samples and evaluates before it stops testing random values of a variable.

#Practice with dictionary objects 
# dict_test = {'key1':0,'key2':1}
# keys = list(dict_test.keys())
# keys

#Parameter dictionary for XGB
#This did not work at all.
# param_dict = {'learning_rate':0.1,'n_estimators':500,'max_depth':5,'min_child_weight':1,
#               'gamma':0.1,'subsample':0.6,'colsample_bytree':0.6,'reg_lambda':0.1,'reg_alpha':0.1}

#The model 
# basic_xgb = XGBRegressor(learning_rate=0.1,n_estimators=500,max_depth=5,min_child_weight=1,gamma=0.1,subsample=0.6,
#                         colsample_bytree=0.6,reg_lambda=0.1,reg_alpha=0.1,random_state=5)
# #Use a DMatrix so we can do the Cross-Validation
# Dtrain = xgb.DMatrix(train_X2.values,label=train_y2.values)

# #get_params() gets us all the parameters defined in the XGBRegressor
# p1 = basic_xgb.get_params() #This shows us 3 extra variables: n_jobs, random_state, silent
# kp1 = list(p1.keys())

# #These are xgb parameters. This is the better function to use.
# p2 = basic_xgb.get_xgb_params()
# kp2 = list(p2.keys())

# for key in kp1:
#     if key not in p2: print(key)



#############Untuned model
# xgb1 = XGBRegressor(learning_rate=0.1,n_estimators=1000,max_depth=6,min_child_weight=1,gamma=0,subsample=0.6,
#                    colsample_bytree=0.1,scale_pos_weight=1,seed=13,objective='reg:squarederror')
# rcParams['figure.figsize'] = 12, 4 #width x height in inches
# modelfitXGB(xgb1,train_X2,train_y2)


###### THIS RAN FINE!!!!
# XGB_param_test1 = {'max_depth':range(2,13,2),'min_child_weight':range(1,6,1)}
# XGBgsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=300,gamma=0,subsample=0.6,colsample_bytree=0.1,
#                                                   objective='reg:squarederror',scale_pos_weight=1,seed=13),param_grid=XGB_param_test1,
#                           scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch1.fit(train_X2,train_y2)

# THIS RAN FINE AS WELL!!
# XGB_param_test1 = {'max_depth':range(2,13,2),'min_child_weight':range(1,6,1)}
# XGBgsearch1 = RandomizedSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=300,gamma=0,subsample=0.6,colsample_bytree=0.1,
#                                                   objective='reg:squarederror',scale_pos_weight=1,seed=13),param_distributions=XGB_param_test1,
#                           scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch1.fit(train_X2,train_y2)
# It turns out the reason was I did not define train_X, train_y, or test_X in the function.


# # We must first split the training set into the training and dev set before we use .fit_transform on our training data.
# train_X2_Standard = standardize_scaler.fit_transform(train_X2)
# test_X2_Standard = standardize_scaler.transform(test_X2)






#Random Forest Regressor model fit function
# def RFRmodelfitCV(alg, train_X, train_y, performCV=True, printFeatureImportance=True, cv_folds=5):
#     #Fit the algorithm on the data
#     alg.fit(train_X,train_y)
    
#     #Predict on the training set
#     train_predictions = alg.predict(train_X)
    
#     #Perform cross-validation
#     if performCV:
#         cv_score = cross_val_score(alg, train_X, train_y, cv = cv_folds, scoring='neg_mean_squared_log_error')
        
#     #Print the model report
#     print("\nModel Report")
#     print("Mean Squared Log Error : %.4g" % metrics.mean_squared_log_error(train_y, train_predictions))
#     print("Explained Variance Score : %.4g" % metrics.explained_variance_score(train_y, train_predictions)) #1.0 is the best value
    
#     if performCV:
#         #print('CV Score: %s'% cv_score)
#         print("CV Scores \nMean : %.7g | Std : %.7g | Min : %.7g | Max : %.7g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))
    
#     #Print Feature Importance
#     if printFeatureImportance:
#         feat_imp = pd.Series(alg.feature_importances_,train_X.columns).sort_values(ascending=False)[0:30]
#         feat_imp.plot(kind='bar', title = 'Feature Importances')
#         plt.ylabel('Feature Importance Score')
#         #print(feat_imp) #I may add this


#I will come back to this (11/8/19)
#Trying to troubleshoot how to fix the modelfit portion of XGBRModelTune
#This works here but it's not working in my function!?
# param = 'n_estimators'
# p_val = 294
# p_dict = {param:p_val}
# #THIS WORKS!!!!!!!
# xgb7.set_params(**p_dict)
# modelfitXGB(xgb7,strain_X,strain_y,cv_folds=3)


#FROM MY FUNCTION
# p_dict = {xgb_param:best_param_val} #this didn't work because best_param_val is a dict not just a value!!!!!!
# p_dict = {xgb_param:best_param_val[xgb_param]} #this should work!!!!!
# xgb_alg.set_params(**p_dict)
# modelfitXGB(xgb_alg,train_X,train_y,cv_folds=cv_num)



# #Doesn't work
# string_style = param + '=' + str(p_val)
# e_ss = exec(string_style)

#This doesn't change the value
# xgb7.get_params()[param] = p_val
# xgb7.get_params()


# xgb7.set_params(n_estimators=p_val)
# xgb7.get_params()

#This segment works to change the dictionary values.
# xgb_params = xgb7.get_params()
# xgb_params[param] = p_val
# xgb_params
# #However, this segment returns an error
# xgb7.set_params(xgb_params)



#     title1 = str(metric) + 'Vs' + xgb_param + "Training Errors"
#     xaxis1 = xgb_param
#     yaxis1 = str(metric)
    
#     fig1.update_layout(
#     title=title1,
#     xaxis_title=xaxis1,
#     yaxis_title=yaxis1,
#     font=dict(
#         family="Courier New, monospace",
#         size=12,
#         color="#7f7f7f"
#         )
#     )


#This is a good way to combine lists together into a dataframe.
# list1 = [1,1,2,3,5,8,13]
# list2 = [0,1,2,3,4,5,6]
# col1 = '1'


# df0 = pd.DataFrame(data=list(zip(list1,list2)),columns=[col1,'2'])
# df0.columns[0]

# #Great strategy to combine lists so that we can plot them with px.scatter
# #Create the first plot - training errors              
#     df1 = pd.DataFrame(data=list(zip(xgb_parameter_values,train_error_arr)),columns=[xgb_param,'Training Error'])
#     fig1 = px.scatter(df1, x=df1.columns[0], y='Training Error', color='Training Error')

#     #Create the second plot - dev errors
#     df2 = pd.DataFrame(data=list(zip(xgb_parameter_values,dev_error_arr)),columns=[xgb_param,'Dev Error'])
#     fig2 = px.scatter(df2, x=df2.columns[0], y='Dev Error', color='Dev Error')


### TESTING THE FUNCTION I JUST MADE ###

# #Create a basic XGBRegressor model
# basic_xgb = XGBRegressor(objective='reg:squarederror',seed=7,n_estimators=500) #Make sure objective = 'reg:squarederror'
# #The parameter we will tune
# parameter_tune1 = 'learning_rate'
# #We create an exponent like this to get an even distribution between -4 -> 0
# exp_LR = -4*np.random.rand(100)
# #We raise 10 to the power of this exponent array, so that our distribution is just as likely to pick values
# #between 0.001 - 0.01 as it is to pick values between 0.1 - 1.
# parameter_tune_vals1 = 10**exp_LR

# #Now we run the program to make sure that the program runs well.
# model1, model_results, best_lr, best_score = XGBRModelTune(xgb_alg = basic_xgb, xgb_param = parameter_tune1, xgb_param_vals = parameter_tune_vals1,
#                              Randomized=True, n_iter = len(parameter_tune_vals1)-50)

# # #EVERYTHING IS GOOD!!!