In [1]:
############# BASIC PACKAGES TO IMPORT ############
import os
import pandas as pd #To allow us to work with dataframes
import numpy as np #To allow us to make mathematical transformations
import matplotlib.mlab as mlab #To create plots
import matplotlib.pylab as plt #To create plots
%matplotlib inline 
from matplotlib.pylab import rcParams
from matplotlib.colors import ListedColormap
rcParams['figure.figsize'] = 12, 4 #width x height in inches
from mpl_toolkits import mplot3d
import category_encoders as ce #To encode our nominal and categorical variables
from sklearn import preprocessing, metrics #This module can be helpful when processing data
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import uniform, chi2_contingency, chisquare
import pickle
import plotly.express as px
import pylab as py
import warnings

import plotly.express as px
import plotly.graph_objects as go
#import scipy as sp #To play with scikit-learn.

In [2]:
############# Models to import #############
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import DistanceMetric
import sklearn.neighbors
from sklearn import neighbors
from sklearn import decomposition


import xgboost as xgb
from xgboost.sklearn import XGBRegressor
#from xgboost import XGBClassifier
import sys 
!{sys.executable} -m pip install xgboost


#Display every column and every row of a DataFrame (None = no limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None
#updating xgboost and scipy to get rid of an error (9/18/19)
#RUN IN TERMINAL
# pip install --upgrade pip
# pip install --upgrade xgboost 
# pip install --upgrade scipy
# pip install --upgrade scikit-learn
# pip install --upgrade plotly
# pip install --upgrade pydotplus 
# pip install --upgrade graphviz
# Use 'brew' instead of pip for updates to get the right packages on your computer.


#I don't believe that we need these if we import the entire sklearn library. I will leave them commented out 
#until it's time to start training models.
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score



In [3]:
################### PROCESS THE DATA ##########################

######### LOADING THE DATASET #########
#Read the train and test files from the current working directory.
train_house, test_house = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

######## SPLIT FEATURES AND TARGET ########
#For a quick summary of the training data:
#train_house.describe()
#Separate the training set into the features (train_X) and the target variable (train_y).
train_X = train_house.drop(columns='SalePrice')
train_y = train_house['SalePrice']

#The test set has no target to compare against; predictions are submitted on Kaggle.
test_X = test_house
test_IDs = test_X['Id']

#Let the notebook display wide/long frames in full.
pd.set_option('display.max_rows', 4000)
pd.options.display.max_columns = None

#Look at the column names
#train_X.columns
#Verified that all the column names are the names on the sheet of paper 
#test_X.shape  (1459,80)
#train_X.shape (1460, 80)

#Look at what the values look like for each column
#train_X.MiscVal
#train_X.OverallCond.dtypes

#LETS LOOK AT HISTS OF VARS IF WE NEED TO.
#NUMERICAL VARIABLES 
#curr_col = train_house.LotFrontage
#curr_col.hist(bins=25)
#curr_col.value_counts()

#CAT VARIABLES
# curr_col = train_X.BsmtExposure
# curr_col.value_counts().plot(kind='bar')
# curr_col.value_counts()

# #Allows us to look at the unique values and match up with the sheet
# curr_col.unique()
# curr_col.describe()

# curr_var = train_house.loc[train_house.Neighborhood == neighborhood[24],'SalePrice']
# curr_var.hist(bins=25)
# curr_var.describe()

#Checking out the null values for each column where there are null values.
#inds = train_house.BsmtQual.loc[pd.isnull(train_house.BsmtQual)]
#inds = inds.index
#train_house.loc[inds,]


######## HANDLING MISSING VALUES <ONLY RUN ONCE>

#THESE WERE HANDLED FROM THE TRAIN_X DATASET.
#LotFrontage = 484
#Alley = 2709
#MasVnrType = 24 
#MasVnrArea = 23
#BsmtQual = 76
#BsmtCond = 77
#BsmtExposure = 76
#BsmtFinType1 = 74
#BsmtFinType2 = 74
#FireplaceQu = 1412
#GarageType = 156
#GarageYrBlt = 157
#GarageFinish = 157
#GarageQual = 157
#GarageCond = 157
#PoolQC = 2896
#Fence = 2337
#MiscFeature = 2802

#THESE ARE NEW FEATURES WITH MISSING VALUES ADDED FROM TEST_X DATASET (EXCEPT ELECTRICAL).
#THESE WILL ALL BE REMOVED (12 ROWS IN TOTAL)
#MSZoning = 4, Utilities = 2, Exterior1st = 1, Exterior2nd = 1, BsmtFinSF1 = 1, BsmtFinSF2 = 1, BsmtUnfSF = 1, TotalBsmtSF = 1, 
#BsmtFullBath = 2, BsmtHalfBath = 2, KitchenQual = 1, Functional = 2, GarageCars = 1, GarageArea = 1, SaleType = 1, Electrical = 1

#Three training rows found during preliminary research are discarded (positions in train_X):
#  332  -> BsmtFinType2 is NaN although the other basement variables are filled in.
#  948  -> BsmtExposure is NaN although the other basement variables are filled in.
#  1379 -> Electrical is NaN.
#Only training rows may be thrown away: Kaggle requires all 1459 test rows to be present,
#so NaN's in the test set are filled with placeholder values instead of being dropped.
train_X, train_y = (
    frame.drop(index=frame.index[[332, 948, 1379]]).reset_index(drop=True)
    for frame in (train_X, train_y)
)

#Keep the Id columns so the train and test rows can be pulled back out of combined_df later.
train_IDs = train_X['Id']
test_IDs = test_X['Id']


#Stack the training features on top of the test features (fresh 0..n-1 index) so that
#both sets go through exactly the same cleaning and encoding steps.
combined_df = pd.concat([train_X, test_X], ignore_index=True)


#Replace missing values column by column. The columns are grouped by the placeholder they receive.
combined_df = combined_df.fillna({
    #Categorical columns whose missing entries become the string 'NA'
    **dict.fromkeys(['MSZoning', 'Utilities', 'MasVnrType', 'Exterior1st', 'Exterior2nd',
                     'KitchenQual', 'Functional', 'SaleType'], 'NA'),
    #Categorical columns whose missing entries become the string 'None'
    **dict.fromkeys(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                     'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'PoolQC', 'Fence', 'MiscFeature'], 'None'),
    #Numerical columns whose missing entries become 0
    **dict.fromkeys(['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                     'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars',
                     'GarageArea'], 0),
    'Alley': 'No Alley',
})


########## Work on a separate copy of the cleaned data for this pipeline ##########
#The Id column is NOT removed here; it is still needed to split the combined frame back into
#train and test rows, and is dropped at that point.
#Take a real copy (plain assignment would only create a second name for the same object) so that
#anything done to combined_df1 is not reflected in combined_df and cannot affect other pipelines
#built from it in the future.
#train_X1 = train_X.loc[:,train_X.columns != 'Id']
#train_y1 = train_y
#test_X1 = test_X .loc[:,test_X.columns != 'Id']
combined_df1 = combined_df.copy()




##########Convert all ORD variables into numbers, that way we can use them in regression. #########
#The mapping below uses the list-of-dicts layout ({"col": ..., "mapping": {...}}) that is handed
#to the category_encoders OrdinalEncoder (8/21/19: the documentation used at first was outdated;
#this is the format that works).

#Rating scales that are shared by several columns.
_quality_scale = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1}
_quality_or_none = {**_quality_scale, 'None':0}
_bsmt_finish_scale = {'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'None':0}

#(column, category -> rank) pairs for every ORDINAL variable, in encoding order.
_ordinal_scales = [
    ("Utilities",    {'AllPub':4, 'NoSewr':3, 'NoSeWa':2, 'ELO':1}),
    ("LandSlope",    {'Sev':3, 'Mod':2, 'Gtl':1}),
    ("ExterQual",    _quality_scale),
    ("ExterCond",    _quality_scale),
    ("BsmtQual",     _quality_or_none),
    ("BsmtCond",     _quality_or_none),
    ("BsmtExposure", {'Gd':4, 'Av':3, 'Mn':2, 'No':1, 'None':0}),
    ("BsmtFinType1", _bsmt_finish_scale),
    ("BsmtFinType2", _bsmt_finish_scale),
    ("HeatingQC",    _quality_scale),
    ("KitchenQual",  _quality_scale),
    ("FireplaceQu",  _quality_or_none),
    ("GarageFinish", {'Fin':3, 'RFn':2, 'Unf':1, 'None':0}),
    ("GarageQual",   _quality_or_none),
    ("GarageCond",   _quality_or_none),
    ("PoolQC",       {'Ex':4, 'Gd':3, 'TA':2, 'Fa':1, 'None':0}),
]

#Every column gets its own dict object, exactly as if each mapping had been written out by hand.
ordinal_cols_mapping1 = [{"col": col_name, "mapping": dict(scale)}
                         for col_name, scale in _ordinal_scales]

#Hand the columns and their corresponding dictionaries to ce.OrdinalEncoder so the category
#labels are swapped for their ranks. The encoder is fitted on the combined (train + test) frame
#rather than on train_X1 / test_X1 separately.
ce_ord = ce.OrdinalEncoder(return_df = True, mapping = ordinal_cols_mapping1)
combined_df1 = ce_ord.fit_transform(combined_df1)


##########Convert all CAT variables into numerical values. ##########
#Each CAT column could be examined to decide the best conversion for it.
#For this pipeline, every categorical variable is simply turned into one hot vectors.
nominal_cols = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
    'CentralAir', 'Electrical', 'Functional', 'GarageType', 'PavedDrive', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition',
]
ce_one_hot = ce.OneHotEncoder(cols = nominal_cols)

#As above, fitted once on the combined frame instead of on train and test separately.
combined_df1 = ce_one_hot.fit_transform(combined_df1)


#Break combined_df1 back up into train_X1 and test_X1.
#Training rows are those whose Id is at most the last saved training Id; the Id column is
#dropped afterwards and the index is renumbered from 0.
train_X1 = (
    combined_df1[combined_df1['Id'] <= train_IDs[len(train_IDs) - 1]]
    .drop(columns='Id')
    .reset_index(drop = True)
)

#Test rows are those whose Id is at least the first saved test Id.
test_X1 = (
    combined_df1[combined_df1['Id'] >= test_IDs[0]]
    .drop(columns='Id')
    .reset_index(drop = True)
)

#The target needs no further processing.
train_y1 = train_y


In [4]:
################# FUNCTIONS PT.1 #################
#Defining important functions for evaluating Boosting models.

def modelfitCV(alg, train_X, train_y, performCV=True, printFeatureImportance=True, cv_folds=5):
    """Fit `alg` on the training data and print a short training report.

    Parameters
    ----------
    alg : estimator exposing fit / predict (and `feature_importances_` when
        printFeatureImportance is True).
    train_X : DataFrame of training features (its `.columns` label the importances).
    train_y : training target.
    performCV : when True, also run `cross_val_score` with the
        'neg_mean_squared_log_error' scoring and print mean / std / min / max of the fold scores.
    printFeatureImportance : when True, draw a bar plot of the 30 largest feature importances.
    cv_folds : value passed as `cv` to `cross_val_score`.

    Returns
    -------
    None. Everything is printed or plotted; `alg` is left fitted on the full training data.
    """
    #Train on the whole training set and predict it back (in-sample predictions).
    alg.fit(train_X, train_y)
    fitted_values = alg.predict(train_X)

    #Cross-validated scores are computed before anything is printed.
    if performCV:
        cv_score = cross_val_score(alg, train_X, train_y, cv = cv_folds,
                                   scoring='neg_mean_squared_log_error')

    #Training-set report
    print("\nModel Report")
    msle = metrics.mean_squared_log_error(train_y, fitted_values)
    print("Mean Squared Log Error : %.4g" % msle)
    explained = metrics.explained_variance_score(train_y, fitted_values) #1.0 is the best value
    print("Explained Variance Score : %.4g" % explained)

    if performCV:
        cv_summary = (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score))
        print("CV Scores \nMean : %.7g | Std : %.7g | Min : %.7g | Max : %.7g" % cv_summary)

    #Bar plot of the 30 most important features, largest first
    if printFeatureImportance:
        importances = pd.Series(alg.feature_importances_, index=train_X.columns)
        top_features = importances.sort_values(ascending=False).iloc[:30]
        top_features.plot(kind='bar', title = 'Feature Importances')
        plt.ylabel('Feature Importance Score')


        
def modelfitXGB(alg, train_X, train_y, useTrainCV=True, printFeatureImportance=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost sklearn-style regressor and print a short training report.

    Parameters
    ----------
    alg : XGBoost sklearn wrapper (must provide get_xgb_params, get_params, set_params,
        fit, predict and get_booster).
    train_X : DataFrame of training features.
    train_y : Series holding the training target.
    useTrainCV : when True, run `xgb.cv` (rmse metric, early stopping) first and overwrite
        `alg`'s n_estimators with the number of rows in the CV result.
    printFeatureImportance : when True, bar-plot the 30 largest `get_fscore()` values.
    cv_folds : number of folds passed to `xgb.cv` as `nfold`.
    early_stopping_rounds : forwarded to `xgb.cv`.

    Returns
    -------
    None. Note that `alg` is modified in place (n_estimators may change, and it is fitted).
    """
    import inspect

    if useTrainCV:
        xgb_params = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train_X.values,label=train_y.values)
        cvresult = xgb.cv(xgb_params,xgtrain,num_boost_round=alg.get_params()['n_estimators'],nfold=cv_folds,metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds,verbose_eval=False)
        #One row of cvresult per boosting round that was kept.
        alg.set_params(n_estimators=cvresult.shape[0])
        #%d instead of %.4g: an integer count of 10000 or more must not be shown as 1e+04.
        print("n_estimators: %d" % alg.get_params()['n_estimators'])

    #Fit the algorithm on the data.
    #fit(eval_metric=...) was deprecated and later removed from the XGBoost sklearn API, so only
    #pass it when this version's fit() still accepts it. No eval_set is supplied here, so leaving
    #it out is not expected to change the fitted model (NOTE(review): confirm for your version).
    fit_kwargs = {}
    try:
        if 'eval_metric' in inspect.signature(alg.fit).parameters:
            fit_kwargs['eval_metric'] = 'rmse'
    except (TypeError, ValueError):
        #Signature could not be inspected; fall back to the original call.
        fit_kwargs['eval_metric'] = 'rmse'
    alg.fit(train_X,train_y,**fit_kwargs)

    #Predict the training set (in-sample predictions)
    train_predictions = alg.predict(train_X)

    #Print Model Report
    print("\nModel Report")
    print("Mean Squared Log Error : %.4g" % metrics.mean_squared_log_error(train_y, train_predictions))
    print("Explained Variance Score : %.4g" % metrics.explained_variance_score(train_y, train_predictions)) #1.0 is the best value

    if printFeatureImportance:
        feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)[0:30]
        feat_imp.plot(kind='bar',title = 'Feature Importances')
        #Label spelled the same way as in modelfitCV.
        plt.ylabel('Feature Importance Score')
        
        
        
        
        
        
        
######### SAVEFITMODELS #########
#Save our predictions to the proper directory.
def SaveFitModels(pred, IDs, fileName, saveDirectory1 = '/Users/armenta/Kaggle/Housing Prices/Predictions/', 
                  saveDirectory2 ='/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/'):
    """Write predictions as an `Id,SalePrice` CSV into two directories.

    Parameters
    ----------
    pred : array-like of predictions; flattened to one dimension.
    IDs : array-like / Series of row Ids, paired with `pred` by POSITION.
    fileName : name of the CSV file to create inside each directory.
    saveDirectory1, saveDirectory2 : target directories (a trailing separator is optional).
        They must already exist.

    Raises
    ------
    ValueError : if `fileName` is empty/None, or `pred` and `IDs` differ in length.
    """
    if not fileName:
        raise ValueError("fileName must be a non-empty string")

    #Pair Ids and predictions by position. Joining two Series on their index labels would
    #silently produce NaN / misaligned rows whenever IDs does not carry a default 0..n-1 index.
    submission = pd.DataFrame({'Id': np.ravel(IDs), 'SalePrice': np.ravel(pred)})

    #Save the same file to both locations. index=False keeps the file to exactly the two
    #columns Id and SalePrice (no extra unnamed index column).
    for directory in (saveDirectory1, saveDirectory2):
        submission.to_csv(path_or_buf = os.path.join(directory, fileName), index = False)
    
    

   
    

def TDComp(model_results,column,number=10):
    """Collect the values of one column for the best-ranked rows of a search result.

    Takes the results table of a parameter grid search or randomized search and returns the
    entries of `column` for every row whose rank_test_score is at most `number`. The output is
    meant to be passed as xgb_parameter_values to the XGBTrainDevComparisons function.

    Parameters
    ----------
    model_results : DataFrame of search results; must contain a `rank_test_score` column.
    column : name of the column to read, usually 'param_' + parameter name.
    number : largest rank to keep. Asking for more ranks than there are rows simply
        returns every row.

    Returns
    -------
    Series of the selected values, in their original row order, re-indexed from 0.

    Example
    -------
    top_n_est = TDComp(model_results_dart1_2, 'param_n_estimators', 10)
    """
    #Rows ranked `number` or better
    in_top_ranks = model_results.rank_test_score <= number
    selected = pd.Series(model_results.loc[in_top_ranks, column])
    return selected.reset_index(drop=True)



def Standardizer(train_X = train_X1, test_X = test_X1, StandardScaler = preprocessing.StandardScaler(), Standardizer = True, Normalizer = False, SandN = False):
    """Scale and/or normalize the train and test features without test-data leakage.

    Every transformer is fitted on `train_X` only and then applied to `test_X`, so no
    statistic of the test data influences the transformation.

    Parameters
    ----------
    train_X : DataFrame of training features (defaults to train_X1 as it existed when this
        function was defined).
    test_X : testing features with the same columns (defaults to test_X1, same caveat).
    StandardScaler : scaler object providing fit_transform / transform; it works along the
        columns. It is fitted in place. The default instance is created once, when the
        function is defined, and is therefore re-fitted by every call that uses it.
    Standardizer : if True, scale the features with `StandardScaler`. Takes precedence over
        the two flags below.
    Normalizer : if True (and Standardizer is False), normalize each row with
        preprocessing.Normalizer. Not recommended unless you understand the changes that
        occur after normalization.
    SandN : if True (and both flags above are False), standardize first and then normalize
        the standardized values.

    Returns
    -------
    (train, test) : two DataFrames carrying train_X's column names and a fresh 0..n-1 index.

    Raises
    ------
    ValueError : if none of Standardizer, Normalizer and SandN is True (previously the
        function silently returned None in that case).
    """
    #Column names are needed to turn the transformed arrays back into DataFrames.
    columns = train_X.columns

    def _as_frame(values):
        #Wrap a transformed array in a DataFrame with the original column names.
        return pd.DataFrame(values, columns=columns)

    if Standardizer:
        #Fit on the train data (learn its statistics), then apply the fitted scaler to the test data.
        out_train = StandardScaler.fit_transform(train_X)
        out_test = StandardScaler.transform(test_X)

    elif Normalizer:
        normalizer = preprocessing.Normalizer()
        out_train = normalizer.fit_transform(train_X)
        out_test = normalizer.transform(test_X)

    elif SandN:
        normalizer = preprocessing.Normalizer()
        s_train_X = StandardScaler.fit_transform(train_X) #Standardize the train data
        s_test_X = StandardScaler.transform(test_X) #Standardize the test data
        out_train = normalizer.fit_transform(s_train_X) #Normalize the standardized train data
        out_test = normalizer.transform(s_test_X) #Normalize the standardized test data

    else:
        raise ValueError("Set one of Standardizer, Normalizer or SandN to True")

    return _as_frame(out_train), _as_frame(out_test)

In [5]:
################# STANDARDIZE THE DATA ##############
#12/20/19
#Three scaled variants of the encoded data, all built from the function's default inputs.

#1) Standardized (column-wise scaling) - the function's default mode
strain_X1, stest_X1 = Standardizer()

#2) Normalized (row-wise); the default standardizing mode has to be switched off
ntrain_X1, ntest_X1 = Standardizer(Normalizer=True, Standardizer=False)

#3) Standardized and then normalized
sntrain_X1, sntest_X1 = Standardizer(SandN=True, Standardizer=False)

In [6]:
################# FUNCTIONS PT.2 #################
################### XGBRModelTune Function ###################
def XGBRModelTune(xgb_alg, xgb_param, xgb_param_vals, train_X=strain_X1, train_y=train_y1, test_X=stest_X1, 
                  cv_num=3, scoring='neg_mean_squared_log_error',Randomized = False, n_iter = 10, 
                  plot2d = True, modelfit = False):
    """Tune ONE XGBoost parameter with a grid search or a randomized search.

    `objective` and `booster` are not meant to be tuned here: they have so few values that
    testing them by hand is simpler.

    Parameters
    ----------
    xgb_alg : the XGBRegressor with its starting parameters. When modelfit is True it is
        modified in place (the best value found is set on it).
    xgb_param : name of the parameter being tuned, as a string. Typical choices:
        learning_rate    - the learning rate of the algorithm.
        n_estimators     - the number of trees in the ensemble.
        max_depth        - maximum depth allowed for an individual tree.
        min_child_weight - minimum weight required in a child node; higher values demand
                           more observations per node.
        gamma            - minimum loss reduction required for a node to split.
        subsample        - fraction of the samples used for each tree.
        colsample_bytree - fraction of the columns used for each tree.
        reg_lambda       - L2 regularization term on the weights.
        reg_alpha        - L1 regularization term on the weights.
    xgb_param_vals : the candidate values (grid search) or the values / distribution to
        sample from (randomized search). Make the range as wide as practical. For a
        randomized search over a list, keep n_iter no larger than the number of values.
    train_X, train_y : training features and target (default to strain_X1 / train_y1 as they
        existed when this function was defined).
    test_X : currently unused; kept for a possible predict option.
    cv_num : number of cross-validation folds used by the search.
    scoring : scoring string handed to the search object.
    Randomized : True -> RandomizedSearchCV, False -> GridSearchCV.
    n_iter : number of sampled candidates; only used when Randomized is True.
    plot2d : if True, show a plotly scatter of mean_test_score against the parameter values,
        which helps to pick a smaller range for the next search.
    modelfit : if True, set the best value on xgb_alg and run modelfitXGB to look at the
        important features.

    Returns
    -------
    list : [fitted search object, cv_results_ as a DataFrame, best_params_, best_score_]
    """
    import inspect

    #This prevents us from getting warnings that are unnecessary and don't add to anything.
    #(Note: this changes the warning filter for the whole session.)
    warnings.simplefilter(action='ignore', category=FutureWarning)

    #Both kinds of search share everything except the class and how candidates are given.
    candidates = {xgb_param:xgb_param_vals}
    if Randomized:
        search_cls = RandomizedSearchCV
        search_kwargs = {'param_distributions': candidates, 'n_iter': n_iter}
    else:
        search_cls = GridSearchCV
        search_kwargs = {'param_grid': candidates}
    search_kwargs.update(estimator = xgb_alg, scoring = scoring, n_jobs = -1, cv = cv_num)

    #`iid` was removed from scikit-learn's search classes; passing it there raises a
    #TypeError. Keep iid=False only on versions whose constructor still accepts it.
    if 'iid' in inspect.signature(search_cls).parameters:
        search_kwargs['iid'] = False

    #Create the search object and fit it on the training data
    search_model = search_cls(**search_kwargs)
    search_model.fit(train_X,train_y)

    #These values are returned along with the fitted search object.
    search_results = pd.DataFrame(search_model.cv_results_) #One row per candidate value
    best_param_val = search_model.best_params_ #The best parameter value (as a dict)
    best_score_val = search_model.best_score_ #The score associated with the best value

    return_values = [search_model,search_results,best_param_val,best_score_val]
    print(best_param_val, best_score_val)

    #Create a 2d plot of mean_test_score (y) vs parameter values (x)
    if plot2d:
        rcParams['figure.figsize'] = 12, 4 #width x height in inches
        param_name = 'param_'+ xgb_param
        fig = px.scatter(search_results,x=param_name,y='mean_test_score',color='mean_test_score')
        fig.show()

    #Create a bar plot showing the weights of the most important features so far.
    if modelfit:
        xgb_alg.set_params(**{xgb_param:best_param_val[xgb_param]})
        modelfitXGB(xgb_alg,train_X,train_y,cv_folds=cv_num)

    return return_values
    
    
    
def TrainTestErrors(model,savefileName=None,save=True,train_X=strain_X1,train_y=train_y1,test_X=stest_X1,
                    t_IDs=test_IDs,metric=metrics.mean_squared_log_error):
    """Fit `model` on the training set and score it against that same training set.

    The model is fitted with `model.fit(train_X, train_y)`; the object returned by
    `fit` is then used for every prediction. Two in-sample numbers are computed from
    the training predictions: the error given by `metric` and the explained variance.
    When `save` is truthy, predictions for `test_X` are also produced and passed to
    `SaveFitModels` together with `t_IDs` and `savefileName`.

    Parameters
    ----------
    model : the machine learning model; it must expose `fit` and the fitted object
        must expose `predict`.
    savefileName : the filename string forwarded to `SaveFitModels`.
    save : boolean that decides whether test predictions are made and handed to
        `SaveFitModels`.
    train_X : the training data (features only).
    train_y : the training data (target only).
    test_X : the testing data (features only); only used when `save` is truthy.
    t_IDs : IDs for the testing data; only used when `save` is truthy.
    metric : callable invoked as `metric(train_y, train_predictions)` to get the error.

    Returns
    -------
    tuple
        `(train_error, train_explained_var)`.

    Notes
    -----
    The data defaults (`strain_X1`, `train_y1`, `stest_X1`, `test_IDs`) are bound when
    this `def` is executed, so those globals must already exist at that point and
    later reassignments of them are not picked up by the defaults.
    NOTE(review): with the defaults, `save=True` forwards `savefileName=None` to
    `SaveFitModels` -- confirm that helper accepts a missing filename.
    """
    #Train the model; keep whatever fit() hands back as the predictor
    fitted = model.fit(train_X,train_y)

    #In-sample predictions, then the error first and the explained variance second
    fitted_train = fitted.predict(train_X)
    scores = (metric(train_y,fitted_train),
              metrics.explained_variance_score(train_y,fitted_train))

    #Nothing to persist: report the training scores straight away
    if not save:
        return scores

    #Predict on the test features and pass the result on to SaveFitModels()
    SaveFitModels(fitted.predict(test_X),t_IDs,savefileName)
    return scores

In [7]:
############ Linear Regression #############
# regressor = LinearRegression(normalize = True)
# linmodel = regressor.fit(train_X1,train_y1)
# linmodel_p = linmodel.predict(test_X1) #These are the predictions for simple linear regression'
# linmodel_p = pd.Series(linmodel_p) #Need to convert it to a series before we can concatenate it.
# linmodel_p = pd.concat([linmodel_p,test_IDs.rename('Id')],axis=1) #Add the IDs
# linmodel_p = linmodel_p.rename(columns = {0:'SalePrice','Id':'Id'}) #Rename the Columns
# linmodel_p = linmodel_p[['Id','SalePrice']] #Switch the order of the columns 
#Comment out after you run once.
#linmodel_p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/linmodel_p_09042019.csv')
#linmodel_p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/linmodel_p_09042019.csv')
#BEFORE SUBMITTING THE FILES, MAKE SURE TO DELETE THE INDEX COLUMN IN EXCEL!!!!!!!!

In [8]:
############ Ridge Linear Regression ############
#Unlike regular linear regression, Ridge has a regularization strength (alpha) to pick, so there are MANY MANY POSSIBILITIES TO CHOOSE FROM.

#alpha = 0.001
# ridge_reg1 = Ridge(alpha = 0.001,normalize = True)
# rr_model1 = ridge_reg1.fit(train_X1,train_y1)
# rr_model1p = rr_model1.predict(test_X1)
# rr_model1p = pd.Series(rr_model1p)
# rr_model1p = pd.concat([rr_model1p,test_IDs.rename('Id')],axis=1)
# rr_model1p = rr_model1p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model1p = rr_model1p[['Id','SalePrice']]

#rr_model1p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model1p_09042019.csv')
#rr_model1p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model1p_09042019.csv')

#alpha = 0.01
# ridge_reg2 = Ridge(alpha = 0.01,normalize = True)
# rr_model2 = ridge_reg2.fit(train_X1,train_y1)
# rr_model2p = rr_model2.predict(test_X1)
# rr_model2p = pd.Series(rr_model2p)
# rr_model2p = pd.concat([rr_model2p,test_IDs.rename('Id')],axis=1)
# rr_model2p = rr_model2p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model2p = rr_model2p[['Id','SalePrice']]

#rr_model2p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model2p_09052019.csv')
#rr_model2p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model2p_09052019.csv')


#alpha = 0.1
# ridge_reg3 = Ridge(alpha = 0.1,normalize = True)
# rr_model3 = ridge_reg3.fit(train_X1,train_y1)
# rr_model3p = rr_model3.predict(test_X1)
# rr_model3p = pd.Series(rr_model3p)
# rr_model3p = pd.concat([rr_model3p,test_IDs.rename('Id')],axis=1)
# rr_model3p = rr_model3p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model3p = rr_model3p[['Id','SalePrice']]

# rr_model3p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model3p_09052019.csv')
# rr_model3p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model3p_09052019.csv')


#alpha = 0.25
# ridge_reg4 = Ridge(alpha = 0.25,normalize = True)
# rr_model4 = ridge_reg4.fit(train_X1,train_y1)
# rr_model4p = rr_model4.predict(test_X1)
# rr_model4p = pd.Series(rr_model4p)
# rr_model4p = pd.concat([rr_model4p,test_IDs.rename('Id')],axis=1)
# rr_model4p = rr_model4p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model4p = rr_model4p[['Id','SalePrice']]

# rr_model4p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model4p_09052019.csv')
# rr_model4p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model4p_09052019.csv')


#alpha = 0.5
# ridge_reg5 = Ridge(alpha = 0.5,normalize = True)
# rr_model5 = ridge_reg5.fit(train_X1,train_y1)
# rr_model5p = rr_model5.predict(test_X1)
# rr_model5p = pd.Series(rr_model5p)
# rr_model5p = pd.concat([rr_model5p,test_IDs.rename('Id')],axis=1)
# rr_model5p = rr_model5p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model5p = rr_model5p[['Id','SalePrice']]

# rr_model5p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model5p_09052019.csv')
# rr_model5p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model5p_09052019.csv')


#alpha = 0.75
# ridge_reg6 = Ridge(alpha = 0.75,normalize = True)
# rr_model6 = ridge_reg6.fit(train_X1,train_y1)
# rr_model6p = rr_model6.predict(test_X1)
# rr_model6p = pd.Series(rr_model6p)
# rr_model6p = pd.concat([rr_model6p,test_IDs.rename('Id')],axis=1)
# rr_model6p = rr_model6p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model6p = rr_model6p[['Id','SalePrice']]

# rr_model6p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model6p_09052019.csv')
# rr_model6p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model6p_09052019.csv')


#alpha = 1
# ridge_reg7 = Ridge(alpha = 1,normalize = True)
# rr_model7 = ridge_reg7.fit(train_X1,train_y1)
# rr_model7p = rr_model7.predict(test_X1)
# rr_model7p = pd.Series(rr_model7p)
# rr_model7p = pd.concat([rr_model7p,test_IDs.rename('Id')],axis=1)
# rr_model7p = rr_model7p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model7p = rr_model7p[['Id','SalePrice']]

# rr_model7p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model7p_09052019.csv')
# rr_model7p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model7p_09052019.csv')

#alpha = 10
# ridge_reg8 = Ridge(alpha = 10,normalize = True)
# rr_model8 = ridge_reg8.fit(train_X1,train_y1)
# rr_model8p = rr_model8.predict(test_X1)
# rr_model8p = pd.Series(rr_model8p)
# rr_model8p = pd.concat([rr_model8p,test_IDs.rename('Id')],axis=1)
# rr_model8p = rr_model8p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model8p = rr_model8p[['Id','SalePrice']]

# rr_model8p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model8p_09052019.csv')
# rr_model8p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model8p_09052019.csv')



#alpha = 2
# ridge_reg9 = Ridge(alpha = 2,normalize = True)
# rr_model9 = ridge_reg9.fit(train_X1,train_y1)
# rr_model9p = rr_model9.predict(test_X1)
# rr_model9p = pd.Series(rr_model9p)
# rr_model9p = pd.concat([rr_model9p,test_IDs.rename('Id')],axis=1)
# rr_model9p = rr_model9p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model9p = rr_model9p[['Id','SalePrice']]

# rr_model9p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model9p_09052019.csv')
# rr_model9p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model9p_09052019.csv')


#alpha = 5
# ridge_reg10 = Ridge(alpha = 5,normalize = True)
# rr_model10 = ridge_reg10.fit(train_X1,train_y1)
# rr_model10p = rr_model10.predict(test_X1)
# rr_model10p = pd.Series(rr_model10p)
# rr_model10p = pd.concat([rr_model10p,test_IDs.rename('Id')],axis=1)
# rr_model10p = rr_model10p.rename(columns = {0:'SalePrice','Id':'Id'})
# rr_model10p = rr_model10p[['Id','SalePrice']]

# rr_model10p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rr_model10p_09052019.csv')
# rr_model10p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rr_model10p_09052019.csv')


In [9]:
############ Regression Trees #############

#MSE
# dt_reg = DecisionTreeRegressor()
# dt_model = dt_reg.fit(train_X1,train_y1)
# dt_modelp = dt_model.predict(test_X1)
# dt_modelp = pd.Series(dt_modelp)
# dt_modelp = pd.concat([dt_modelp,test_IDs.rename('Id')],axis=1)
# dt_modelp = dt_modelp.rename(columns = {0:'SalePrice','Id':'Id'})
# dt_modelp = dt_modelp[['Id','SalePrice']]

#dt_modelp.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/dt_modelp_09052019.csv')
#dt_modelp.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/dt_modelp_09052019.csv')


#Friedman MSE
# dt_reg2 = DecisionTreeRegressor(criterion='friedman_mse')
# dt_model2 = dt_reg2.fit(train_X1,train_y1)
# dt_model2p = dt_model2.predict(test_X1)
# dt_model2p = pd.Series(dt_model2p)
# dt_model2p = pd.concat([dt_model2p,test_IDs.rename('Id')],axis=1)
# dt_model2p = dt_model2p.rename(columns = {0:'SalePrice','Id':'Id'})
# dt_model2p = dt_model2p[['Id','SalePrice']]

#dt_model2p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/dt_model2p_09052019.csv')
#dt_model2p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/dt_model2p_09052019.csv')


#MAE
#Decision tree that splits on mean absolute error, fitted on the full training set
dt_reg3 = DecisionTreeRegressor(criterion='mae')
dt_model3 = dt_reg3.fit(train_X1,train_y1)
#Build the prediction table in one pass:
#  - the unnamed prediction Series becomes column 0 next to the test IDs (renamed 'Id');
#    concat with axis=1 aligns the two on their index
#  - column 0 is relabelled 'SalePrice' and the columns are ordered Id, SalePrice
dt_model3p = (
    pd.concat([pd.Series(dt_model3.predict(test_X1)),test_IDs.rename('Id')],axis=1)
    .rename(columns = {0:'SalePrice','Id':'Id'})
    [['Id','SalePrice']]
)

#dt_model3p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/dt_model3p_09052019.csv')
#dt_model3p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/dt_model3p_09052019.csv')



In [11]:
############ Regression Forests ###########
#n_estimators = 100, criterion = mse, bootstrap = True
# forest100 = RandomForestRegressor(n_estimators = 100)
# fmodel100 = forest100.fit(train_X1,train_y1)
# fmodel100_p = fmodel100.predict(test_X1)
# fmodel100_p = pd.Series(fmodel100_p)
# fmodel100_p = pd.concat([fmodel100_p,test_IDs.rename('Id')],axis=1)
# fmodel100_p = fmodel100_p.rename(columns = {0:'SalePrice','Id':'Id'})
# fmodel100_p = fmodel100_p[['Id','SalePrice']]

# fmodel100_p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/fmodel100_p_09062019.csv')
# fmodel100_p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/fmodel100_p_09062019.csv')


#n_estimators = 1000, criterion = mse, bootstrap = True
# forest1000 = RandomForestRegressor(n_estimators = 1000)
# fmodel1000 = forest1000.fit(train_X1,train_y1)
# fmodel1000_p = fmodel1000.predict(test_X1)
# fmodel1000_p = pd.Series(fmodel1000_p)
# fmodel1000_p = pd.concat([fmodel1000_p,test_IDs.rename('Id')],axis=1)
# fmodel1000_p = fmodel1000_p.rename(columns = {0:'SalePrice','Id':'Id'})
# fmodel1000_p = fmodel1000_p[['Id','SalePrice']]

# fmodel1000_p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/fmodel1000_p_09062019.csv')
# fmodel1000_p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/fmodel1000_p_09062019.csv')


#n_estimators = 100, criterion = mae, bootstrap = True
# forest1002 = RandomForestRegressor(n_estimators = 100,criterion = 'mae')
# fmodel1002 = forest1002.fit(train_X1,train_y1)
# fmodel100_2p = fmodel1002.predict(test_X1)
# fmodel100_2p = pd.Series(fmodel100_2p)
# fmodel100_2p = pd.concat([fmodel100_2p,test_IDs.rename('Id')],axis=1)
# fmodel100_2p = fmodel100_2p.rename(columns = {0:'SalePrice','Id':'Id'})
# fmodel100_2p = fmodel100_2p[['Id','SalePrice']]

# fmodel100_2p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/fmodel100_2p_09062019.csv')
# fmodel100_2p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/fmodel100_2p_09062019.csv')


#n_estimators = 1000, criterion = mae, bootstrap = True
# forest10002 = RandomForestRegressor(n_estimators = 1000,criterion = 'mae')
# fmodel10002 = forest10002.fit(train_X1,train_y1)
# fmodel1000_2p = fmodel10002.predict(test_X1)
# fmodel1000_2p = pd.Series(fmodel1000_2p)
# fmodel1000_2p = pd.concat([fmodel1000_2p,test_IDs.rename('Id')],axis=1)
# fmodel1000_2p = fmodel1000_2p.rename(columns = {0:'SalePrice','Id':'Id'})
# fmodel1000_2p = fmodel1000_2p[['Id','SalePrice']]

# fmodel1000_2p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/fmodel1000_2p_09062019.csv')
# fmodel1000_2p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/fmodel1000_2p_09062019.csv')



#n_estimators = 100, criterion = mse, bootstrap = False
# forest1003 = RandomForestRegressor(n_estimators = 100,criterion = 'mse', bootstrap = False)
# fmodel1003 = forest1003.fit(train_X1,train_y1)
# fmodel100_3p = fmodel1003.predict(test_X1)
# fmodel100_3p = pd.Series(fmodel100_3p)
# fmodel100_3p = pd.concat([fmodel100_3p,test_IDs.rename('Id')],axis=1)
# fmodel100_3p = fmodel100_3p.rename(columns = {0:'SalePrice','Id':'Id'})
# fmodel100_3p = fmodel100_3p[['Id','SalePrice']]

# fmodel100_3p.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/fmodel100_3p_09062019.csv')
# fmodel100_3p.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/fmodel100_3p_09062019.csv')


In [19]:
############ 9/10/19 RandomForestRegressor using Cross Validation and Grid Search ##########
#using modelfitCV in order to tune some parameters for our random forests.
#Using default values for the first run
# rf_0 = RandomForestRegressor(random_state = 2, n_estimators=10)
# modelfitCV(rf_0, train_X1, train_y1)

#########Tuning of the n_estimators parameters 10 -> 100 in steps of 10
# param_test1 = {'n_estimators':range(10,101,10)}
# gsearch = grid search
# gsearch1 = GridSearchCV(estimator = RandomForestRegressor(min_samples_split=14,min_samples_leaf=2, 
#                                                           max_depth=5,max_features='sqrt',random_state=2),
#                        param_grid = param_test1, scoring='neg_mean_squared_log_error',n_jobs=4,iid=False,cv=5)
# gsearch1.fit(train_X1,train_y1)
#[gsearch1.cv_results_[x] for x in ('params','mean_test_score','std_test_score')],gsearch1.best_params_,gsearch1.best_score_


#########Tuning of the n_estimators parameters 100 -> 200 in steps of 10
# param_test2 = {'n_estimators':range(100,201,10)}
# gsearch2 = GridSearchCV(estimator = RandomForestRegressor(min_samples_split=14,min_samples_leaf=2,max_depth=5,max_features='sqrt',random_state=2),
#                        param_grid = param_test2, scoring = 'neg_mean_squared_log_error',n_jobs=4,iid=False,cv=5)
# gsearch2.fit(train_X1,train_y1)
# [gsearch2.cv_results_[x] for x in ('params','mean_test_score','std_test_score')],gsearch2.best_params_,gsearch2.best_score_



#########Tuning of max_depth (4->10 in steps of 2) and min_samples_split (10->50 in steps of 5)
# param_test3 = {'max_depth':range(4,11,2), 'min_samples_split':range(10,51,5)}
# gsearch3 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=160, max_features = 'sqrt',random_state=2),
#                        param_grid = param_test3, scoring = 'neg_mean_squared_log_error', n_jobs=4,iid=False,cv=5)
# gsearch3.fit(train_X1,train_y1)
# [gsearch3.cv_results_[x] for x in ('params','mean_test_score','std_test_score')],gsearch3.best_params_,gsearch3.best_score_


#########Tuning of max_depth (10->20 in steps of 2) and min_samples_split (2->12 in steps of 2)
# param_test4 = {'max_depth':range(10,21,2), 'min_samples_split':range(2,13,2)}
# gsearch4 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=160, max_features = 'sqrt',random_state=2),
#                        param_grid = param_test4, scoring = 'neg_mean_squared_log_error', n_jobs=4,iid=False,cv=5)
# gsearch4.fit(train_X1,train_y1)
#results4 = pd.DataFrame(gsearch4.cv_results_)
#results4.loc[:,('params','mean_test_score','std_test_score')]
#gsearch4.best_params_, gsearch4.best_score_


#########Tuning of min_samples_leaf (1->10 in steps of 1)
# param_test5 = {'min_samples_leaf':range(1,11,1)}
# gsearch5 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=160, max_features = 'sqrt', max_depth=18, min_samples_split=2, random_state=2),
#                        param_grid = param_test5, scoring = 'neg_mean_squared_log_error', n_jobs=4, iid=False, cv=5)
# gsearch5.fit(train_X1,train_y1)
# results5 = pd.DataFrame(gsearch5.cv_results_)
#results5.loc[:,('params','mean_test_score','std_test_score')]
#gsearch5.best_params_, gsearch5.best_score_

#Now we will check on how our mean score has improved since when we first ran modelfitCV
#rcParams['figure.figsize'] = 12,4 #just in case the graph size isn't the same as before
#modelfitCV(gsearch5.best_estimator_,train_X1,train_y1)


#########Tuning of max_features (12->40 in steps of 4)
# param_test6 = {'max_features':range(12,41,4)}
# gsearch6 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=160,max_depth=18,min_samples_split=2,min_samples_leaf=1,random_state=2),
#                        param_grid = param_test6, scoring = 'neg_mean_squared_log_error', n_jobs=4, iid=False, cv=5)
# gsearch6.fit(train_X1,train_y1)
# results6 = pd.DataFrame(gsearch6.cv_results_)
#results6.loc[:,('params','mean_test_score','std_test_score')]
#gsearch6.best_params_, gsearch6.best_score_


#Use our tuned parameters to create a model. Let's check it with modelfitCV, then fit it on the training data and predict on the test data.
#rfr_tuned = RandomForestRegressor(n_estimators = 160, max_depth=18, min_samples_split=2, min_samples_leaf=1,max_features=36,random_state=2)
#modelfitCV(rfr_tuned,train_X1,train_y1)


###############################
#TRAINING THE 1st TUNED MODEL
##############################
# rfr_tuned_model = rfr_tuned.fit(train_X1,train_y1)
# rfr_tuned_modelp = rfr_tuned_model.predict(test_X1)
# rfr_tuned_modelp = pd.Series(rfr_tuned_modelp)
# rfr_tuned_modelp = pd.concat([rfr_tuned_modelp,test_IDs.rename('Id')],axis=1)
# rfr_tuned_modelp = rfr_tuned_modelp.rename(columns = {0:'SalePrice','Id':'Id'})
# rfr_tuned_modelp = rfr_tuned_modelp[['Id','SalePrice']]

# rfr_tuned_modelp.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/rfr_tuned_modelp_09102019.csv')
# rfr_tuned_modelp.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/rfr_tuned_modelp_09102019.csv')



#########Retuning of min_samples_split (2 -> 30 in steps of 2)
# param_test7 = {'min_samples_split':range(2,31,2)}
# gsearch7 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=160,max_depth=18,min_samples_leaf=1,max_features=36,random_state=2),
#                        param_grid = param_test7, scoring = 'neg_mean_squared_log_error', n_jobs=4, iid=False, cv=5)
# gsearch7.fit(train_X1,train_y1)
# results7 = pd.DataFrame(gsearch7.cv_results_)
# results7.loc[:,('params','mean_test_score','std_test_score')]
#gsearch7.best_params_, gsearch7.best_score_ 


#########Retuning of min_samples_leaf (1 -> 15 in steps of 1)
# param_test8 = {'min_samples_leaf':range(1,16,1)}
# gsearch8 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=160,max_depth=18,max_features=36,min_samples_split=2,random_state=2),
#                        param_grid = param_test8, scoring = 'neg_mean_squared_log_error', n_jobs=4, iid=False, cv=5)
# gsearch8.fit(train_X1,train_y1)
# results8 = pd.DataFrame(gsearch8.cv_results_)
# results8.loc[:,('params','mean_test_score','std_test_score')]
# gsearch8.best_params_, gsearch8.best_score_ 

In [22]:
############ AdaBoost using CrossValidation and Grid Search ###########
#uncomment when you just open up the notebook
#rfr_tuned = RandomForestRegressor(n_estimators = 160, max_depth=18, min_samples_split=2, min_samples_leaf=1,max_features=36,random_state=2)


#Create the AdaBoosting model
#AB1 = AdaBoostRegressor(rfr_tuned,random_state=2)


############################################################################
########YOU DONT NEED TO RUN THIS SECTION AGAIN! USE THE NEW VALUES TO TUNE  
###########################################################################
############Tuning of the learning_rate (randomly generated) and n_estimators (randomly generated)
#Creating the random values for the learning rate
# r_exp = -4*np.random.rand(20)
# alpha = 10**r_exp
# param_testAB1 = {'learning_rate':alpha,'n_estimators': np.random.randint(10,100,20)} #the parameters that we are tuning
# #rgsearch = random grid search
# rgsearchAB1 = RandomizedSearchCV(estimator = AB1, param_distributions = param_testAB1,n_iter=20,scoring = 'neg_mean_squared_log_error',
#                                   cv=5, iid=False, n_jobs=-1)
# rgsearchAB1.fit(train_X1,train_y1) #This took approx 10 minutes to run #Ran 9/13/19 1:14AM 

# resultsAB1 = pd.DataFrame(rgsearchAB1.cv_results_) #convert the results to a dataframe to be easily read
# resultsAB1.loc[:,('params','mean_test_score','std_test_score')] 
# rgsearchAB1.best_params_, rgsearchAB1.best_score_
# rcParams['figure.figsize'] = 12, 4 #width x height in inches
# modelfitCV(rgsearchAB1.best_estimator_,train_X1,train_y1) #Create a graph to store in our power point

# adaboost_tuned1 = rgsearchAB1.best_estimator_
# adaboost_p1 = adaboost_tuned1.predict(test_X1)
# adaboost_p1 = pd.Series(adaboost_p1)
# adaboost_p1 = pd.concat([adaboost_p1,test_IDs.rename('Id')],axis=1)
# adaboost_p1 = adaboost_p1.rename(columns = {0:'SalePrice','Id':'Id'})
# adaboost_p1 = adaboost_p1[['Id','SalePrice']]

# adaboost_p1.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/adaboost_p1_09122019.csv')
# adaboost_p1.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/adaboost_p1_09122019.csv')




########YOU ARE HERE AS OF 9/13/19 1:35AM
########Further tuning of the learning_rate (0.60 -> 0.75 in steps of 0.01), and the n_estimators (41 -> 59 in steps of 2)
# param_testAB2 = {'learning_rate':np.arange(0.6,0.75,0.01),'n_estimators':range(41,60,2)}
# rgsearchAB2 = GridSearchCV(estimator = AB1, param_grid = param_testAB2, scoring='neg_mean_squared_log_error',
#                           cv=5, iid=False, n_jobs=-1)
# rgsearchAB2.fit(train_X1,train_y1) #Took about 3 hours.

# resultsAB2 = pd.DataFrame(rgsearchAB2.cv_results_)
# resultsAB2.loc[:,('params','mean_test_score','std_test_score')]
# rgsearchAB2.best_params_, rgsearchAB2.best_score_

#modelfitCV(rgsearchAB2.best_estimator_,train_X1,train_y1)

# adaboost_tuned2 = rgsearchAB2.best_estimator_
# adaboost_p2 = adaboost_tuned2.predict(test_X1)
# adaboost_p2 = pd.Series(adaboost_p2)
# adaboost_p2 = pd.concat([adaboost_p2,test_IDs.rename('Id')],axis=1)
# adaboost_p2 = adaboost_p2.rename(columns = {0:'SalePrice','Id':'Id'})
# adaboost_p2 = adaboost_p2[['Id','SalePrice']]

# adaboost_p2.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/adaboost_p2_09132019.csv')
# adaboost_p2.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/adaboost_p2_09132019.csv')



##########Further tuning of the learning_rate (0.75 -> 0.85 steps of 0.02), and the n_estimators (59 -> 71 in steps of 2)
# param_testAB3 = {'learning_rate':np.arange(0.75,0.85,0.02),'n_estimators':range(59,72,2)}
# rgsearchAB3 = GridSearchCV(estimator = AB1, param_grid = param_testAB3, scoring ='neg_mean_squared_log_error',
#                           cv=5, iid=False, n_jobs=-1)
# rgsearchAB3.fit(train_X1,train_y1)

#resultsAB3 = pd.DataFrame(rgsearchAB3.cv_results_)
# resultsAB3.loc[:,('params','mean_test_score','std_test_score')]
# rgsearchAB3.best_params_, rgsearchAB3.best_score_

#modelfitCV(rgsearchAB3.best_estimator_,train_X1,train_y1)

# adaboost_tuned3 = rgsearchAB3.best_estimator_
# adaboost_p3 = adaboost_tuned3.predict(test_X1)
# adaboost_p3 = pd.Series(adaboost_p3)
# adaboost_p3 = pd.concat([adaboost_p3,test_IDs.rename('Id')],axis=1)
# adaboost_p3 = adaboost_p3.rename(columns = {0:'SalePrice','Id':'Id'})
# adaboost_p3 = adaboost_p3[['Id','SalePrice']]

# adaboost_p3.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/adaboost_p3_09132019.csv')
# adaboost_p3.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/adaboost_p3_09132019.csv')




##########Tuning the loss function to see if any improvements will manifest.
# AB2 = AdaBoostRegressor(rfr_tuned, learning_rate = 0.77,n_estimators = 67,random_state=2)
# param_testAB4 = {'loss': ['linear','square','exponential']}
# rgsearchAB4 = GridSearchCV(estimator=AB2, param_grid = param_testAB4, scoring = 'neg_mean_squared_log_error',
#                           cv=5, iid=False, n_jobs=-1)
# rgsearchAB4.fit(train_X1,train_y1)

# resultsAB4 = pd.DataFrame(rgsearchAB4.cv_results_)
# resultsAB4.loc[:,('params','mean_test_score','std_test_score')]
#rgsearchAB4.best_params_, rgsearchAB4.best_score_ #linear is the best way to go!






# DO NOT DELETE OR MOVE, KEEP THIS CODE HERE.
# From a random website on Adaboost (but you had to pay for the rest)
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.model_selection import RandomizedSearchCV

#The person that coded this created the following parameter distribution for a randomized search:
# param_dist = {
#  'n_estimators': [50, 100],
#  'learning_rate' : [0.01,0.05,0.1,0.3,1],
#  'loss' : ['linear', 'square', 'exponential']
#  }

# pre_gs_inst = RandomizedSearchCV(AdaBoostRegressor(),
#  param_distributions = param_dist,
#  cv=3,
#  n_iter = 10,
#  n_jobs=-1)

# pre_gs_inst.fit(X_train, y_train)

In [28]:
############ Gradient Boosting with RandomizedCrossValidation ############
#Lets see how generic gradient boosting works with no tuning
# gb_0 = GradientBoostingRegressor(random_state=5)
# rcParams['figure.figsize'] = 12, 4
# modelfitCV(gb_0, train_X1, train_y1)

#Lets see how the base model performs on Kaggle.
# GBmodel = gb_0.fit(train_X1,train_y1)
# GBmodelp = GBmodel.predict(test_X1)
# GBmodelp = pd.Series(GBmodelp)
# GBmodelp = pd.concat([GBmodelp,test_IDs.rename('Id')],axis=1)
# GBmodelp = GBmodelp.rename(columns = {0:'SalePrice','Id':'Id'})
# GBmodelp = GBmodelp[['Id','SalePrice']]

# GBmodelp.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/GBmodelp_09162019.csv')
# GBmodelp.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/GBmodelp_09162019.csv')

#This regular model outperformed the tuned Adaboost + Regression Trees (Wow)



#Random Tuning of learning_rate with n_estimators. 
#This is not recommended by the article because they affect each other but this is a little experiment that I 
#want to run to test how good this process will be.

###########Tune learning_rate (randomly generated 20 values) and n_estimators (40 -> 140 in steps of 10)
# r_expGB = -4*np.random.rand(20)
# LR_GB = 10**r_expGB
# NE = range(40,141,10)
# GB_param_test1 = {'n_estimators':NE,'learning_rate':LR_GB}
# GBrgsearch1 = RandomizedSearchCV(estimator = GradientBoostingRegressor(min_samples_split=14,min_samples_leaf=2,max_depth=5,
#                                                                       max_features='sqrt',subsample=0.8,random_state=5),
#                                  param_distributions=GB_param_test1,n_iter=80, scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
#GBrgsearch1.fit(train_X1,train_y1)

#GBresults1 = pd.DataFrame(GBrgsearch1.cv_results_)
#GBresults1.loc[:,('params','mean_test_score','std_test_score')]
#GBrgsearch1.best_params_,GBrgsearch1.best_score_
# rcParams['figure.figsize'] = 12, 4
# modelfitCV(GBrgsearch1.best_estimator_,train_X1,train_y1)

# GB1 = GBrgsearch1.best_estimator_
# GB_model1 = GB1.fit(train_X1,train_y1)
# GBp1 = GB_model1.predict(test_X1)
# GBp1 = pd.Series(GBp1)
# GBp1 = pd.concat([GBp1,test_IDs.rename('Id')],axis=1)
# GBp1 = GBp1.rename(columns = {0:'SalePrice','Id':'Id'})
# GBp1 = GBp1[['Id','SalePrice']]

# GBp1.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/GBp1_09162019.csv')
# GBp1.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/GBp1_09162019.csv')




#########Tune n_estimators (130 -> 200 in steps of 10) and learning_rate (0.040 -> 0.060 in steps of 0.002)
# GB_param_test2 = {'n_estimators':range(130,201,10),'learning_rate':np.arange(0.040,0.061,0.002)}
# GBgsearch2 = GridSearchCV(estimator = GradientBoostingRegressor(min_samples_split=14,min_samples_leaf=2,max_depth=5,
#                                                                 max_features='sqrt',subsample=0.8,random_state=5),
#                           param_grid=GB_param_test2,scoring='neg_mean_squared_log_error',cv=5,iid=False,n_jobs=-1)
# GBgsearch2.fit(train_X1,train_y1)

# GBresults2 = pd.DataFrame(GBgsearch2.cv_results_)
# GBresults2.loc[:,('params','mean_test_score','std_test_score')]
# GBgsearch2.best_params_,GBgsearch2.best_score_
# rcParams['figure.figsize'] = 12, 4
# modelfitCV(GBgsearch2.best_estimator_,train_X1,train_y1)

# GB2 = GBgsearch2.best_estimator_
# GB_model2 = GB2.fit(train_X1,train_y1)
# GBp2 = GB_model2.predict(test_X1)
# GBp2 = pd.Series(GBp2)
# GBp2 = pd.concat([GBp2,test_IDs.rename('Id')],axis=1)
# GBp2 = GBp2.rename(columns = {0:'SalePrice','Id':'Id'})
# GBp2 = GBp2[['Id','SalePrice']]

# GBp2.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/GBp2_09172019.csv')
# GBp2.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/GBp2_09172019.csv')






#########Tune n_estimators (180 -> 250 in steps of 10) and max_depth (4 -> 10 steps of 1)
# GB_param_test3 = {'n_estimators':range(180,251,10),'max_depth':range(4,11,1)}
# GBgsearch3 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.0580,max_features='sqrt',subsample=0.8,random_state=5),
#                          param_grid=GB_param_test3,scoring='neg_mean_squared_log_error',cv=5,iid=False,n_jobs=-1)
# GBgsearch3.fit(train_X1,train_y1)

# GBresults3 = pd.DataFrame(GBgsearch3.cv_results_)
# GBresults3.loc[:,('params','mean_test_score','std_test_score')]
# GBgsearch3.best_params_,GBgsearch3.best_score_


##########Tune n_estimators (230 -> 300 in steps of 10) and max_depth (3 -> 7 steps of 1)
# GB_param_test4 = {'n_estimators':range(230,301,10),'max_depth':range(3,8,1)}
# GBgsearch4 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.0580,max_features='sqrt',subsample=0.8,random_state=5),
#                          param_grid=GB_param_test4,scoring='neg_mean_squared_log_error',cv=5,iid=False,n_jobs=-1)
# GBgsearch4.fit(train_X1,train_y1)
# GBgsearch4.best_params_,GBgsearch4.best_score_



#########Tune n_estimators (190 -> 390 in steps of 10) 
# GB_param_test5 = {'n_estimators':range(190,391,10)}
# GBgsearch5 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.0580,max_depth=4,max_features='sqrt',subsample=0.8,random_state=5),
#                          param_grid=GB_param_test5,scoring='neg_mean_squared_log_error',cv=5,iid=False,n_jobs=-1)
# GBgsearch5.fit(train_X1,train_y1)
# GBgsearch5.best_params_,GBgsearch5.best_score_


########Tune min_samples_split (4 -> 20 steps of 1) and min_samples_leaf (2 -> 10 steps of 1)
# GB_param_test6 = {'min_samples_split':range(4,21,1),'min_samples_leaf':range(2,11,1)}
# GBgsearch6 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.0580,n_estimators=380,max_depth=4,max_features='sqrt',subsample=0.8,random_state=5),
#                          param_grid=GB_param_test6,scoring='neg_mean_squared_log_error',cv=5,iid=False,n_jobs=-1)
# GBgsearch6.fit(train_X1,train_y1)
# GBgsearch6.best_params_,GBgsearch6.best_score_

# modelfitCV(GBgsearch6.best_estimator_,train_X1,train_y1)

# GB6 = GBgsearch6.best_estimator_
# GBmodel6 = GB6.fit(train_X1,train_y1)
# GBp6 = GBmodel6.predict(test_X1)
# GBp6 = pd.Series(GBp6)
# GBp6 = pd.concat([GBp6,test_IDs.rename('Id')],axis=1)
# GBp6 = GBp6.rename(columns = {0:'SalePrice','Id':'Id'})
# GBp6 = GBp6[['Id','SalePrice']]

# GBp6.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/GBp6_09172019.csv')
# GBp6.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/GBp6_09172019.csv')


#########Tune max_features (10 -> 32 steps of 2) and subsample (0.4->0.8 steps of 0.1)
# GB_param_test7 = {'max_features':range(10,33,2),'subsample':np.arange(0.4,0.9,0.1)}
# GBgsearch7 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.0580,n_estimators=380,max_depth=4,min_samples_split=19,min_samples_leaf=2,random_state=5),
#                          param_grid=GB_param_test7,scoring='neg_mean_squared_log_error',cv=5,iid=False,n_jobs=-1)
# GBgsearch7.fit(train_X1,train_y1)
# GBgsearch7.best_params_,GBgsearch7.best_score_

# modelfitCV(GBgsearch7.best_estimator_,train_X1,train_y1)

# GB7 = GBgsearch7.best_estimator_
# GBmodel7 = GB7.fit(train_X1,train_y1)
# GBp7 = GBmodel7.predict(test_X1)
# GBp7 = pd.Series(GBp7)
# GBp7 = pd.concat([GBp7,test_IDs.rename('Id')],axis=1)
# GBp7 = GBp7.rename(columns = {0:'SalePrice','Id':'Id'})
# GBp7 = GBp7[['Id','SalePrice']]

# GBp7.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/GBp7_09172019.csv')
# GBp7.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/GBp7_09172019.csv')



In [9]:
############ XGBoost (9/18/19) ############
#Following the instructions from the paper.
#First starting model

# The code below silences the FutureWarning. The warning is not serious, so this is optional; it is kept here just in case.
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)


#############Untuned model
# xgb1 = XGBRegressor(learning_rate=0.1,n_estimators=1000,max_depth=6,min_child_weight=1,gamma=0,subsample=0.6,
#                    colsample_bytree=0.1,scale_pos_weight=1,seed=13,objective='reg:squarederror')
# rcParams['figure.figsize'] = 12, 4 #width x height in inches
# modelfitXGB(xgb1,train_X1,train_y1)

# XGBmodel = xgb1.fit(train_X1,train_y1)
# XGBp = XGBmodel.predict(test_X1)
# XGBp = pd.Series(XGBp)
# XGBp = pd.concat([XGBp,test_IDs.rename('Id')],axis=1)
# XGBp = XGBp.rename(columns = {0:'SalePrice','Id':'Id'})
# XGBp = XGBp[['Id','SalePrice']]

# XGBp.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/XGBp_09182019.csv')
# XGBp.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/XGBp_09182019.csv')



#############Tune max_depth and min_child_weight 
# XGB_param_test1 = {'max_depth':range(2,13,2),'min_child_weight':range(1,6,1)}
# XGBgsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=300,gamma=0,subsample=0.6,colsample_bytree=0.1,
#                                                   objective='reg:squarederror',scale_pos_weight=1,seed=13),param_grid=XGB_param_test1,
#                           scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch1.fit(train_X1,train_y1)

# XGBresults1 = pd.DataFrame(XGBgsearch1.cv_results_)
# XGBresults1.loc[:,('params','mean_test_score','std_test_score')]
# XGBgsearch1.best_params_,XGBgsearch1.best_score_



#############Finetune max_depth and min_child_weight even more.
# XGB_param_test2 = {'max_depth':range(3,7,1),'min_child_weight':range(2,6,1)}
# XGBgsearch2 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=300,gamma=0,subsample=0.6,colsample_bytree=0.1,
#                                                   objective='reg:squarederror',scale_pos_weight=1,seed=13),
#                            param_grid=XGB_param_test2,scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch2.fit(train_X1,train_y1)

# XGBresults2 = pd.DataFrame(XGBgsearch2.cv_results_)
# XGBresults2.loc[:,('params','mean_test_score','std_test_score')]
# XGBgsearch2.best_params_,XGBgsearch2.best_score_

# modelfitXGB(XGBgsearch2.best_estimator_,train_X1,train_y1)

# XGB2 = XGBgsearch2.best_estimator_
# XGBmodel2 = XGB2.fit(train_X1,train_y1)
# XGBp2 = XGBmodel2.predict(test_X1)
# XGBp2 = pd.Series(XGBp2)
# XGBp2 = pd.concat([XGBp2,test_IDs.rename('Id')],axis=1)
# XGBp2 = XGBp2.rename(columns = {0:'SalePrice','Id':'Id'})
# XGBp2 = XGBp2[['Id','SalePrice']]

# XGBp2.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/XGBp2_09182019.csv')
# XGBp2.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/XGBp2_09182019.csv')



###########Tune gamma
# XGB_param_test3 = {'gamma':np.arange(0.0,1.0,0.1)}
# XGBgsearch3 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=300,max_depth=3,min_child_weight=2,subsample=0.6,colsample_bytree=0.1,
#                                                  objective='reg:squarederror',scale_pos_weight=1,seed=13),
#                           param_grid=XGB_param_test3,scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch3.fit(train_X1,train_y1)
# XGBresults3 = pd.DataFrame(XGBgsearch3.cv_results_)
# XGBresults3.loc[:,('params','mean_test_score','std_test_score')] #All these values are the same. In other words, Gamma doesn't really matter.
#XGBgsearch3.best_params_,XGBgsearch3.best_score_


##########Tune subsample and colsample_bytree
# XGB_param_test4 = {'subsample':np.arange(0.3,1.0,0.1),'colsample_bytree':np.arange(0.1,1.0,0.1)}
# XGBgsearch4 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=300,max_depth=3,min_child_weight=2,gamma=0,
#                                                  objective='reg:squarederror',scale_pos_weight=1,seed=13),
#                           param_grid=XGB_param_test4,scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch4.fit(train_X1,train_y1)

# XGBresults4 = pd.DataFrame(XGBgsearch4.cv_results_)
# XGBresults4.loc[:,('params','mean_test_score','std_test_score')]
# XGBgsearch4.best_params_,XGBgsearch4.best_score_

# xgb_recalibrate = XGBRegressor(learning_rate=0.1,n_estimators=1000,max_depth=3,min_child_weight=2,gamma=0,
#                               subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13)
# rcParams['figure.figsize'] = 12, 4 #width x height in inches
# modelfitXGB(xgb_recalibrate,train_X1,train_y1) #the recalibration shows n_estimators = 100 is optimal.


#Check on how this tuned model does on Kaggle. <- It got worse! It may be overfitting the training data
# XGB4 = XGBRegressor(learning_rate=0.1,n_estimators=100,max_depth=3,min_child_weight=2,gamma=0,
#                               subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13)
# XGBmodel4 = XGB4.fit(train_X1,train_y1)
# XGBp4 = XGBmodel4.predict(test_X1)
# XGBp4 = pd.Series(XGBp4)
# XGBp4 = pd.concat([XGBp4,test_IDs.rename('Id')],axis=1)
# XGBp4 = XGBp4.rename(columns = {0:'SalePrice','Id':'Id'})
# XGBp4 = XGBp4[['Id','SalePrice']]

# XGBp4.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/XGBp4_09182019.csv')
# XGBp4.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/XGBp4_09182019.csv')


##########Tune 1st regularization parameter reg_alpha
# XGB_param_test5 = {'reg_alpha':[0.0001,0.001,0.01,0.1,1,10]}
# XGBgsearch5 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=100,max_depth=3,min_child_weight=2,gamma=0,
#                                                  subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13),
#                           param_grid=XGB_param_test5,scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch5.fit(train_X1,train_y1)

# XGBresults5 = pd.DataFrame(XGBgsearch5.cv_results_)
# XGBresults5.loc[:,('params','mean_test_score','std_test_score')]
# XGBgsearch5.best_params_,XGBgsearch5.best_score_


##########Tune 2nd regularization parameter reg_lambda
# XGB_param_test6 = {'reg_lambda':[0.0001,0.001,0.01,0.1,1,10]}
# XGBgsearch6 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=100,max_depth=3,min_child_weight=2,gamma=0,reg_alpha=0.001,
#                                                  subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13),
#                           param_grid=XGB_param_test6,scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch6.fit(train_X1,train_y1)

# XGBresults6 = pd.DataFrame(XGBgsearch6.cv_results_)
# XGBresults6.loc[:,('params','mean_test_score','std_test_score')]
#XGBgsearch6.best_params_,XGBgsearch6.best_score_




##########Fine tune reg_lambda
# XGB_param_test7 = {'reg_lambda':np.arange(0.3,2.0,0.1)}
# XGBgsearch7 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=100,max_depth=3,min_child_weight=2,gamma=0,reg_alpha=0.001,
#                                                  subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13),
#                           param_grid=XGB_param_test7,scoring='neg_mean_squared_log_error',n_jobs=-1,iid=False,cv=5)
# XGBgsearch7.fit(train_X1,train_y1)

# XGBresults7 = pd.DataFrame(XGBgsearch7.cv_results_)
# XGBresults7.loc[:,('params','mean_test_score','std_test_score')]
#XGBgsearch7.best_params_,XGBgsearch7.best_score_


# xgb_recalibrate2 = XGBRegressor(learning_rate=0.1,n_estimators=1000,max_depth=3,min_child_weight=2,gamma=0,reg_alpha=0.001,reg_lambda=1,
#                               subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13)
# rcParams['figure.figsize'] = 12, 4 #width x height in inches
# modelfitXGB(xgb_recalibrate2,train_X1,train_y1) #the recalibration shows n_estimators = 213 is optimal.

# Check how it performs after the model is mostly tuned.
# XGB7 = XGBRegressor(learning_rate=0.1,n_estimators=213,max_depth=3,min_child_weight=2,gamma=0,reg_alpha=0.001,reg_lambda=1,
#                               subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13)
# XGBmodel7 = XGB7.fit(train_X1,train_y1)
# XGBp7 = XGBmodel7.predict(test_X1)
# XGBp7 = pd.Series(XGBp7)
# XGBp7 = pd.concat([XGBp7,test_IDs.rename('Id')],axis=1)
# XGBp7 = XGBp7.rename(columns = {0:'SalePrice','Id':'Id'})
# XGBp7 = XGBp7[['Id','SalePrice']]

# XGBp7.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/XGBp7_09182019.csv')
# XGBp7.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/XGBp7_09182019.csv')



##########Reduce the learning_rate to 0.01 to see if the new n_estimators improves the model.
# xgb_recalibrate3 = XGBRegressor(learning_rate=0.01,n_estimators=5000,max_depth=3,min_child_weight=2,gamma=0,reg_alpha=0.001,reg_lambda=1,
#                                subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13)
# rcParams['figure.figsize'] = 12, 4 #width x height in inches
# modelfitXGB(xgb_recalibrate3,train_X1,train_y1) #the recalibration shows n_estimators = 2438

# Final tuned model 
# XGBF = XGBRegressor(learning_rate=0.01,n_estimators=2438,max_depth=3,min_child_weight=2,gamma=0,reg_alpha=0.001,reg_lambda=1,
#                               subsample=0.9,colsample_bytree=0.8,objective='reg:squarederror',scale_pos_weight=1,seed=13)
# XGBmodelF = XGBF.fit(train_X1,train_y1)
# XGBpF = XGBmodelF.predict(test_X1)
# XGBpF = pd.Series(XGBpF)
# XGBpF = pd.concat([XGBpF,test_IDs.rename('Id')],axis=1)
# XGBpF = XGBpF.rename(columns = {0:'SalePrice','Id':'Id'})
# XGBpF = XGBpF[['Id','SalePrice']]

# XGBpF.to_csv(r'/Users/armenta/Kaggle/Housing Prices/Predictions/XGBpF_09182019.csv')
# XGBpF.to_csv(r'/Users/armenta/Desktop/Data Science/Kaggle/Getting Started Projects/Housing Data/PREDICTIONS/XGBpF_09182019.csv')




In [16]:
############ STANDARDIZED: XGBOOST (12/22/19) ###########
#Ignore FutureWarning messages from here on and set the default figure size for this cell's plots.
warnings.simplefilter('ignore', category=FutureWarning)
rcParams['figure.figsize'] = (12, 4) #(width, height) in inches

########### PART I ############

############ Round I - n_est #############
# xgb1 = XGBRegressor(learning_rate=0.1, max_depth=5, min_child_weight=4, subsample=0.6, colsample_bytree=0.0555, 
#                          objective='reg:squarederror',seed=7)
# n_est_range = range(100,355,5)
# model1, model_results1, best_n_est, best_score1 = XGBRModelTune(xgb1,'n_estimators',n_est_range)

############ Round II - max_depth ###########
# xgb2 = XGBRegressor(learning_rate=0.1, n_estimators=330, min_child_weight=4, subsample=0.6, colsample_bytree=0.0555, 
#                     objective='reg:squarederror',seed=7)
# max_d_range = range(1,11,1)
# model2, model_results2, best_n_est, best_score2 = XGBRModelTune(xgb2, 'max_depth', max_d_range)

############ Round III - min_child_weight ############
# xgb3 = XGBRegressor(learning_rate=0.1, n_estimators=330, max_depth=4, subsample=0.6, colsample_bytree=0.0555, 
#                      objective='reg:squarederror',seed=7)
# mcw_range = range(1,11,1)
# model3, model_results3, best_mcw, best_score3 = XGBRModelTune(xgb3, 'min_child_weight', mcw_range)

############ Round IV - gamma ##########
# xgb4 = XGBRegressor(learning_rate=0.1, n_estimators=330, max_depth=4, subsample=0.6, colsample_bytree=0.0555, 
#                       objective='reg:squarederror',seed=7)
# gamma_range = range(0,21,1)
# model4, model_results4, best_gamma, best_score4 = XGBRModelTune(xgb4, 'gamma', gamma_range)

############ Round V - subsample ############
# xgb5 = XGBRegressor(learning_rate=0.1, n_estimators=330, max_depth=4, colsample_bytree=0.0555, 
#                     objective='reg:squarederror',seed=7)
# ss_range = np.arange(0.1,1.01,0.01)
# model5, model_results5, best_ss, best_score5 = XGBRModelTune(xgb5, 'subsample', ss_range)

############ Round VI - colsample_bytree ############
# xgb6 = XGBRegressor(learning_rate=0.1, n_estimators=330, max_depth=4, subsample=1, 
#                     objective='reg:squarederror',seed=7)
# colsamp_range = np.arange(0.01,1.0,0.01)
# model6, model_results6, best_colsamp, best_score6 = XGBRModelTune(xgb6, 'colsample_bytree', colsamp_range)

############ Round VII - lambda ###########
# xgb7 = XGBRegressor(learning_rate=0.1, n_estimators=330, max_depth=4, subsample=1,colsample_bytree=0.46, 
#                      objective='reg:squarederror',seed=7)
# lamb_range = range(0,16,1)
# model7, model_results7, best_lambda, best_score7 = XGBRModelTune(xgb7, 'reg_lambda', lamb_range)

############ Round VIII - alpha ############
# xgb8 = XGBRegressor(learning_rate=0.1, n_estimators=330, max_depth=4, subsample=1,colsample_bytree=0.46, 
#                      objective='reg:squarederror',seed=7)
# alpha_range = range(0,16,1)
# model8, model_results8, best_alpha, best_score8 = XGBRModelTune(xgb8, 'reg_alpha', alpha_range)

############ Round IX - learning_rate #############
# xgb9 = XGBRegressor(n_estimators=1000, max_depth=4, subsample=1, colsample_bytree=0.46, objective='reg:squarederror',
#                     seed=7,reg_alpha=1)
# exp_LR = -3*np.random.rand(100)
# learning_rate_range = 10**exp_LR
# model9, model_results9, best_LR, best_score9 = XGBRModelTune(xgb9, 'learning_rate', learning_rate_range, 
#                                                              Randomized=True, n_iter=60, modelfit=False)

############ Save Tuned Model ############
# lr9 = 0.024852786825405707
# xgb_tuned = XGBRegressor(learning_rate=lr9, n_estimators=1000, max_depth=4, subsample=0.66, colsample_bytree=0.46,
#                         objective='reg:squarederror',seed=7, reg_alpha=1)
# modelfitXGB(xgb_tuned,strain_X1,train_y1)
#train_err1, train_var1 = TrainTestErrors(xgb_tuned,'xgbtuned_DP1_strain_X1_01192020.csv')
#train_err1



########### PART II ##########

########### Round I - min_child_weight #########
# lr9 = 0.024852786825405707
# xgb1 = XGBRegressor(learning_rate=lr9, n_estimators=1000, max_depth=4, subsample=0.66, colsample_bytree=0.46, 
#                    objective='reg:squarederror',seed=7,reg_alpha=1, reg_lambda=1)
# mcw_range = range(1,11,1)
# model1, model_results1, best_mcw, best_score1 = XGBRModelTune(xgb1, 'min_child_weight', mcw_range)

############ TEST VALUES ##########
# xgbI_MCW3 = XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=3,max_depth=4,subsample=0.66,
#                          colsample_bytree=0.46,reg_alpha=1, reg_lambda=1, objective='reg:squarederror',seed=7)
# train_err1, train_exp_var1 = TrainTestErrors(xgbI_MCW3,'xgbI_MCW3_02102020.csv')
# train_err1
# xgbI_MCW5 = XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=5,max_depth=4,subsample=0.66,
#                           colsample_bytree=0.46,reg_alpha=1, reg_lambda=1, objective='reg:squarederror',seed=7)
# train_err2, train_exp_var2 = TrainTestErrors(xgbI_MCW5, 'xgbI_MCW5_02102020.csv')
# train_err2
# model_results1

########### Round II - reg_lambda #########
# xgb2 = XGBRegressor(learning_rate=lr9, n_estimators=1000, max_depth=4, subsample=0.66, colsample_bytree=0.46, 
#                     min_child_weight=1, objective='reg:squarederror',seed=7,reg_alpha=1)
# lamb_range = range(0,21,1)
# model2, model_results2, best_lambda, best_score2 = XGBRModelTune(xgb2, 'reg_lambda', lamb_range)

########### TEST VALUES ############
# xgbI_RL3 = XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=1,max_depth=4,subsample=0.66,
#                        colsample_bytree=0.46,reg_alpha=1, reg_lambda=3, objective='reg:squarederror',seed=7)
# train_err3, train_exp_var3 = TrainTestErrors(xgbI_RL3,'xgbI_RL3_02102020.csv')
# train_err3
# xgbI_RL7 = XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=1,max_depth=4,subsample=0.66,
#                        colsample_bytree=0.46,reg_alpha=1, reg_lambda=7, objective='reg:squarederror',seed=7)
# train_err4, train_exp_var4 = TrainTestErrors(xgbI_RL7,'xgbI_RL7_02102020.csv')
# train_err4
# xgbI_RL11 = XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=1,max_depth=4,subsample=0.66,
#                         colsample_bytree=0.46,reg_alpha=1, reg_lambda=11, objective='reg:squarederror',seed=7)
# train_err5, train_exp_var5 = TrainTestErrors(xgbI_RL11,'xgbI_RL11_02102020.csv')
# train_err5
# xgbI_RL15 = XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=1,max_depth=4,subsample=0.66,
#                         colsample_bytree=0.46,reg_alpha=1, reg_lambda=15, objective='reg:squarederror',seed=7)
# train_err6, train_exp_var6 = TrainTestErrors(xgbI_RL15,'xgbI_RL15_02102020.csv')
# train_err6
# model_results2

########## Round III - reg_alpha ############
# lr9=0.024852786825405707
# xgb3 = XGBRegressor(learning_rate=lr9, n_estimators=1000, max_depth=4, subsample=0.66, colsample_bytree=0.46, 
#                     min_child_weight=1, objective='reg:squarederror',seed=7,reg_lambda=7)
# alpha_range = range(0,21,1)
# model3, model_results3, best_alpha, best_score3 = XGBRModelTune(xgb3, 'reg_alpha', alpha_range)

########## TEST VALUES ###########
# xgbI_RA17 = XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=1,max_depth=4,subsample=0.66,
#                           colsample_bytree=0.46,reg_alpha=17, reg_lambda=7, objective='reg:squarederror',seed=7)
# train_err7, train_exp_var7 = TrainTestErrors(xgbI_RA17, 'xgbI_RA17_02172019.csv')
# train_err7
# xgbI_RA11 =XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=1,max_depth=4,subsample=0.66,
#                           colsample_bytree=0.46,reg_alpha=11, reg_lambda=7, objective='reg:squarederror',seed=7)
# train_err8, train_exp_var8 = TrainTestErrors(xgbI_RA11, 'xgbI_RA11_02172019.csv')
# train_err8
# xgbI_RA5 = XGBRegressor(learning_rate=lr9,n_estimators=1000,min_child_weight=1,max_depth=4,subsample=0.66,
#                         colsample_bytree=0.46,reg_alpha=5, reg_lambda=7, objective='reg:squarederror',seed=7)
# train_err9, train_exp_var9 = TrainTestErrors(xgbI_RA5, 'xgbI_RA5_02172019.csv')
# train_err9
#model_results3

########## Round IV - max_depth ###########
# xgb4 = XGBRegressor(learning_rate=lr9, n_estimators=1000, subsample=0.66, colsample_bytree=0.46, 
#                      min_child_weight=1, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# md_range = range(1,11,1)
# model4, model_results4, best_md, best_score4 = XGBRModelTune(xgb4, 'max_depth', md_range)

########## TEST VALUES ##########
# xgbI_MD3 = XGBRegressor(learning_rate=lr9, n_estimators=1000, subsample=0.66, colsample_bytree=0.46,max_depth=3, 
#                         min_child_weight=1, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err10, train_exp_var10 = TrainTestErrors(xgbI_MD3, 'xgbI_MD3_02172019.csv')
# train_err10
# xgbI_MD5 = XGBRegressor(learning_rate=lr9, n_estimators=1000, subsample=0.66, colsample_bytree=0.46,max_depth=5, 
#                         min_child_weight=1, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err11, train_exp_var11 = TrainTestErrors(xgbI_MD5, 'xgbI_MD5_02172019.csv')
# train_err11
# model_results4

########## Round V - subsample ##########
# xgb5 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.46, min_child_weight=1, 
#                     max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# sub_range = np.arange(0.35,0.51,0.01)
# model5, model_results5, best_sub, best_score5 = XGBRModelTune(xgb5, 'subsample', sub_range)

########## TEST VALUES ###########
# lr9=0.024852786825405707
# xgbI_S82 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.46, min_child_weight=1, subsample=0.82,
#                      max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err12, train_exp_var12 = TrainTestErrors(xgbI_S82, 'xgbI_S82_02172019.csv')
# train_err12
# xgbI_S70 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.46, min_child_weight=1, subsample=0.7,
#                         max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err13, train_exp_var13 = TrainTestErrors(xgbI_S70, 'xgbI_S70_02172019.csv')
# train_err13
# xgbI_S60 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.46, min_child_weight=1, subsample=0.6,
#                         max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err14, train_exp_var14 = TrainTestErrors(xgbI_S60, 'xgbI_S60_02172019.csv')
# train_err14
# xgbI_S50 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.46, min_child_weight=1, subsample=0.5,
#                         max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err15, train_exp_var15 = TrainTestErrors(xgbI_S50, 'xgbI_S50_02172019.csv')
# train_err15
# xgbI_S40 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.46, min_child_weight=1, subsample=0.4,
#                          max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err16, train_exp_var16 = TrainTestErrors(xgbI_S40, 'xgbI_S40_02242020.csv')
# train_err16
# xgbI_S45 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.46, min_child_weight=1, subsample=0.45,
#                          max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err17, train_exp_var17 = TrainTestErrors(xgbI_S45, 'xgbI_S45_02242020.csv')
# train_err17
# xgbI_S49 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.46, min_child_weight=1, subsample=0.49,
#                          max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err18, train_exp_var18 = TrainTestErrors(xgbI_S49, 'xgbI_S49_02242020.csv')
# train_err18
#model_results5

########## Round VI - colsample_bytree ###########
# xgb6 = XGBRegressor(learning_rate=lr9, n_estimators=1000, subsample=0.66, min_child_weight=1, 
#                     max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# cs_bt_range = np.arange(0.05,1.05,0.05)
# model6, model_results6, best_colsample, best_score6 = XGBRModelTune(xgb6, 'colsample_bytree', cs_bt_range)

########## TEST VALUES ##########
# xgbI_CS25 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.25, min_child_weight=1, subsample=0.66,
#                          max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err19, train_exp_var19 = TrainTestErrors(xgbI_CS25, 'xgbI_CS25_02242020.csv')
# train_err19
# xgbI_CS30 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.30, min_child_weight=1, subsample=0.66,
#                          max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err20, train_exp_var20 = TrainTestErrors(xgbI_CS30, 'xgbI_CS30_02242020.csv')
# train_err20
# xgbI_CS20 = XGBRegressor(learning_rate=lr9, n_estimators=1000, colsample_bytree=0.20, min_child_weight=1, subsample=0.66,
#                          max_depth=4, objective='reg:squarederror',seed=7,reg_lambda=7,reg_alpha=5)
# train_err21, train_exp_var21 = TrainTestErrors(xgbI_CS20, 'xgbI_CS20_02242020.csv')
# train_err21
#model_results6





0.004472215000110959

In [10]:
############# FINAL (3/9/20) ##############

#We'll save the feature importances of the model to a CSV file so we can compare them with data from 
#our other pipeline in Tableau.
#According to the Excel file, our best model was: 
#xgbI_RL7 (learning_rate ~ 0.0248527, n_est = 1000, max_depth=4, min_child_weight = 1, subsample=0.66, colsample_bytree=0.46, objective='reg:squarederror', seed=7,  reg_alpha=1, reg_lambda=7) *Standardized data

#Rebuild the selected configuration; the hyperparameters below are the xgbI_RL7 settings (reg_lambda=7).
lr9 = 0.024852786825405707
best_model = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=lr9,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=1,
    subsample=0.66,
    colsample_bytree=0.46,
    reg_alpha=1,
    reg_lambda=7,
    seed=7,
)

#Train on strain_X1 / train_y1 (presumably the standardized training features and target from an earlier cell).
best_model.fit(strain_X1, train_y1)

#get_fscore() returns a {feature: score} dict from the fitted booster; rank it from highest to lowest score.
feat_imp = pd.Series(best_model.get_booster().get_fscore())
feat_imp = feat_imp.sort_values(ascending=False)

#Write the ranked importances to disk.
feat_imp.to_csv('/Users/armenta/Kaggle/Housing Prices/FI_pipeline1.csv')







In [2]:
########## NOT NEEDED CODE (FOR NOW) ############
# #Check the amount of missing values for each column. 
# #np.sum(train_X.isnull())
#
#
#
# #LOOKING AT THE DISTRIBUTION OF DIFFERENT VARIABLES (ENDED 5/14/19)
# #Plot a histogram of some of the data.
# #CATEGORICAL VARIABLES
# curr_col = train_X.Neighborhood
# curr_col.value_counts().plot(kind='bar')
# curr_col.value_counts()
#
# #Allows us to look at the unique values and match up with the sheet
# curr_col.unique()
# curr_col.describe()
#
# curr_var = train_house.loc[train_house.Neighborhood == neighborhood[24],'SalePrice']
# curr_var.hist(bins=25)
# curr_var.describe()
#
# #LOOKING AT SPECIFIC COLUMNS TO SEE HOW WELL THEY AFFECT THE SALES PRICE 
#
# var1 = train_house.loc[train_house.Alley == 'Pave', :]
# var2 = train_house.loc[train_house.Alley == 'Grvl', :]
# var3 = train_house.loc[train_house.SaleType == 'COD', :]
# var4 = train_house.loc[train_house.SaleType == 'ConLD', :]
# var5 = train_house.loc[train_house.SaleType == 'ConLw', :]
# var6 = train_house.loc[train_house.SaleType == 'ConLI', :]
# var7 = train_house.loc[train_house.SaleType == 'CWD', :]
# var8 = train_house.loc[train_house.SaleType == 'Oth', :]
# var9 = train_house.loc[train_house.SaleType == 'Con', :]
#
#
# var1.SalePrice.hist(bins=25)
# var1.SalePrice.describe()
#
# var2.SalePrice.hist(bins=25)
# var2.SalePrice.describe()
#
# var3.SalePrice.hist(bins=25)
# var3.SalePrice.describe()
#
# var4.SalePrice.hist(bins=25)
# var4.SalePrice.describe()
#
# var5.SalePrice.hist(bins=25)
# var5.SalePrice.describe()
#
# var6.SalePrice.hist(bins=25)
# var6.SalePrice.describe()
#
# var7.SalePrice.hist(bins=25)
# var7.SalePrice.describe()
#
# var8.SalePrice.hist(bins=25)
# var8.SalePrice.describe()
#
# var9.SalePrice.hist(bins=25)
# var9.SalePrice.describe()
#
# #Plot a histogram of some of the data 
# #NUMERICAL VARIABLES 
# num_col = train_X.loc[:,'MiscVal']
# num_col.hist(bins = 50)
#
# num_col.describe()
# num_col.value_counts()



# #Looking at the house with the largest lot area to see if it makes sense.
# #train_house.loc[train_house.LotArea.idxmax(),:]

# #Summary
# # The picture I am getting of this house is:
# # - A house not surrounded by any other houses.
# # - It has a total of ~4000 sq ft of house, and ~210,000 sq ft of land surrounding it.
# # - A bit of an old-school (1965), brick-faced, hip-roof style house. Kind of like a haunted-looking house with a brick face.
# # - All of this, factored in with the quality of the house, explains why it is not the most expensive house even though it 
# # is the largest. 
# # In conclusion, this house can stay in this dataset.

# train_house.SalePrice.hist(bins = 25)
# train_house.SalePrice.describe()

# LQ = train_house.loc[train_house.LowQualFinSF > 0 ,]
# LQ.SalePrice.hist(bins=25)

# InsidePorches = train_house.loc[train_house.EnclosedPorch > 0, ]
# InsidePorches.EnclosedPorch.hist(bins=25)
# InsidePorches.EnclosedPorch.describe()

# InsidePorches.SalePrice.hist(bins=25)

# SznP = train_house.loc[train_house.loc[:,'3SsnPorch']>0,]
# SznP.SalePrice.hist(bins=25)
#
# screenporch = train_house.loc[train_house.ScreenPorch >0,]
# screenporch.SalePrice.hist(bins=25)

# pool_houses = train_house.loc[train_house.PoolArea >0,]
# pool_houses.SalePrice.hist(bins=25)

# misc_val = train_house.loc[train_house.MiscVal > 0,]
# misc_val.SalePrice.hist(bins=25)


#For BsmtExposure variable
# inds2 = train_house.BsmtExposure.loc[pd.isnull(train_house.BsmtExposure)]
# inds2 = inds2.index


# for x in inds2:
#      if(sum(inds == x) == 0):
#         index = x
        
# index

# #For BsmtFinType2 variable
# inds2 = train_house.BsmtFinType2.loc[pd.isnull(train_house.BsmtFinType2)]
# inds2 = inds2.index


# for x in inds2:
#      if(sum(inds == x) == 0):
#         index = x
        
# index


# #Checking out the condition and quality of houses that have terrible functionality scores to see what the missing 
# #functional values could possibly be.

# combined_df.loc[combined_df.Functional == 'Maj2',:]

#SPECIAL CELL *NEED TO FIND A WAY TO INCORPORATE THIS CELL WITH THE ONE BELOW IT.
#
#Gotta make some corrections for the following columns: BsmtExposure, and BsmtFinType2
#train_X.loc[948,'BsmtExposure'] = 'No'
#
#Uncomment the lines below, run them exactly ONCE, then comment them out again!
#train_X = train_X.drop(332) #we are dropping this record because we do not know what to fill in for BsmtFinType2 for index = 332
#must do the same thing for our Y dataset
#train_y = train_y.drop(332)
# combined_df = combined_df.drop(combined_df.index[[2150,2119,2187,1554,2215,2472,2575,2488,1914,1944,2249,2903]])


# #COMBINING THE TRAINING AND TESTING DATASETS TO CREATE A SUPER DATA SET.
# combined_df = pd.concat([train_X,test_X])
# combined_df = combined_df.reset_index(drop = True) #drop=True discards the old index; otherwise it would be added as a new column
# #combined_df.isnull().sum()





# ########We need to handle all the missing values in the following columns:
# #THESE WERE HANDLED FROM THE TRAIN_X DATASET.
# #LotFrontage = 484
# #Alley = 2709
# #MasVnrType = 24 
# #MasVnrArea = 23
# #BsmtQual = 76
# #BsmtCond = 77
# #BsmtExposure = 76
# #BsmtFinType1 = 74
# #BsmtFinType2 = 74
# #FireplaceQu = 1412
# #GarageType = 156
# #GarageYrBlt = 157
# #GarageFinish = 157
# #GarageQual = 157
# #GarageCond = 157
# #PoolQC = 2896
# #Fence = 2337
# #MiscFeature = 2802

# #THESE ARE NEW FEATURES WITH MISSING VALUES ADDED FROM TEST_X DATASET (EXCEPT ELECTRICAL).
# #THESE WILL ALL BE REMOVED (12 ROWS IN TOTAL)
# #MSZoning = 4, Utilities = 2, Exterior1st = 1, Exterior2nd = 1, BsmtFinSF1 = 1, BsmtFinSF2 = 1, BsmtUnfSF = 1, TotalBsmtSF = 1, 
# #BsmtFullBath = 2, BsmtHalfBath = 2, KitchenQual = 1, Functional = 2, GarageCars = 1, GarageArea = 1, SaleType = 1, Electrical = 1

# #Removing some indices that I discovered from preliminary research along with some bad data from the columns listed above
# #332 -> BsmtFinType2 is NaN while the other basement variables are okay, so I didn't know what to replace this with.
# #948 -> BsmtExposure was NaN while other basement variables are okay.
# #1379 -> Electrical is NaN for this row; it is not worth keeping, so it is removed.
# #2151 - 2904 -> Related to all of the missing values listed above.
# combined_df = combined_df.drop(combined_df.index[[332,948,1379,2151,2120,2188,1555,2216,2473,2576,2489,1915,1945,2250,2904]])
# combined_df = combined_df.reset_index(drop = True)



# #Uncomment this once we've handled the missing values that need to be removed. Do this for the
# #combined dataset, not just the training dataset.
# #First we will do the transformations on practice_train_X to make sure it does what we really want to do.
# # train_X1 = train_X1.fillna({'LotFrontage' : 0,'Alley' : 'No Alley','MasVnrType': 'NA','MasVnrArea':0,
# #                                             'BsmtQual' : 'None','BsmtCond' : 'None','BsmtExposure' : 'None', 
# #                                             'BsmtFinType1' : 'None','BsmtFinType2' : 'None',
# #                                             'FireplaceQu' : 'None','GarageType' : 'None','GarageYrBlt' : 'None',
# #                                             'GarageFinish' : 'None','GarageQual' : 'None', 'GarageCond' : 'None',
# #                                             'PoolQC' : 'None', 'Fence' : 'None', 'MiscFeature' : 'None'})



# #LOOKING AT SUSPICIOUS MISSING VALUES
# combined_df.loc[combined_df.BsmtCond.isnull()==True,]

# #Allows us to look at the unique values and match up with the sheet
# curr_col.unique()
# curr_col.describe()


#Leftover code from checking different attributes of combined_df
#combined_df.loc[combined_df.SaleType.isnull() == True,:]
#Used this snippet to match the row numbers up with their listed index
#combined_df.index[2903]
#combined_df.loc[combined_df.Id == 2905,:]
# test_X1 = test_X1.fillna({'LotFrontage' : 0,'Alley' : 'No Alley','MasVnrType': 'NA','MasVnrArea':0,
#                                             'BsmtQual' : 'None','BsmtCond' : 'None','BsmtExposure' : 'None', 
#                                             'BsmtFinType1' : 'None','BsmtFinType2' : 'None','Electrical' : 'NA',
#                                             'FireplaceQu' : 'None','GarageType' : 'None','GarageYrBlt' : 'None',
#                                             'GarageFinish' : 'None','GarageQual' : 'None', 'GarageCond' : 'None',
#                                             'PoolQC' : 'None', 'Fence' : 'None', 'MiscFeature' : 'None'})



#The bad indices were caused by GarageYrBlt: I had replaced its missing values with the string 'None'
#instead of a number (like 0).
#row1 = train_X1.loc[0,:]
#len(row1[row1=='None'])
# bad_indices = []
# for i in range(train_X1.shape[0]):
#     curr_row = train_X1.loc[i,:]
#     if(len(curr_row[curr_row=='None']) != 0):
#         bad_indices.append(i)
# train_X1.loc[bad_indices,:]

#pd.unique(train_X1.LandSlope)

# #Used to get the current directory
# import os 
# os.getcwd()


#Uncomment this once we've handled the missing values that need to be removed. Do this for the
#combined dataset, not just the training dataset.
#First we will apply the transformations to practice_train_X to make sure they do what we really want them to do.
# train_X = train_X.fillna({'LotFrontage' : 0,'Alley' : 'No Alley','MasVnrType': 'NA','MasVnrArea':0,
#                                             'BsmtQual' : 'None','BsmtCond' : 'None','BsmtExposure' : 'None', 
#                                             'BsmtFinType1' : 'None','BsmtFinType2' : 'None',
#                                             'FireplaceQu' : 'None','GarageType' : 'None','GarageYrBlt' : 0,
#                                             'GarageFinish' : 'None','GarageQual' : 'None', 'GarageCond' : 'None',
#                                             'PoolQC' : 'None', 'Fence' : 'None', 'MiscFeature' : 'None'})

# test_X = test_X.fillna({'MSZoning':'NA','LotFrontage' : 0,'Utilities':'NA','Alley' : 'No Alley','MasVnrType': 'NA',
#                         'MasVnrArea':0,'BsmtQual' : 'None','BsmtCond' : 'None','BsmtExposure' : 'None', 'Exterior1st':'NA',
#                                             'Exterior2nd':'NA','BsmtFinType1' : 'None','BsmtFinSF1':0,'BsmtFinType2':'None',
#                                             'BsmtFinSF2':0,'BsmtUnfSF':0,'TotalBsmtSF':0,'BsmtFullBath':0,'BsmtHalfBath':0,
#                         'KitchenQual':'NA','Functional':'NA','FireplaceQu' : 'None','GarageType' : 'None','GarageYrBlt' : 0,
#                                             'GarageFinish' : 'None','GarageCars':0,'GarageArea':0,'GarageQual' : 'None', 'GarageCond' : 'None',
#                                             'PoolQC' : 'None', 'Fence' : 'None', 'MiscFeature' : 'None','SaleType':'NA'})



#We need to make sure that train_X1 and test_X1 have the same number of columns and that the columns are lined up the same way.
#The two differences below show the columns that are in train_X1 but not in test_X1, and vice versa.
#missing_col_from_test = train_X1.columns.difference(test_X1.columns)
#missing_col_from_train = test_X1.columns.difference(train_X1.columns)