This notebook is based on the lessons on machine learning that Kaggle curates at https://www.kaggle.com/learn/overview.

This first section starts out with a lot of data prep and discovery.

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer
from xgboost import XGBRegressor

# Score a size-limited decision tree against held-out data
def get_decision_depth_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation mean absolute error of a capped decision tree.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves (seeded with
    random_state=0) is fit on the training predictors/target and then scored
    on the validation predictors/target.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

# defined function to get mean absolute error for random forest
def get_forest_mae(X_train, X_test, y_train, y_test, random_state=0):
    """Return the mean absolute error of a random forest on held-out data.

    Parameters
    ----------
    X_train, y_train : training predictors and target used to fit the forest.
    X_test, y_test   : held-out predictors and target used for scoring.
    random_state     : seed handed to RandomForestRegressor. Defaults to 0 so
                       that repeated runs (and the different missing-value
                       strategies compared with this helper) are scored by the
                       same forest rather than by random variation. Pass None
                       to get an unseeded forest.
    """
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

# Cross-validated alternative for measuring random forest MAE
def get_forest_mae2(X, y):
    """Return the mean cross-validated MAE of a 50-estimator random forest."""
    forest = RandomForestRegressor(n_estimators=50)
    fold_scores = cross_val_score(forest, X, y, scoring='neg_mean_absolute_error')
    # The scorer reports negated MAE (sklearn convention: higher is better),
    # so multiply by -1 to get a positive error back.
    return -1 * fold_scores.mean()

# Names of the int64 / float64 columns
def numeric_cols(data_frame):
    """Return the column names whose dtype is int64 or float64, in frame order."""
    wanted_dtypes = ['int64', 'float64']
    selected = []
    for cname in data_frame.columns:
        if data_frame[cname].dtype in wanted_dtypes:
            selected.append(cname)
    return selected

# Names of object-dtype columns with fewer than 10 distinct values
def low_cardinality_cols(data_frame):
    """Return the object-dtype column names having fewer than 10 unique values."""
    selected = []
    for cname in data_frame.columns:
        column = data_frame[cname]
        if column.nunique() < 10 and column.dtype == "object":
            selected.append(cname)
    return selected

# Names of columns that contain at least one null
def cols_with_missing(data_frame):
    """Return the names of columns holding at least one null value, in frame order."""
    incomplete = []
    for cname in data_frame.columns:
        if data_frame[cname].isnull().any():
            incomplete.append(cname)
    return incomplete

# Load the core training and test sets
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')

# Keep only the houses that have a recorded target (SalePrice)
train_data = train_data.dropna(axis=0, subset=['SalePrice'])

# get statistical properties of the data
#print(train_data.describe())

# view first 5 rows of the data
#train_data.head(5)

# view the columns in the data 
#print(train_data.columns)

# Check size of data (1460 entries across 81 columns)
#train_data.shape

Some quick partial dependence plots, showing how the predicted sale price changes with each selected feature.

In [None]:
# Features whose effect on the predicted SalePrice we want to plot
cols_to_use = ['FullBath','OverallQual','LotArea','1stFlrSF','2ndFlrSF','YearBuilt']
y = train_data.SalePrice

# Fill gaps in the selected predictors before fitting
my_imputer = Imputer()
X = my_imputer.fit_transform(train_data[cols_to_use])

my_model = GradientBoostingRegressor()
my_model.fit(X, y)

# One plot per entry of cols_to_use, addressed by column position in X
my_plots = plot_partial_dependence(
    my_model,
    features=list(range(len(cols_to_use))),  # positions of the columns to plot
    X=X,                                     # predictors the model was fit on
    feature_names=cols_to_use,               # labels on graphs
    grid_resolution=10,                      # number of values to plot on x axis
)

Split data into training data and validation data for the decision tree model.

In [None]:
# Predictors the decision tree will be fit on
predictor_list = [
    'FullBath', 'OverallQual', 'LotArea',
    '1stFlrSF', '2ndFlrSF', 'YearBuilt',
]

# Target (y) and predictors (X)
y = train_data.SalePrice
X = train_data[predictor_list]

# Hold out part of the rows for validation, for both predictors and target.
# The split is random; fixing random_state gives the same split on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

Compare the MAE (mean absolute error) for different values of `max_leaf_nodes`. This setting caps how many leaves the tree may grow, so comparing the validation error across several values helps find the sweet spot between underfitting (too few leaves) and overfitting (too many leaves).

In [None]:
# Try several caps on the number of leaves and keep the one with the lowest
# validation MAE (mean absolute error); too few leaves underfits, too many overfits.
candidate_leaf_counts = [5, 50, 500, 5000]
best_mae = float("inf")  # anything finite beats this
for max_leaf_nodes in candidate_leaf_counts:
    current_mae = get_decision_depth_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: {}  \t\t Mean Absolute Error:  {}".format(max_leaf_nodes, current_mae))
    if current_mae < best_mae:
        best_max_leaf_nodes, best_mae = max_leaf_nodes, current_mae
print("------------------------------------------------")
print("Best max leaf nodes: {}".format(best_max_leaf_nodes))

Define the decision tree model and review its MAE...

In [None]:
# Build the tree with the best leaf cap found above and fit it on the training split
decision_tree_model = DecisionTreeRegressor(
    max_leaf_nodes=best_max_leaf_nodes,
    random_state=0,
)
decision_tree_model.fit(train_X, train_y)

# Score the tree on the validation split
predicted_home_prices = decision_tree_model.predict(val_X)
decision_mae = mean_absolute_error(val_y, predicted_home_prices)
print("Decision tree MAE: {}".format(decision_mae))

In [None]:
# Spot-check the decision tree on the first two validation houses
tree_preview_houses = val_X.head(2)
print("Making predictions for the following houses:")
print(tree_preview_houses)
print("------------------------------------------------")
print("The predictions are")
print(decision_tree_model.predict(tree_preview_houses))

Define the random forest model and review its MAE...

In [None]:
# Define the random forest model. It is seeded (like the decision tree above)
# so the tree-vs-forest comparison printed below is the same on every run
# instead of shifting with the forest's random sampling.
forest_model = RandomForestRegressor(random_state=0)
# train the model on the same training split the decision tree used
forest_model.fit(train_X, train_y)
# run predictions against the validation data to determine the error
# of the random forest
predicted_home_prices = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, predicted_home_prices)
# print both errors side by side for comparison
print("Decision tree MAE: {}".format(decision_mae))
print("Random Forest MAE: {}".format(forest_mae))

In [None]:
# A quick sample of random forest predictions against the validation data.
# The number of houses is held in one place so the printed message always
# matches the rows that are actually shown and predicted.
n_preview_houses = 2
forest_preview_houses = val_X.head(n_preview_houses)
print("Making predictions for the following {} houses:".format(n_preview_houses))
print(forest_preview_houses)
print("------------------------------------------------")
print("The predictions are")
print(forest_model.predict(forest_preview_houses))

In [None]:
# Rebuild the predictors from every numeric column plus the low-cardinality
# categorical columns (one-hot encoded).
# If one-hot encoding is used for the final model, the train and test frames
# need to be lined up with .align so they share the same columns.
mycols = low_cardinality_cols(train_data) + numeric_cols(train_data)
X = train_data[mycols]
X = pd.get_dummies(X)
# numeric_cols() also picks up the target, so remove it from the predictors
X = X.drop(['SalePrice'], axis=1)

# 70/30 train/validation split with a fixed seed
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)
X.head()

Check MAE of random forest by just dropping columns with nulls...

In [None]:
# Baseline: discard every column that has a null anywhere in X, then score a forest
incomplete_cols = cols_with_missing(X)
reduced_train_X = train_X.drop(incomplete_cols, axis=1)
reduced_val_X = val_X.drop(incomplete_cols, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(get_forest_mae(reduced_train_X, reduced_val_X, train_y, val_y))

Check MAE of random forest with imputation...

In [None]:
# Fill missing values with an imputer fit on the training split only,
# then apply the same fitted imputer to the validation split
my_imputer = Imputer()
imputed_train_X = my_imputer.fit_transform(train_X)
imputed_val_X = my_imputer.transform(val_X)

imputation_mae = get_forest_mae(imputed_train_X, imputed_val_X, train_y, val_y)
print("Mean Absolute Error from Imputation:")
print(imputation_mae)

Check MAE of random forest with imputation while tracking imputed columns...

In [None]:
# Impute as before, but first add a boolean "<col>_was_missing" column for
# every training column that has nulls, so the model can see what was filled in
imputed_train_X_plus = train_X.copy()
imputed_val_X_plus = val_X.copy()

for col in cols_with_missing(imputed_train_X_plus):
    indicator_name = col + '_was_missing'
    imputed_train_X_plus[indicator_name] = imputed_train_X_plus[col].isnull()
    imputed_val_X_plus[indicator_name] = imputed_val_X_plus[col].isnull()

# Fit the imputer on the training split, reuse it for the validation split
my_imputer = Imputer()
imputed_train_X_plus = my_imputer.fit_transform(imputed_train_X_plus)
imputed_val_X_plus = my_imputer.transform(imputed_val_X_plus)

tracked_imputation_mae = get_forest_mae(imputed_train_X_plus, imputed_val_X_plus, train_y, val_y)
print("Mean Absolute Error from Imputation while Tracking What Was Imputed:")
print(tracked_imputation_mae)

Check MAE of XGBoost (Gradient Boosted Decision Trees)...

In [None]:
# Target and numeric-only predictors (object/categorical columns are excluded)
y = train_data.SalePrice
X = train_data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
# .values replaces DataFrame.as_matrix(), which was deprecated and later removed
# from pandas. random_state is fixed, as in the other splits in this notebook,
# so the reported error is reproducible.
train_X, test_X, train_y, test_y = train_test_split(X.values,
                                                    y.values,
                                                    test_size=0.25,
                                                    random_state=0)

# impute missing values: fit on the training split, reuse on the held-out split
my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

# n_estimators is the maximum number of boosting rounds to attempt
# a small learning_rate shrinks each round's contribution, which helps against overfitting
# n_jobs sets the number of cores (parallelism) used to speed up the computation
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=2)
# verbose=False avoids printing an update for every round
# early_stopping_rounds stops fitting once the eval_set error has not improved
# for that many consecutive rounds, so fewer than n_estimators rounds may run
# NOTE(review): the early-stopping set is also the set scored below, so the
# reported MAE is likely optimistic -- consider a separate validation split.
my_model.fit(train_X, train_y, early_stopping_rounds=5,
             eval_set=[(test_X, test_y)], verbose=False)

# make predictions and score them (true values first, then predictions)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))

**Run a Pipeline to simplify along with streamlined cross validation**

In [None]:
# pull the training data into target (y) and predictors (X)
train_y = train_data.SalePrice
final_train_X_cols = low_cardinality_cols(train_data) + numeric_cols(train_data)
train_X = train_data[final_train_X_cols]
train_X = pd.get_dummies(train_X)
# numeric_cols() also picks up the target, so remove it from the predictors
train_X = train_X.drop(['SalePrice'], axis=1)

# Treat the test data in the same way as training data. In this case, pull same columns.
final_test_X_cols = low_cardinality_cols(test_data) + numeric_cols(test_data)
test_X = test_data[final_test_X_cols]
test_X = pd.get_dummies(test_X)

# inner join the data to ensure the exact columns included are aligned
train_X, test_X = train_X.align(test_X,
                                join='inner',
                                axis=1)

# fit the model with imputation at the same time using a pipeline
my_pipeline = make_pipeline(Imputer(), RandomForestRegressor())
# Cross-validate on the predictors/target prepared in this cell. Scoring the
# stale X / y left over from the previous cell would evaluate a different
# feature set than the one the pipeline is later fit on.
scores = cross_val_score(my_pipeline, train_X, train_y, scoring='neg_mean_absolute_error')
# average the per-fold scores and make it a positive number
print('Mean Absolute Error %.2f' %(-1 * scores.mean()))

In [None]:
# Fit the pipeline on the full training predictors and target
my_pipeline.fit(train_X, train_y)

# Predict prices for the test predictors
final_predicted_prices = my_pipeline.predict(test_X)

# Eyeball the predicted prices to make sure they look sensible
print(final_predicted_prices)