In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
# Wrangle function. Be sure to adjust the 'filepath' variable to match the location of your file.

def wrangle(filepath):
    df = pd.read_csv(filepath)
    
    # Remove the most extreme 1% prices,
    # the most extreme .1% latitudes, &
    # the most extreme .1% longitudes
    df = df[(df['price'] >= np.percentile(df['price'], 0.5)) & 
            (df['price'] <= np.percentile(df['price'], 99.5))]
    
    return df

filepath = "X:/A - Stuff/house_data/kc_house_data.csv"
df = wrangle(filepath)
if df.isnull().values.any() == False:
    print("There are no null values in your data set.")
else:
    print("Modify the wrangle function to deal with those null values.")

There are no null values in your data set.


In [5]:
# Assigning target and features, and printing baseline MAE
X = df.drop(columns = ['price'])
y = df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=69)
print('Baseline MAE Score:', mean_absolute_error(y, [y.mean()]*len(y)))

Baseline MAE Score: 217730.56857841637


In [6]:
# Linear regression model using gridsearch for hyperparameter tuning
pipeline_lin = make_pipeline(OrdinalEncoder(),SimpleImputer(),LinearRegression())
params_lin = {'linearregression__n_jobs' : range(50,800,25)}
model_lin = GridSearchCV(pipeline_lin, param_grid=params_lin, cv=5, n_jobs=-1, verbose=1)
model_lin.fit(X_train, y_train);
print('Linear training MAE Score:', mean_absolute_error(y_train, model_lin.predict(X_train)))
print('Linear validation MAE Score:', mean_absolute_error(y_val, model_lin.predict(X_val)))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Linear training MAE Score: 112490.12749730404
Linear validation MAE Score: 113414.7500382454


In [7]:
# Random forest regression model using gridsearch for hyperparameter tuning
pipeline_RFR = make_pipeline(OrdinalEncoder(), SimpleImputer(), RandomForestRegressor(random_state=69, ccp_alpha=0.0078))
params_RFR = {'randomforestregressor__min_samples_leaf': range(1,5,1),
              'randomforestregressor__max_leaf_nodes': range(1900,2000,50),
              'randomforestregressor__max_depth': range(20,30,2),
              'randomforestregressor__max_features': range(10,15,1)}
model_RFR = GridSearchCV(pipeline_RFR, param_grid=params_RFR, cv=2, n_jobs=-1, verbose=1)
model_RFR.fit(X_train, y_train);
print('Training MAE Score:', mean_absolute_error(y_val, model_RFR.predict(X_val)))
print('Validation MAE Score:', mean_absolute_error(y_train, model_RFR.predict(X_train)))

Fitting 2 folds for each of 200 candidates, totalling 400 fits


KeyboardInterrupt: 