In [38]:
from xgboost import XGBRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

In [39]:
# Read the data
X = pd.read_csv('./ames_housing_train.csv', index_col='Id')
X_test_full = pd.read_csv('./ames_housing_test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

In [40]:
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.75, test_size=0.25,
                                                                random_state=0)

In [41]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()



In [43]:

# One-hot encode the data (to shorten the code, we use pandas)
X_train = pd.get_dummies(X_train)

X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Save the column names
X_train_copy = X_train.copy()
X_valid_copy = X_valid.copy()

In [44]:
# Get a list of columns with missing values in them
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])
miss_cols = X_train.columns[X_train.isnull().any()]

LotFrontage    199
MasVnrArea       5
GarageYrBlt     56
dtype: int64


In [46]:
my_imputer = SimpleImputer(strategy = 'median')

X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Fill in the lines below: imputation removed column names; put them back
X_train.columns = X_train_copy.columns
X_valid.columns = X_valid_copy.columns

In [48]:
my_model = XGBRegressor(n_estimators=150, learning_rate=0.05)
my_model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)],
             verbose=False)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=150,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [8]:
predictions = my_model.predict(X_valid)
mae = mean_absolute_error(predictions, y_valid)

print("Mean Absolute Error:" , mae)

Mean Absolute Error: 16812.893428938358


In [9]:
estimators = [100, 150, 250, 500, 1000, 1500]
learning = [0.01, 0.05, 0.1, 0.15, 0.20]

In [10]:
errors = []
for n in estimators:
    my_model = XGBRegressor(objective ='reg:squarederror', n_estimators=n, learning_rate=0.05)
    my_model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)],
             verbose=False)
    predictions = my_model.predict(X_valid)
    mae = mean_absolute_error(predictions, y_valid)
    errors.append(mae)
    
print(errors)

[17275.764362157533, 16812.893428938358, 16812.893428938358, 16812.893428938358, 16812.893428938358, 16812.893428938358]


In [11]:
errors = []
for l in learning:
    my_model = XGBRegressor(objective ='reg:squarederror', n_estimators=150, learning_rate=l)
    my_model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)],
             verbose=False)
    predictions = my_model.predict(X_valid)
    mae = mean_absolute_error(predictions, y_valid)
    errors.append(mae)
    
print(errors)

[42450.96222174657, 16812.893428938358, 16560.136258561644, 17059.53422517123, 16962.65565068493]


## Testing

In [49]:
my_model = XGBRegressor(n_estimators=150, learning_rate=0.1)
my_model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)],
             verbose=False)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=150,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [50]:
# Fill in the line below: get test predictions
preds_test = my_model.predict(X_test)

In [52]:
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)