**This notebook is an exercise in the [Intermediate Machine Learning](https://www.kaggle.com/learn/intermediate-machine-learning) course.  You can reference the tutorial at [this link](https://www.kaggle.com/alexisbcook/introduction).**

---


In [1]:
# Set up code checking
import os

# Link each competition file into ../input under the name the rest of the
# notebook reads. Every link is guarded on its own, so a partially prepared
# directory (one link present, the other missing) neither raises
# FileExistsError nor leaves test.csv unlinked.
for fname in ("train.csv", "test.csv"):
    link_path = "../input/" + fname
    if not os.path.exists(link_path):
        os.symlink("../input/home-data-for-ml-course/" + fname, link_path)

from learntools.core import binder
binder.bind(globals())
from learntools.ml_intermediate.ex1 import *
print("Setup Complete")

Setup Complete


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data; the 'Id' column becomes the row index of both frames
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# Obtain target and predictors
y = X_full.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Split the selected predictors by dtype: object columns are treated as
# categorical, everything else as numerical.
num_cols = X.select_dtypes(exclude=['object']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Canonical column order: numerical columns first, then categorical ones.
fullcols = num_cols.append(cat_cols)

# Apply the same ordering to BOTH frames so the training and test data always
# present their columns in an identical layout.
X = X[fullcols]
X_test = X_test[fullcols]

# Break off validation set from training data (80% train / 20% validation,
# fixed seed so the split is reproducible)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [3]:
# Preview the first five rows of the training predictors
X_train.head(n=5)

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


In [4]:
# Number of missing entries in each training column
X_train.isna().sum()

LotArea         0
YearBuilt       0
1stFlrSF        0
2ndFlrSF        0
FullBath        0
BedroomAbvGr    0
TotRmsAbvGrd    0
dtype: int64

In [5]:
# Column labels, a divider line, then the dtype / non-null summary
divider = '=' * 50
print(X_train.columns)
print(divider)
X_train.info()

Index(['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
       'BedroomAbvGr', 'TotRmsAbvGrd'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 619 to 685
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   LotArea       1168 non-null   int64
 1   YearBuilt     1168 non-null   int64
 2   1stFlrSF      1168 non-null   int64
 3   2ndFlrSF      1168 non-null   int64
 4   FullBath      1168 non-null   int64
 5   BedroomAbvGr  1168 non-null   int64
 6   TotRmsAbvGrd  1168 non-null   int64
dtypes: int64(7)
memory usage: 73.0 KB


In [6]:
from xgboost import XGBRegressor

# Define the models
# Models 1-4 share learning rate, tree depth and seed and differ only in
# n_estimators; model 5 uses its own learning rate and seed, so it is spelled
# out in full.
_shared_params = dict(learning_rate=0.05, max_depth=4, random_state=1)

model_1 = XGBRegressor(n_estimators=200, **_shared_params)
model_2 = XGBRegressor(n_estimators=350, **_shared_params)
model_3 = XGBRegressor(n_estimators=500, **_shared_params)
model_4 = XGBRegressor(n_estimators=600, **_shared_params)
model_5 = XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=4, random_state=4)

models = [model_1, model_2, model_3, model_4, model_5]

In [7]:
from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit ``model`` and return its mean absolute error on held-out data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place by this call.
    X_t, y_t : training predictors and target.
    X_v, y_v : validation predictors and target.

    The defaults are the train/validation objects that exist when this
    function is defined; they are bound once, at definition time.

    Returns
    -------
    The mean absolute error between ``y_v`` and the predictions for ``X_v``.
    """
    model.fit(X_t, y_t)
    validation_predictions = model.predict(X_v)
    return mean_absolute_error(y_true=y_v, y_pred=validation_predictions)

# Score every candidate in turn; the report numbers the models from 1.
for model_number, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("Model %d MAE: %d" % (model_number, mae))

Model 1 MAE: 21528
Model 2 MAE: 21375
Model 3 MAE: 21696
Model 4 MAE: 21839
Model 5 MAE: 21724


In [8]:
# model_2 reported the lowest validation MAE (21375) of the five candidates
# in the comparison output printed by the scoring loop.
best_model = model_2

In [9]:
# Define a model
# NOTE: this is an alias, not a copy -- my_model, best_model and model_2 all
# refer to the same estimator object, which score_model() has already fitted
# once during the comparison loop.
my_model = best_model 

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Numerical columns: fill missing values with the column median.
numericalTransformer = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fitting are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categoricalTransformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numericalTransformer, num_cols),
    ('cat', categoricalTransformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Full pipeline: preprocessing followed by the chosen model.
pipeline_steps = [('preprocessor', preprocessor), ('model', my_model)]
myPipeline = Pipeline(steps=pipeline_steps)

In [11]:
# Fit the pipeline on the training split and predict the validation split
preds = myPipeline.fit(X_train, y_train).predict(X_valid)

# Report the validation mean absolute error
validation_mae = mean_absolute_error(y_valid, preds)
print(validation_mae)

21375.71662296661


In [12]:
# Predict sale prices for the test set with the fitted pipeline
preds_test = myPipeline.predict(X_test)

# Assemble the Id / SalePrice table in the format used for competition
# scoring and write it without the frame's own row index
submission_columns = {'Id': X_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)