In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
from sklearn.model_selection import train_test_split    # Splits arrays or matrices into random train and test subsets
from sklearn.model_selection import KFold               # Cross-validator
from sklearn.model_selection import cross_validate      # Evaluate metrics by cross-validation
from sklearn.model_selection import GridSearchCV        # Search over specified parameter values for an estimator
from sklearn.compose import ColumnTransformer           # Applies transformers to columns of DataFrames
from sklearn.pipeline import Pipeline                   # Helps building a chain of transforms and estimators
from sklearn.impute import SimpleImputer                # Imputation transformer for completing missing values
from sklearn.preprocessing import OneHotEncoder         # Encode categorical features
from sklearn.metrics import mean_absolute_error, mean_squared_error         # One of many statistical measures of error
from xgboost import XGBRegressor

In [3]:
training_set = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', keep_default_na = False, na_values = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''])
test_set = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv', keep_default_na = False, na_values = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''])

In [4]:
X = training_set.copy()
y = X.SalePrice
X.drop(['SalePrice'], axis = 1, inplace = True)

In [5]:
X_test = test_set.copy()
print(X.shape)
print(y.shape)
print(X_test.shape)

(1460, 80)
(1460,)
(1459, 80)


In [6]:
missing_values = X.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
print(missing_values)

Series([], dtype: int64)


In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             1460 non-null   int64 
 1   MSSubClass     1460 non-null   int64 
 2   MSZoning       1460 non-null   object
 3   LotFrontage    1460 non-null   object
 4   LotArea        1460 non-null   int64 
 5   Street         1460 non-null   object
 6   Alley          1460 non-null   object
 7   LotShape       1460 non-null   object
 8   LandContour    1460 non-null   object
 9   Utilities      1460 non-null   object
 10  LotConfig      1460 non-null   object
 11  LandSlope      1460 non-null   object
 12  Neighborhood   1460 non-null   object
 13  Condition1     1460 non-null   object
 14  Condition2     1460 non-null   object
 15  BldgType       1460 non-null   object
 16  HouseStyle     1460 non-null   object
 17  OverallQual    1460 non-null   int64 
 18  OverallCond    1460 non-null

In [8]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, random_state = 0)

In [9]:
categorical_cols = [col for col in X_train_full.columns if 
                   X_train_full[col].nunique() <= 15 and
                   X_train_full[col].dtype == 'object']

numeric_cols = [col for col in X_train_full.columns if 
               X_train_full[col].dtype in ['int64', 'float64']]

my_columns = categorical_cols + numeric_cols
X_train = X_train_full[my_columns].copy()
X_valid = X_valid_full[my_columns].copy()
X_test = X_test[my_columns].copy()

In [10]:
numerical_transformer = SimpleImputer(strategy = 'mean')
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant',fill_value = 'NA')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols)
])

In [11]:
model = XGBRegressor(verbosity = 0, random_state = 0)

my_pipeline_xgb = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', model)
])

my_pipeline_xgb.fit(X_train, y_train)
preds = my_pipeline_xgb.predict(X_valid)
print("XGB Score:", my_pipeline_xgb.score(X_valid, y_valid))

XGB Score: 0.7467378303274879


In [12]:
from sklearn.ensemble import RandomForestRegressor
model_RFR = RandomForestRegressor(random_state = 0)

my_pipeline_RFR = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', model_RFR)
])

my_pipeline_RFR.fit(X_train, y_train)
preds = my_pipeline_RFR.predict(X_valid)
print("RF Score:", my_pipeline_RFR.score(X_valid, y_valid))

RF Score: 0.7024350584874632


In [13]:
final_model = model
final_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', final_model)
])

X_train = training_set.copy()
y_train = X_train.SalePrice
X_train.drop(['SalePrice'], axis = 1, inplace = True)

final_pipeline.fit(X_train, y_train)

final_prediction = final_pipeline.predict(X_test)

output = pd.DataFrame({
    'Id': X_test.Id,
    'SalePrice': final_prediction
})
output.to_csv('submission.csv', index = False)

In [14]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(1460, 80)
(1460,)
(1459, 75)
