# Set up

In [29]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Locations of the train/test CSV files, relative to the notebook's working directory
data_train_path, data_test_path = (
    './home-data-for-ml-course/train.csv',
    './home-data-for-ml-course/test.csv',
)

import pandas as pd

# Load both CSVs the same way, using the 'Id' column as the row index.
# X receives the training file, X_test_full the test file.
X, X_test_full = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (data_train_path, data_test_path)
)

# Discard rows that have no SalePrice, then split the frame into target (y) and predictors (X)
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Hold out 20% of the rows as a validation set; the fixed random_state keeps the split repeatable
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# Column dtypes and distinct-value counts, computed once for the whole training frame
column_dtypes = X_train_full.dtypes
is_low_card_text = (X_train_full.nunique() < 10) & (column_dtypes == "object")
low_cardinality_cols = X_train_full.columns[is_low_card_text.to_numpy()].tolist()

# Select numeric columns (only the int64 / float64 dtypes are treated as numeric)
is_numeric = (column_dtypes == 'int64') | (column_dtypes == 'float64')
numeric_cols = X_train_full.columns[is_numeric.to_numpy()].tolist()

# Keep selected columns only; each split gets its own copy so later edits
# do not touch the *_full frames
my_cols = low_cardinality_cols + numeric_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

# One-hot encode the data (to shorten the code, we use pandas).
#
# Each frame is encoded on its own, so the validation and test frames can end up
# with a different set of dummy columns than the training frame. The training
# columns define the feature set:
#   * dummy columns that exist only in validation/test are dropped;
#   * dummy columns that are absent from validation/test (the category never
#     occurs in that split) are created and filled with 0, i.e. "category not
#     present", rather than NaN, which would mark the feature as missing.
# reindex(fill_value=...) only fills the newly created columns; NaNs that were
# already present in the data (e.g. in numeric columns) are left untouched.
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid).reindex(columns=X_train.columns, fill_value=0)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)


In [30]:
# Preview the encoded training features: the first rows, then per-column summary statistics
print(X_train.head(), X_train.describe(), sep='\n')

     MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
Id                                                                           
619          20         90.0    11694            9            5       2007   
871          20         60.0     6600            5            5       1962   
93           30         80.0    13360            5            7       1921   
818          20          NaN    13265            8            5       2002   
303          20        118.0    13704            7            5       2001   

     YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_ConLw  \
Id                                                     ...                   
619          2007       452.0          48           0  ...           False   
871          1962         0.0           0           0  ...           False   
93           2006         0.0         713           0  ...           False   
818          2002       148.0        1218           0  ...     

# Building Model

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Build the XGBRegressor (1000 estimators, learning rate 0.05) and fit it on the
# training split in one step; no early stopping is configured here.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05).fit(
    X_train, y_train, verbose=False
)

# Score the model on the held-out validation rows
y_valid_pred = my_model.predict(X_valid)

# Mean Absolute Error (MAE) between the true and predicted validation targets
mae = mean_absolute_error(y_true=y_valid, y_pred=y_valid_pred)
print(f'Mean Absolute Error (MAE) on validation set: {mae}')


Mean Absolute Error (MAE) on validation set: 17224.27947078339


In [54]:
# Predict the target for the test rows (the test set has no target column)
y_test_pred = my_model.predict(X_test)

# Assemble the submission table: an 'Id' column taken from the test index,
# followed by the predicted 'SalePrice'
submission = X_test.index.to_frame(index=False, name='Id')
submission['SalePrice'] = y_test_pred

# Write the predictions to a CSV file for submission (without the row index)
submission.to_csv('submission.csv', index=False)
