In [173]:
from cgi import test
import pandas as pd
import math
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# Load the training table; every column except SalePrice is used as a feature.
data = pd.read_csv("train.csv")
#features = ["LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "TotalBsmtSF", "GrLivArea", "1stFlrSF", "2ndFlrSF", "GarageArea", "PoolArea", "YrSold"]
#X = data[features]
#y = data["SalePrice"]
X = data.drop(columns="SalePrice")
y = data.SalePrice

# Partition the feature columns by dtype so each group gets its own preprocessing.
numerical_cols = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
categorical_cols = [col for col in X.columns if X[col].dtype == "object"]

# Fixed seed: without it the 80/20 split (and therefore the early-stopping
# point and the reported validation score) changes on every kernel restart.
SPLIT_SEED = 0
X_train, X_valid, y_train, y_valid = train_test_split(
  X, y, train_size=0.8, test_size=0.2, random_state=SPLIT_SEED
)

In [174]:
# Numeric columns: fill missing values with the column mean
# (the strategy SimpleImputer uses when none is given).
numerical_transformer = SimpleImputer(strategy="mean")

# Categorical columns: fill missing values with the most frequent label, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
  ("imputer", SimpleImputer(strategy="most_frequent")),
  ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
  ("num", numerical_transformer, numerical_cols),
  ("cat", categorical_transformer, categorical_cols),
])

# Full model: preprocessing followed by a gradient-boosted regressor that is
# configured for early stopping (an eval_set must be supplied at fit time).
my_pipeline = Pipeline([
  ("preprocessor", preprocessor),
  (
    "model",
    XGBRegressor(
      n_estimators=1000,
      learning_rate=0.03,
      early_stopping_rounds=5,
      eval_metric=mean_squared_log_error,
    ),
  ),
])

'scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring="neg_mean_squared_log_error")\nfor i in range(len(scores)):\n  scores[i] = math.sqrt(scores[i])\nprint(scores.mean())'

In [175]:
X_test = pd.read_csv("test.csv")

#my_pipeline.fit(X, y)
#predictions = my_pipeline.predict(X_test)

# The eval_set handed to XGBoost bypasses the pipeline's preprocessing step, so
# it has to be encoded by hand. Fit the preprocessor on the TRAINING rows and
# only transform the validation rows: fitting it on X_valid would derive the
# imputation values and one-hot categories from the validation fold, giving a
# matrix whose columns need not line up with what the model is trained on.
# my_pipeline.fit below refits this same preprocessor on the same X_train, so
# the learned column layout is identical to the one used here.
preprocessor.fit(X_train)
X_valid_transformed = preprocessor.transform(X_valid)

# model__eval_set is routed to the "model" step's fit(); it drives early stopping.
my_pipeline.fit(X_train, y_train, model__eval_set=[(X_valid_transformed, y_valid)])

# Report root-mean-squared-log-error on the held-out validation fold.
predictions = my_pipeline.predict(X_valid)
score = math.sqrt(mean_squared_log_error(y_valid, predictions))
print(score)

[0]	validation_0-rmse:196156.08211	validation_0-mean_squared_log_error:16.79468
[1]	validation_0-rmse:191260.06962	validation_0-mean_squared_log_error:10.63656
[2]	validation_0-rmse:187117.36621	validation_0-mean_squared_log_error:8.53834
[3]	validation_0-rmse:183257.09881	validation_0-mean_squared_log_error:7.01300
[4]	validation_0-rmse:178707.16942	validation_0-mean_squared_log_error:5.60353
[5]	validation_0-rmse:174610.91658	validation_0-mean_squared_log_error:4.66036
[6]	validation_0-rmse:170653.27853	validation_0-mean_squared_log_error:3.89624
[7]	validation_0-rmse:166795.18812	validation_0-mean_squared_log_error:3.32032
[8]	validation_0-rmse:162686.76611	validation_0-mean_squared_log_error:2.85567
[9]	validation_0-rmse:158996.54228	validation_0-mean_squared_log_error:2.51387
[10]	validation_0-rmse:155497.06546	validation_0-mean_squared_log_error:2.23707
[11]	validation_0-rmse:151672.67459	validation_0-mean_squared_log_error:1.97077
[12]	validation_0-rmse:148619.08580	validation_0

In [176]:
# Predict on the rows read from test.csv; this rebinds `predictions`, which
# previously held the validation-fold predictions.
predictions = my_pipeline.predict(X_test)

def make_submission(predictions, ids=None, path="submission.csv"):
  """Write predictions to a two-column CSV with headers ``Id`` and ``SalePrice``.

  Args:
    predictions: Sequence of predicted sale prices, one per row.
    ids: Optional sequence of row identifiers, same length as ``predictions``.
      When omitted, consecutive ids starting at 1461 are generated, which is
      the original behavior of this function.
    path: Destination CSV file; defaults to ``"submission.csv"``.

  Raises:
    ValueError: If ``ids`` is given and its length differs from ``predictions``.
  """
  if ids is None:
    # Fallback numbering: assumes the rows are contiguous and start at 1461.
    first_id = 1461
    ids = range(first_id, first_id + len(predictions))
  else:
    # Materialize so a pandas Series' own index cannot affect row alignment.
    ids = list(ids)
    if len(ids) != len(predictions):
      raise ValueError(
        f"ids has {len(ids)} entries but predictions has {len(predictions)}"
      )
  predictions_df = pd.DataFrame(data={"Id": ids, "SalePrice": predictions})
  predictions_df.to_csv(path, index=False)

# Write the test-set predictions to submission.csv.
make_submission(predictions)
