In [None]:
# Some of the Kaggle data sources used by this notebook are private.
# Run this cell first so that the data-import cell below can access them.
# NOTE(review): kagglehub.login() presumably asks for Kaggle credentials
# interactively — confirm against the kagglehub documentation.
import kagglehub
kagglehub.login()


In [None]:
# Import the Kaggle data sources this notebook depends on. Run this cell
# before any cell that reads the data; the original export note says it
# may be deleted once the data has been imported.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing here.

# Keep whatever competition_download() returns for the
# 'home-data-for-ml-course' competition.
# NOTE(review): presumably the local path of the downloaded files — confirm.
home_data_for_ml_course_path = kagglehub.competition_download('home-data-for-ml-course')

print('Data source import complete.')


In [None]:
# Standard Kaggle notebook preamble.
# The runtime is the kaggle/python Docker image
# (https://github.com/kaggle/docker-python), which ships with many analytics
# libraries preinstalled; the ones used throughout this notebook are loaded here.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files live in the read-only "../input/" directory.
# Print the full path of every file found under it (prints nothing if the
# directory does not exist).

import os
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Up to 20GB can be written to the current directory (/kaggle/working/); it is
# preserved as output when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not kept after the
# current session ends.

In [None]:
from sklearn.model_selection import train_test_split

# Locate the competition files. On Kaggle they are mounted under /kaggle/input.
# Where that directory is missing, fall back to the location returned by
# kagglehub.competition_download() in the data-import cell; the fallback name
# is only evaluated when the Kaggle directory does not exist.
KAGGLE_DATA_DIR = "/kaggle/input/home-data-for-ml-course"
if os.path.isdir(KAGGLE_DATA_DIR):
    dataDir = KAGGLE_DATA_DIR
else:
    dataDir = home_data_for_ml_course_path

trainFilepath = os.path.join(dataDir, "train.csv")
trainData = pd.read_csv(trainFilepath)

# Prediction target.
y = trainData.SalePrice

# Raw columns taken from the CSV. "YearBuilt" is only used to derive "Age"
# below; the test-prediction cell reuses this list, so keep it unchanged.
features = ["MSZoning", "Street", "Alley", "LotArea", "OverallQual", "OverallCond", "YearBuilt", "TotalBsmtSF", "CentralAir",
           "1stFlrSF", "2ndFlrSF", "FullBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "GarageArea", "PoolArea", ]

# Replace the construction year with the age relative to 2025. The same
# reference year must be used when the test set is prepared.
age = 2025 - trainData["YearBuilt"]

# Build the feature frame without mutating anything in place: drop the raw
# year and append the derived "Age" as the last column.
X = trainData[features].drop(columns="YearBuilt").assign(Age=age)

# Hold out a validation split; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Columns routed to each branch of the preprocessor.
numeric_columns = [
    "LotArea", "OverallQual", "OverallCond", "TotalBsmtSF", "1stFlrSF",
    "2ndFlrSF", "FullBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
    "GarageArea", "PoolArea", "Age",
]
categorical_columns = ["MSZoning", "Street", "Alley", "CentralAir"]

# Numeric branch: missing values are replaced by the constant 0.
numerical_transformer = SimpleImputer(strategy="constant", fill_value=0)

# Categorical branch: impute first, then one-hot encode
# (handle_unknown="ignore" is set on the encoder).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns),
])

# Random forest regressor with a fixed seed, splitting on absolute error.
model = RandomForestRegressor(criterion="absolute_error", random_state=0)

# Full pipeline: preprocessing followed by the regressor. The step names are
# what the "preprocessor__..." / "model__..." grid keys below refer to.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

# Hyperparameters explored by the grid search.
# Further candidates that are currently switched off:
#   model__min_samples_split: [2, 5, 10]
#   model__min_samples_leaf:  [1, 2, 5]
#   model__max_features:      ["auto", "sqrt", "log2"]
param_grid = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth": [None, 10, 20, 30],
    "preprocessor__cat__imputer__strategy": ["most_frequent", "constant"],
}

# 5-fold cross-validated search scored by negative mean absolute error,
# fitted on the training split only.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
)
grid_search.fit(X_train, y_train)

# Evaluate the fitted search on the held-out validation split.
predictions = grid_search.predict(X_valid)
score = mean_absolute_error(y_valid, predictions)
print('MAE:', score)

In [None]:
# Read test.csv from the same directory train.csv was loaded from, so both
# files always come from the same copy of the competition data.
testFilepath = os.path.join(os.path.dirname(trainFilepath), "test.csv")
testData = pd.read_csv(testFilepath)

# Apply the same feature preparation as for the training data: derive "Age"
# from "YearBuilt" using the same 2025 reference year, drop the raw year and
# append "Age" as the last column. Nothing is mutated in place.
age_test = 2025 - testData["YearBuilt"]
X_test = testData[features].drop(columns="YearBuilt").assign(Age=age_test)

# Predict with the fitted grid search from the modelling cell.
testPredictions = grid_search.predict(X_test)

# Write the submission file: one row per test Id with its predicted SalePrice.
output = pd.DataFrame({"Id": testData.Id, "SalePrice": testPredictions})
output.to_csv("submission.csv", index=False)

print("Output saved")