In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"huongnguyn","key":"850408383ad3a7c42e742bca6e3f5f73"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle competitions download -c house-prices-advanced-regression-techniques
!unzip -o house-prices-advanced-regression-techniques.zip -d house_prices

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 355MB/s]
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: house_prices/data_description.txt  
  inflating: house_prices/sample_submission.csv  
  inflating: house_prices/test.csv   
  inflating: house_prices/train.csv  


In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import GridSearchCV

In [5]:
train_df = pd.read_csv("house_prices/train.csv")
test_df = pd.read_csv("house_prices/test.csv")


In [6]:
#Remove outliers for train
cond1 = (train_df['GrLivArea'] > 3000) & (train_df['SalePrice'] < 400000)
cond2 = train_df['LotArea'] > 100000
cond3 = (train_df['TotalBsmtSF'] > 3000) & (train_df['SalePrice'] < 300000)
cond4 = train_df['BsmtFinSF2'] > 1200
cond5 = train_df['ScreenPorch'] > 400
cond6 = train_df['MasVnrArea'] > 1400
cond7 = train_df['EnclosedPorch'] > 400
cond8 = train_df['OpenPorchSF'] > 400

outlier_condition = cond1 | cond2 | cond3 | cond4 | cond5 | cond6| cond7 | cond8
print(outlier_condition.sum())

23


In [7]:
cleaned_train = train_df[~outlier_condition].copy()

In [8]:
X_train = cleaned_train.drop(columns=["SalePrice"])
y_train = cleaned_train["SalePrice"]
X_test = test_df.copy()

In [9]:
#Define cleaning function for test set
def clean(df):
    df = df.copy()

    # Fill missing values
    df['PoolQC'] = df['PoolQC'].fillna("No Pool")
    df['MiscFeature'] = df['MiscFeature'].fillna('No Feature')
    df['Alley'] = df['Alley'].fillna('No Alley')
    df['Fence'] = df['Fence'].fillna('No Fence')
    df['MasVnrType'] = df['MasVnrType'].fillna('No Mas Vnr Type')
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
    df['FireplaceQu'] = df['FireplaceQu'].fillna('No Fireplace')
    df['LotFrontage'] = df['LotFrontage'].fillna(0)
    df['GarageCars'] = df['GarageCars'].fillna(0)
    df['GarageArea'] = df['GarageArea'].fillna(0)
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
    df['KitchenQual'] = df['KitchenQual'].fillna('No Kitchen')




    for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
        if col in df.columns:
            df[col] = df[col].fillna("No Garage")

    for col in ['BsmtHalfBath','BsmtFullBath','TotalBsmtSF',
                'BsmtUnfSF','BsmtFinSF1','BsmtFinSF2']:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    for col in ["BsmtExposure", "BsmtFinType2", "BsmtFinType1",
                "BsmtQual", "BsmtCond"]:
        if col in df.columns:
            df[col] = df[col].fillna("No Basement")

    # Drop unnecessary columns
    drop_cols = ['Id','1stFlrSF','TotRmsAbvGrd','GarageCars']
    for col in drop_cols:
        if col in df.columns:
            df = df.drop(col, axis=1)


    # Ordinal mapping
    ordinal_mappings = {
        "ExterQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2},
        "ExterCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1},
        "BsmtQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "No Basement": 0}, #Po for dataset test only
        "BsmtCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "No Basement": 0},
        "HeatingQC": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1}, #Po is added for dataset test only
        "KitchenQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "No Kitchen": 0},
        "FireplaceQu": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "No Fireplace": 0},
        "GarageQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "No Garage": 0},
        "GarageCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "No Garage": 0},
        "BsmtExposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "No Basement": 0},
        "BsmtFinType1": {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3,
                           "LwQ": 2, "Unf": 1, "No Basement": 0},
        "BsmtFinType2": {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3,
                           "LwQ": 2, "Unf": 1, "No Basement": 0},
        "Functional": {"Typ": 6, "Min1": 5, "Min2": 4, "Mod": 3,
                       "Maj1": 2, "Maj2": 1, "Sev": 0, "Sal": -1,},  #Sal for dataset test only
        "Fence": {"GdPrv": 4, "MnPrv": 3, "GdWo": 2, "MnWw": 1, "No Fence": 0},
        "LandSlope": {"Gtl": 3, "Mod": 2, "Sev": 1},
        "PavedDrive": {"Y": 3, "P": 2, "N": 1}
    }

    for col, mapping in ordinal_mappings.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)


    return df

In [10]:
X_train_cleaned = clean(X_train)
X_test_cleaned = clean(X_test)

In [11]:
#Fill mode value from train data to into test data
from sklearn.impute import SimpleImputer

frequent_cols = ['Electrical', 'Utilities', 'MSZoning',
                 'Exterior1st', 'Exterior2nd', 'Functional', 'SaleType']

cat_imputer = SimpleImputer(strategy="most_frequent")

X_train_cleaned[frequent_cols] = cat_imputer.fit_transform(X_train_cleaned[frequent_cols])
X_test_cleaned[frequent_cols]  = cat_imputer.transform(X_test_cleaned[frequent_cols])


In [12]:
print(X_train_cleaned.shape, X_test_cleaned.shape)

(1437, 76) (1459, 76)


In [13]:
print(X_train_cleaned.isnull().values.sum())
print(X_test_cleaned.isnull().values.sum())

0
0


In [14]:
# Pipeline for Scale & OneHot
numeric_cols = X_train_cleaned.select_dtypes(exclude="object").columns
categorical_cols = X_train_cleaned.select_dtypes(include="object").columns

numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor_ann = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

preprocessor_rf = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

In [None]:
#ANN model
X_train_ann = preprocessor_ann.fit_transform(X_train_cleaned)
X_test_ann = preprocessor_ann.transform(X_test_cleaned)
y_train_log = np.log1p(y_train)
input_shape = X_train_ann.shape[1]
ann_model = keras.Sequential([
    layers.Dense(1, input_shape=(input_shape,))
])
ann_model.compile(optimizer='adam', loss='mse', metrics=[keras.metrics.RootMeanSquaredError()])
ann_model.fit(X_train_ann, y_train_log, epochs=200, batch_size=64, verbose=0)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x7e7d6a428a40>

In [None]:
#Make predictions on the test set using the ANN model
y_test_pred_ann = np.expm1(ann_model.predict(X_test_ann).ravel())

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [23]:
#RandomForestRegressor Model
X_train_rf = preprocessor_rf.fit_transform(X_train_cleaned)
X_test_rf = preprocessor_rf.transform(X_test_cleaned)
y_train_log = np.log1p(y_train)
rfr = RandomForestRegressor(n_estimators=500, random_state=8, n_jobs=-1)
param_grid = {'max_depth': [20], 'max_features': [0.5],'min_samples_leaf': [1]}
grid = GridSearchCV(rfr,param_grid,cv=5,scoring='neg_root_mean_squared_error',n_jobs=-1)

grid.fit(X_train_rf, y_train_log)


In [24]:
#Make predictions on the test set using the RandomForestRegressor model
y_test_pred_rf = np.expm1(grid.predict(X_test_rf).ravel())

In [25]:
submission = pd.DataFrame({
    "Id": test_df["Id"],
    "SalePrice": y_test_pred_rf  # or y_test_pred_rf
})

submission.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")

Submission file created: submission.csv
