In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load datasets

In [2]:
train_ds = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_ds  = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [3]:
X = train_ds.drop(columns=['Id', 'SalePrice'], axis=1)
y = train_ds['SalePrice']

Identify numeric and categorical features dynamically

In [4]:
numeric_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Create pipeline

In [5]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

Combine preprocessing with the model in a single pipeline

In [6]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])

Fit the pipeline on the training data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
pipeline.fit(X_train, y_train)

In [9]:
train_predictions = pipeline.predict(X_test)

In [10]:
rmse_score = np.sqrt(mean_squared_error(train_predictions, y_test))
rmse_score

42910.866885954536

# Making predictions & submission file

Make predictions on the testing data

In [11]:
submission_df = test_ds[['Id']]
test_ds = test_ds.drop(columns=['Id'])

In [12]:
predicted_prices = pipeline.predict(test_ds)

In [13]:
submission_df['SalePrice'] = predicted_prices
submission_df = submission_df.set_index('Id')
submission = submission_df.to_csv("submission.csv", encoding='utf-8')

In [14]:
!head submission.csv

Id,SalePrice
1461,125000
1462,135000
1463,173000
1464,173000
1465,180000
1466,178000
1467,173000
1468,178000
1469,173000
