# House Price Category Classification

This notebook trains a machine learning model that classifies houses into **Low**, **Medium**, or **High** price categories based on the cleaned dataset.

Steps performed in the notebook:

1. Load and inspect the dataset.
2. Derive price categories from the `Harga` (price) column.
3. Train a classifier on the structural features of each house.
4. Provide an interactive function that predicts the price category for user-supplied data.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Read the cleaned dataset from a CSV file located relative to the notebook's
# working directory, then preview the first rows.
data_path = Path('cleaned_house_data.csv')
df = pd.read_csv(data_path)
df.head()


In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns, not just the numeric ones.
df.describe(include='all')


In [None]:
# Create price categories by splitting `Harga` into three bins at its tertiles.
quantile_edges = df['Harga'].quantile([0, 1/3, 2/3, 1]).values
unique_edges = np.unique(quantile_edges)
if unique_edges.shape[0] < 4:
    # Tied quantiles leave fewer than four distinct edges, which cannot form
    # three bins; fall back to three equal-width bins over the price range.
    unique_edges = np.linspace(df['Harga'].min(), df['Harga'].max(), 4)
price_labels = ['Low', 'Medium', 'High']
df['PriceCategory'] = pd.cut(df['Harga'], bins=unique_edges, labels=price_labels, include_lowest=True)
# pd.cut already registers every label (including 'High') as a category, so the
# fill value can be used directly. Calling `.cat.add_categories(['High'])` here
# would raise ValueError because the category already exists.
# Rows that pd.cut left unassigned (e.g. a missing `Harga`) are labelled 'High'.
df['PriceCategory'] = df['PriceCategory'].fillna('High')
df[['Nama', 'Harga', 'PriceCategory']].head()


In [None]:
# Inspect class balance: number of rows in each price category.
df['PriceCategory'].value_counts()


In [None]:
# Select the predictor columns and the target label.
feature_columns = ['LB', 'LT', 'KT', 'KM', 'GRS']
X, y = df[feature_columns], df['PriceCategory']

# Hold out 20% of the rows for evaluation, stratified on the label so each
# split keeps the same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Two-stage pipeline: standardise the features, then fit a random forest.
model = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

model.fit(X_train, y_train)


In [None]:
# Evaluate the classifier on the held-out test split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
# The line break must be written as the escape sequence `\n`; a raw newline
# inside a single-quoted string is a syntax error.
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))


In [None]:
def collect_user_features():
    """Prompt the user to enter house features and return them as a DataFrame.

    Each feature is requested in turn via ``input``. The prompt for a feature
    repeats until the reply parses as a finite, non-negative number.

    Returns:
        pandas.DataFrame: A single-row frame with float columns ``LB``,
        ``LT``, ``KT``, ``KM`` and ``GRS``, in that order.
    """
    prompts = {
        'LB': 'Building area (LB) in square meters',
        'LT': 'Land area (LT) in square meters',
        'KT': 'Number of bedrooms (KT)',
        'KM': 'Number of bathrooms (KM)',
        'GRS': 'Number of garages/carports (GRS)'
    }

    values = {}
    for feature, message in prompts.items():
        while True:
            raw_value = input(f"Enter {message}: ")
            # Keep the try block limited to the conversion so that only a
            # parse failure triggers the "numeric value" message.
            try:
                value = float(raw_value)
            except ValueError:
                print('Please enter a numeric value.')
                continue
            # float() happily parses 'nan', 'inf' and negative numbers; none of
            # them is a meaningful area or room count, and NaN/inf would make
            # the downstream model.predict call fail.
            if not np.isfinite(value) or value < 0:
                print('Please enter a finite, non-negative number.')
                continue
            values[feature] = value
            break

    return pd.DataFrame([values])


def predict_house_category(model):
    """Interactively gather user input and display the predicted price category.

    Args:
        model: A fitted estimator exposing ``predict``; it is called with the
            single-row DataFrame returned by ``collect_user_features``.

    Returns:
        pandas.DataFrame: The entered features with an added
        ``PredictedCategory`` column holding the prediction.
    """
    print('Provide the house details to receive a price category prediction.')
    user_features = collect_user_features()
    prediction = model.predict(user_features)[0]
    # The leading blank line must be the escape sequence `\n`; a raw newline
    # inside a single-line f-string is a syntax error.
    print(f"\nPredicted house price category: {prediction}")
    return user_features.assign(PredictedCategory=prediction)


Uncomment the code in the next cell and run it, then follow the prompts to predict a category for a new house.


In [None]:
# Uncomment the two lines below to predict interactively in the notebook.
# result = predict_house_category(model)
# result


In [None]:
# Example prediction using hard-coded values (no user input).
example_house = pd.DataFrame([{
    'LB': 180,  # building area
    'LT': 150,  # land area
    'KT': 3,    # bedrooms
    'KM': 3,    # bathrooms
    'GRS': 1    # garages/carports
}])
model.predict(example_house)[0]
