<a href="https://colab.research.google.com/github/habstrakT808/Prediksi-Harga-Rumah-dengan-Machine-Learning/blob/main/PREDIKSI_HARGA_RUMAH_HAFIYAN_AL_MUQAFFI_UMARY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#PREDIKSI HARGA RUMAH - MACHINE LEARNING TERAPAN

## Import library yang dibutuhkan

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

## 1. Load Dataset


In [None]:
print("Loading dataset...")
url = "https://www.kaggle.com/datasets/harlfoxem/housesalesprediction"
df = pd.read_csv("/content/kc_house_data.csv")

Loading dataset...


## 2. Exploratory Data Analysis

In [None]:
print("\n=== Data Overview ===")
print(f"Shape: {df.shape}")
print("\nData Types:")
print(df.dtypes)

print("\n=== Statistical Summary ===")
print(df.describe())

print("\n=== Check Missing Values ===")
print(df.isnull().sum())

# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'])

# Visualizations
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(df['price'], kde=True)
plt.title('Price Distribution')
plt.xlabel('Price')

plt.subplot(1, 2, 2)
sns.scatterplot(x='sqft_living', y='price', data=df)
plt.title('Price vs. Square Footage')
plt.tight_layout()
plt.savefig('price_distribution_and_correlation.png')
plt.close()

# Correlation heatmap
plt.figure(figsize=(14, 12))
correlation = df.corr()
mask = np.triu(correlation)
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', mask=mask)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.close()

print("\n=== Top Correlations with Price ===")
print(correlation['price'].sort_values(ascending=False).head(10))

# Plot locations with prices
plt.figure(figsize=(12, 8))
sns.scatterplot(x='long', y='lat', data=df, hue='price', palette='viridis', size='price',
               sizes=(20, 200), alpha=0.6)
plt.title('House Locations with Prices')
plt.tight_layout()
plt.savefig('location_prices.png')
plt.close()



=== Data Overview ===
Shape: (21613, 21)

Data Types:
id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

=== Statistical Summary ===
                 id         price      bedrooms     bathrooms   sqft_living  \
count  2.161300e+04  2.161300e+04  21613.000000  21613.000000  21613.000000   
mean   4.580302e+09  5.400881e+05      3.370842      2.114757   2079.899736   
std    2.876566e+09  3.671272e+05      0.930062      0.770163    918.440897   
min    1.000102e+06  7.500000e+04      0.000000      0.000000

## 3. Data Preparation

### 3.1 Handling Missing Values


In [None]:
print("Handling missing values...")
df['waterfront'] = df['waterfront'].fillna(0)
df['view'] = df['view'].fillna(0)
df['yr_renovated'] = df['yr_renovated'].fillna(0)

Handling missing values...


### 3.2 Feature Engineering

In [None]:
print("Creating new features...")
# Age of the house
df['age'] = 2015 - df['yr_built']

# Renovation status
df['renovated'] = df['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)

# Total area
df['total_area'] = df['sqft_living'] + df['sqft_lot']

# Price per square foot
df['price_per_sqft'] = df['price'] / df['sqft_living']

# Extract date features
df['sale_year'] = df['date'].dt.year
df['sale_month'] = df['date'].dt.month
df['sale_day'] = df['date'].dt.day

Creating new features...


### 3.3 Handling Outliers

In [None]:
print("Handling outliers...")
# Visualize price outliers with boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['price'])
plt.title('Price Boxplot')
plt.tight_layout()
plt.savefig('price_boxplot.png')
plt.close()

# Remove price outliers using IQR method
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_clean = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]
print(f"Rows after removing outliers: {df_clean.shape[0]} (removed {df.shape[0] - df_clean.shape[0]} rows)")

Handling outliers...
Rows after removing outliers: 20467 (removed 1146 rows)


### 3.4 Feature Selection based on correlation

In [None]:
print("Selecting features based on correlation...")
corr_with_target = df_clean.corr()['price'].abs().sort_values(ascending=False)
print("\nTop 15 features by correlation with price:")
print(corr_with_target.head(16))

top_features = corr_with_target[1:16].index  # Excluding price itself

Selecting features based on correlation...

Top 15 features by correlation with price:
price             1.000000
grade             0.631182
sqft_living       0.622333
sqft_living15     0.562090
sqft_above        0.530451
price_per_sqft    0.465562
bathrooms         0.455927
lat               0.430762
bedrooms          0.296956
floors            0.273987
view              0.238268
sqft_basement     0.235701
total_area        0.105629
sqft_lot          0.093950
yr_renovated      0.083601
renovated         0.083162
Name: price, dtype: float64


### 3.5 Feature Scaling

In [None]:
print("\nApplying feature scaling...")
numeric_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                   'sqft_above', 'sqft_basement', 'age', 'total_area']

scaler = StandardScaler()
df_clean[numeric_features] = scaler.fit_transform(df_clean[numeric_features])


Applying feature scaling...


### 3.6 One-Hot Encoding for zipcode

In [None]:
print("Applying one-hot encoding for zipcode...")
df_encoded = pd.get_dummies(df_clean, columns=['zipcode'], drop_first=True)

# Final dataset for modeling
selected_features = list(top_features)
# Add engineered features if not already in selected_features
for feature in ['renovated', 'age', 'total_area', 'sale_month']:
    if feature not in selected_features:
        selected_features.append(feature)

# Add some zipcode columns (location matters for house prices)
zipcode_cols = [col for col in df_encoded.columns if 'zipcode' in col][:10]  # Take first 10 zipcodes
selected_features.extend(zipcode_cols)

# Prepare final dataframe with selected features
df_final = df_encoded[['price'] + selected_features].copy()

print(f"\nFinal dataset shape: {df_final.shape}")
print("Selected features:", selected_features)

Applying one-hot encoding for zipcode...

Final dataset shape: (20467, 28)
Selected features: ['grade', 'sqft_living', 'sqft_living15', 'sqft_above', 'price_per_sqft', 'bathrooms', 'lat', 'bedrooms', 'floors', 'view', 'sqft_basement', 'total_area', 'sqft_lot', 'yr_renovated', 'renovated', 'age', 'sale_month', 'zipcode_98002', 'zipcode_98003', 'zipcode_98004', 'zipcode_98005', 'zipcode_98006', 'zipcode_98007', 'zipcode_98008', 'zipcode_98010', 'zipcode_98011', 'zipcode_98014']


## 4. Modeling

### 4.1 Split data into training and testing sets

In [None]:
X = df_final[selected_features]
y = df_final['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

Training set: (16373, 27)
Testing set: (4094, 27)


### 4.2 Function to evaluate models

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print(f"\nModel: {model_name}")
    print(f"MAE: ${mae:.2f}")
    print(f"MSE: ${mse:.2f}")
    print(f"RMSE: ${rmse:.2f}")
    print(f"R²: {r2:.4f}")
    print("-" * 50)

    return mae, mse, rmse, r2

### 4.3 Model 1: Linear Regression

In [None]:
print("Training Linear Regression model...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test)

# Evaluate model
lr_metrics = evaluate_model(y_test, y_pred_lr, "Linear Regression")

Training Linear Regression model...

Model: Linear Regression
MAE: $43414.28
MSE: $3751787667.20
RMSE: $61251.84
R²: 0.9099
--------------------------------------------------


### 4.4 Model 2: Decision Tree Regressor

In [None]:
print("Training Decision Tree model...")
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

# Evaluate model
dt_metrics = evaluate_model(y_test, y_pred_dt, "Decision Tree")

Training Decision Tree model...

Model: Decision Tree
MAE: $6696.88
MSE: $164117980.21
RMSE: $12810.85
R²: 0.9961
--------------------------------------------------


### 4.5 Model 3: Random Forest Regressor

In [None]:
print("Training Random Forest model...")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate model
rf_metrics = evaluate_model(y_test, y_pred_rf, "Random Forest")

Training Random Forest model...

Model: Random Forest
MAE: $2658.26
MSE: $54538812.98
RMSE: $7385.04
R²: 0.9987
--------------------------------------------------


### 4.6 Hyperparameter Tuning for Random Forest

In [None]:
print("Performing hyperparameter tuning for Random Forest...")
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# For quick execution, we'll use a smaller parameter grid
# Uncomment the below code and comment the above for a more comprehensive search
# param_grid = {
#     'n_estimators': [100],
#     'max_depth': [None, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,  # Use 3-fold cross-validation
    scoring='neg_mean_squared_error',
    n_jobs=-1  # Use all available cores
)

grid_search.fit(X_train, y_train)

# Best parameters
print("\nBest parameters:", grid_search.best_params_)

# Best model
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)

# Evaluate tuned model
best_rf_metrics = evaluate_model(y_test, y_pred_best_rf, "Random Forest (Tuned)")

Performing hyperparameter tuning for Random Forest...

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Model: Random Forest (Tuned)
MAE: $2658.26
MSE: $54538812.98
RMSE: $7385.04
R²: 0.9987
--------------------------------------------------


### 4.7 Feature Importance

In [None]:
print("\n=== Feature Importance from Random Forest Model ===")
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print(feature_importance.head(10))

# Visualize feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
plt.title('Top 15 Features by Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()


=== Feature Importance from Random Forest Model ===
           Feature  Importance
1      sqft_living    0.468636
4   price_per_sqft    0.398467
6              lat    0.122407
0            grade    0.008593
3       sqft_above    0.000755
2    sqft_living15    0.000196
15             age    0.000173
16      sale_month    0.000141
11      total_area    0.000140
5        bathrooms    0.000116


## 5. Model Comparison

In [None]:
print("\n=== Model Comparison ===")
models = ['Linear Regression', 'Decision Tree', 'Random Forest', 'Random Forest (Tuned)']
metrics = [lr_metrics, dt_metrics, rf_metrics, best_rf_metrics]

mae_values = [metric[0] for metric in metrics]
rmse_values = [metric[2] for metric in metrics]
r2_values = [metric[3] for metric in metrics]

# Create comparison plots
plt.figure(figsize=(15, 10))

plt.subplot(2, 2, 1)
sns.barplot(x=models, y=mae_values)
plt.title('MAE Comparison')
plt.ylabel('MAE ($)')
plt.xticks(rotation=45)

plt.subplot(2, 2, 2)
sns.barplot(x=models, y=rmse_values)
plt.title('RMSE Comparison')
plt.ylabel('RMSE ($)')
plt.xticks(rotation=45)

plt.subplot(2, 2, 3)
sns.barplot(x=models, y=r2_values)
plt.title('R² Comparison')
plt.ylabel('R²')
plt.xticks(rotation=45)

plt.subplot(2, 2, 4)
# Actual vs Predicted plot for best model
plt.scatter(y_test, y_pred_best_rf, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices (Random Forest Tuned)')

plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()

print("\n=== Best Model: Random Forest (Tuned) ===")
print(f"MAE: ${best_rf_metrics[0]:.2f}")
print(f"RMSE: ${best_rf_metrics[2]:.2f}")
print(f"R²: {best_rf_metrics[3]:.4f}")


=== Model Comparison ===

=== Best Model: Random Forest (Tuned) ===
MAE: $2658.26
RMSE: $7385.04
R²: 0.9987


## 6. Save the best model


In [None]:
print("\nSaving the best model...")
import joblib
joblib.dump(best_rf_model, 'house_price_prediction_model.pkl')
print("Model saved as 'house_price_prediction_model.pkl'")


Saving the best model...
Model saved as 'house_price_prediction_model.pkl'


## 7. Example prediction with the best model

In [None]:
print("\n=== Example Prediction ===")
# Create a sample house
sample_house = X_test.iloc[0].copy()
actual_price = y_test.iloc[0]

# Make prediction
predicted_price = best_rf_model.predict([sample_house])[0]

print(f"Actual price: ${actual_price:.2f}")
print(f"Predicted price: ${predicted_price:.2f}")
print(f"Difference: ${abs(actual_price - predicted_price):.2f}")
print(f"Percentage Error: {abs(actual_price - predicted_price) / actual_price * 100:.2f}%")

print("\n=== Project Completed Successfully ===")


=== Example Prediction ===
Actual price: $420000.00
Predicted price: $419067.50
Difference: $932.50
Percentage Error: 0.22%

=== Project Completed Successfully ===
