In [3]:
import os
import sys
import warnings

import pandas as pd
import seaborn as sns

# Add the project's src directory to the import path so local modules can be
# imported. The membership check keeps the cell idempotent: re-running it does
# not append the same entry to sys.path again.
SRC_DIR = os.path.abspath(os.path.join("..", "src"))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and dtype warnings; consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# DataFrame display settings: show all columns, and render floats with a
# thousands separator and two decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:,.2f}".format)

# Visualization settings: apply seaborn's default styling and define shared
# title parameters.
sns.set()
title_size = 18
y_title = 1.01
title_weight = "bold"

# Seed value for stochastic steps.
RANDOM_STATE = 123

# **House Price Prediction Data Science Project**

# 3. Data Preparation

In [4]:
# Read the raw training split and the public test split from disk.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/train_data.csv",
        "../data/raw/public_test_data.csv",
    )
)

In [6]:
from data_processing import (
    data_integration,
    feature_engineering,
    feature_extrction,
    log_transform,
    mean_encoded,
    outliers_remove,
    reformatted,
    save_data,
    select_data,
)


## 3.1. Select Data

In [7]:
# Run both raw splits through `select_data`, train first, then test.
train_selected, test_selected = (
    select_data(raw_frame) for raw_frame in (train_df, test_df)
)

In [8]:
# Preview the first five rows of the selected test split.
test_selected.head(n=5)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,price
0,4,2.5,3080,35430,2.0,0,0,3,3080,0,1997,0,9788-16090 NE Quail Creek Dr,Redmond,WA 98053,635000.0
1,3,2.5,2570,3600,2.5,0,0,3,2570,0,2007,0,1474-1476 29th Pl NE,Issaquah,WA 98029,680000.0
2,3,1.75,1400,6956,1.0,0,0,4,1400,0,1957,2001,10642 SE 200th St,Kent,WA 98031,230000.0
3,3,2.25,1480,5400,2.0,0,0,4,1480,0,1914,1945,816 Martin Luther King Jr Way,Seattle,WA 98122,600000.0
4,4,3.5,4390,11600,2.0,0,3,3,3060,1330,1990,2009,2758 68th Ave SE,Mercer Island,WA 98040,1610000.0


## 3.2. Clean Data

### 3.2.1. Outliers Handling

In [9]:
# Outlier removal is applied to the training split only; the test split is
# left untouched.
# NOTE(review): this cell rebinds its own input name, so running it a second
# time feeds the already-filtered frame back into `outliers_remove`.
# Re-run the "Select Data" cell first if this cell has to be repeated.
train_selected = train_selected.pipe(outliers_remove)

## 3.3. Construct Data

In [10]:
# Derive the engineered columns for both splits with the same function.
train_cleaned, test_cleaned = (
    feature_engineering(selected_frame)
    for selected_frame in (train_selected, test_selected)
)

In [11]:
# Preview the first five rows of the engineered training split.
train_cleaned.head(n=5)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,price,basement,renovated,era_category
0,3,1.0,1330,15678,1.0,0,0,3,900,430,1984,0,3009 229th Pl NE,Sammamish,WA 98074,405000.0,1,0,Early Modern
1,4,2.5,2700,9320,2.0,0,0,4,2700,0,1994,0,1317 5th Ln,Kirkland,WA 98033,837500.0,0,0,Early Modern
2,2,1.0,790,8424,1.0,0,0,4,790,0,1953,1983,556 N 167th St,Shoreline,WA 98133,268500.0,0,1,Post-World War II
3,3,2.5,1800,2700,2.0,0,0,3,1800,0,2011,0,2803 SW Bataan St,Seattle,WA 98126,365000.0,0,0,Modern
4,4,2.5,2340,11784,2.0,0,0,3,2340,0,1997,0,24200-24498 144th Ave SE,Kent,WA 98042,330000.0,0,0,Early Modern


In [12]:
# Build the street-text features for both splits in one call: the train
# street column is passed first and the test street column second.
train_street_tfidf, test_street_tfidf = feature_extrction(
    train_cleaned.loc[:, "street"],
    test_cleaned.loc[:, "street"],
)

## 3.4. Integrate Data

In [13]:
# External lookup table that is passed to `data_integration` for both splits.
us_cities = pd.read_csv(
    filepath_or_buffer="../data/external/mapping city latitude longitude.csv"
)

In [14]:
# Integrate the external city table into both splits.
# NOTE(review): the cell rebinds its own inputs, so a second run passes the
# already-integrated frames back into `data_integration`. Re-run the
# "Construct Data" cell first if this cell has to be repeated.
train_cleaned, test_cleaned = (
    data_integration(cleaned_frame, us_cities)
    for cleaned_frame in (train_cleaned, test_cleaned)
)

## 3.5. Reformatted Data

In [15]:
# Apply the same reformatting step to the integrated train and test frames.
train_reformatted, test_reformatted = (
    reformatted(integrated_frame)
    for integrated_frame in (train_cleaned, test_cleaned)
)

In [16]:
# Preview the first five rows of the reformatted training split.
train_reformatted.head(n=5)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,statezip,price,basement,renovated,era_category,lat,lng
0,3,1.0,1330,15678,1.0,0,0,3,900,430,1984,0,3009 229th Pl NE,98074,405000.0,1,0,Early Modern,47.6,-122.04
1,4,2.5,2700,9320,2.0,0,0,4,2700,0,1994,0,1317 5th Ln,98033,837500.0,0,0,Early Modern,47.7,-122.21
2,2,1.0,790,8424,1.0,0,0,4,790,0,1953,1983,556 N 167th St,98133,268500.0,0,1,Post-World War II,47.76,-122.34
3,3,2.5,1800,2700,2.0,0,0,3,1800,0,2011,0,2803 SW Bataan St,98126,365000.0,0,0,Modern,47.62,-122.32
4,4,2.5,2340,11784,2.0,0,0,3,2340,0,1997,0,24200-24498 144th Ave SE,98042,330000.0,0,0,Early Modern,47.39,-122.21


## 3.6. Data Transformation

### 3.6.1. Mean Encoding

In [17]:
# Mean price per era category, computed from the training split only; the
# same mapping is then passed to `mean_encoded` for both splits.
era_map = (
    train_reformatted
    .groupby("era_category")["price"]
    .mean()
)

In [18]:
# Encode both splits with the training-derived `era_map`.
train_transformed, test_transformed = (
    mean_encoded(reformatted_frame, era_map)
    for reformatted_frame in (train_reformatted, test_reformatted)
)

### 3.6.2. Log Transformation

In [19]:
# Log-transform both splits.
# NOTE(review): the cell rebinds its own inputs, so running it twice applies
# `log_transform` to already-transformed frames. Re-run the "Mean Encoding"
# cell first if this cell has to be repeated.
train_transformed, test_transformed = (
    log_transform(encoded_frame)
    for encoded_frame in (train_transformed, test_transformed)
)

In [20]:
# Attach the street TF-IDF features column-wise to each transformed split.
# pd.concat(axis=1) aligns rows on index labels with an outer join. The TF-IDF
# frames were built before the integrate / reformat / transform steps, so
# their index is not guaranteed to match the transformed frames; a mismatch
# would silently add rows filled with NaN.
train_processed = pd.concat([train_transformed, train_street_tfidf], axis=1)
test_processed = pd.concat([test_transformed, test_street_tfidf], axis=1)

# Guard: a correctly aligned concat keeps the row count of both inputs.
assert len(train_processed) == len(train_transformed) == len(train_street_tfidf), (
    "train_transformed and train_street_tfidf are not index-aligned"
)
assert len(test_processed) == len(test_transformed) == len(test_street_tfidf), (
    "test_transformed and test_street_tfidf are not index-aligned"
)

In [21]:
# Persist both processed splits, train first, then test.
for processed_frame, output_path in (
    (train_processed, "../data/processed/train_processed.csv"),
    (test_processed, "../data/processed/test_processed.csv"),
):
    save_data(processed_frame, output_path)

In [22]:
# Separate the model inputs from the target: "price" is the target, and the
# raw "street" column is dropped from the inputs together with it.
X_train, y_train = (
    train_processed.drop(columns=["price", "street"]),
    train_processed["price"],
)
X_test, y_test = (
    test_processed.drop(columns=["price", "street"]),
    test_processed["price"],
)

## 3.7. Feature Selection (Wrapper Method)

In [23]:
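# NOTE: the recursive feature elimination search below is kept for reference
# only and is not executed; the next cell hard-codes a feature list instead.
# NOTE(review): `X_train_new`, `XGBRegressor` and `RFECV` are not defined or
# imported anywhere in the visible notebook; define/import them before
# re-enabling this cell.

# regressor = XGBRegressor(random_state=RANDOM_STATE)

# rfe = RFECV(estimator=regressor, cv=10, scoring="neg_root_mean_squared_error").fit(
#     X_train_new, y_train
# )

# print(f"optimal number of features selected : {rfe.n_features_}")
# print(f"number of features in : {rfe.n_features_in_}")
# print(f"the feature ranking : {rfe.ranking_}")
# print(f"boolean mask of selected features : {rfe.support_}")
# print(f"selected features name : {rfe.get_feature_names_out()}")

# selected_features = rfe.get_feature_names_out().tolist()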

In [24]:
# Hard-coded feature subset used to slice X_train / X_test.
# Order is preserved exactly, since it determines the column order downstream.
selected_features = [
    # house attributes
    "bathrooms", "sqft_living", "sqft_lot", "floors",
    "waterfront", "view", "condition", "sqft_above",
    "yr_built", "yr_renovated",
    # location and encoded era
    "statezip", "era_category", "lat", "lng",
    # presumably street TF-IDF token columns — TODO confirm
    "10th", "ave", "dr", "nw", "st", "sw",
]

In [25]:
# Keep only the selected feature columns, in list order, for both splits.
X_train_selected, X_test_selected = (
    feature_frame[selected_features] for feature_frame in (X_train, X_test)
)

In [26]:
# Alternative, smaller feature subset from another feature-selection result.
# Order is preserved exactly.
selected_features_2 = [
    "sqft_living", "sqft_lot", "view", "sqft_basement",
    "statezip", "lat", "lng", "waterfront",
]

# Slice both splits down to the alternative subset.
X_train_selected_2, X_test_selected_2 = (
    feature_frame[selected_features_2] for feature_frame in (X_train, X_test)
)