# Preprocessing & Feature Engineering

In [311]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [312]:
# Load the cleaned dataset (bangkok_cleaned.csv) into the working DataFrame.
preprocess_data = pd.read_csv('../Dataset/bangkok_cleaned.csv')

In [313]:
# Show the (rows, columns) size of the loaded DataFrame.
preprocess_data.shape

(14208, 16)

In [314]:
# Count the missing values in each column.
preprocess_data.isnull().sum()

id                            0
province                      0
district                      0
property_type                 0
bedrooms                      0
baths                         0
floor_area                    0
nearby_stations               0
nearby_station_distance    7005
nearby_supermarkets           0
nearby_shops                  0
facilities                    0
price                         0
station_name                  0
station_distance              0
num_facilities                0
dtype: int64

### Change the values from strings to lists in the 'facilities' column

In [316]:
def extract_facility(facilities):
    """Convert one 'facilities' cell into a list of clean facility names.

    Parameters
    ----------
    facilities : str, list, tuple, None or NaN
        Usually the string form of a list, e.g. "['Pool', 'Gym']".
        An already-parsed list/tuple is accepted too, so re-running the
        cell on converted data does not fail.

    Returns
    -------
    list of str or str
        The facility names with surrounding whitespace and quote
        characters removed. The string 'None' is returned when the value
        is missing or contains no facility names.
    """
    if isinstance(facilities, (list, tuple)):
        # Already parsed: only normalise the individual items.
        tokens = [str(item) for item in facilities]
    else:
        # Check if the value is NaN (missing value)
        if pd.isnull(facilities):
            return 'None'  # Return 'None' if there are no facilities listed

        text = str(facilities).strip()
        # Remove the enclosing brackets only when they are really there,
        # instead of blindly cutting the first and last character.
        if text.startswith('[') and text.endswith(']'):
            text = text[1:-1]
        tokens = text.split(',')

    # Items after the first carry a leading space (", " separator) and every
    # item may be wrapped in quotes; strip both so equal facilities compare equal.
    cleaned = [token.strip().strip('\'"') for token in tokens]
    facility_list = [name for name in cleaned if name]

    # An empty string or an empty list means "no facilities".
    return facility_list if facility_list else 'None'

# Convert every entry of the 'facilities' column with extract_facility, element by element
preprocess_data['facilities'] = preprocess_data['facilities'].map(extract_facility)


### Dummy Variables

These are binary variables created during the one-hot encoding process, where 1 represents the presence of a category and 0 represents its absence.


In [318]:
# Spread 'station_distance' into one column per 'station_name' (one row per original row);
# cells for stations that do not apply to a row are filled with 0.
pivot = preprocess_data.pivot_table(
    index=preprocess_data.index,
    columns='station_name',
    values='station_distance',
    aggfunc='sum',
).fillna(0)

# Prefix every station column with 'stat_'
pivot.columns = [f'stat_{col}' for col in pivot.columns]

# Replace the raw station columns with the pivoted ones
preprocess_data = pd.concat(
    [
        preprocess_data.drop(columns=['nearby_station_distance', 'station_name', 'station_distance']),
        pivot,
    ],
    axis=1,
)

In [319]:
# Before creating dummy variables for facilities, we need to explode the list of facilities.
# Only the 'facilities' column is exploded: each list item gets its own row and the
# original row index is repeated, so the wide frame itself is never duplicated.
facility_exploded = preprocess_data[['facilities']].explode('facilities')

# Normalise the names: remove quote characters and surrounding whitespace so that
# "'Pool'" and " 'Pool'" end up in the same dummy column.
facility_names = facility_exploded['facilities'].str.replace("'", '', regex=False).str.strip()

# Create dummy variables
facility_dummies = pd.get_dummies(facility_names, prefix='faci')

# Collapse back to one row per original index. max() keeps the indicator binary
# (1 = present, 0 = absent) even when a facility is listed more than once.
facility_pivoted = facility_dummies.groupby(level=0).max().astype(int)

# Concatenate the dummy variables with the original DataFrame after dropping the 'facilities' column
preprocess_data = pd.concat([preprocess_data.drop(columns=['facilities']), facility_pivoted], axis=1)


In [320]:
# One-hot encode the plain categorical columns, one after another, each with its own prefix.
for column, prefix in (("property_type", "type"), ("district", "dist"), ("province", "prov")):
    preprocess_data = pd.get_dummies(data=preprocess_data, columns=[column], prefix=prefix)

### Polynomial Features

This involves creating new features by raising existing features to a power and by multiplying features together (interaction terms), typically to capture non-linear relationships in the data.


In [322]:
# Numeric base columns that feed the polynomial expansion
starter_features = [
    'bedrooms',
    'baths',
    'floor_area',
    'nearby_stations',
    'nearby_supermarkets',
    'num_facilities',
]

In [323]:
# Select features for polynomial transformation
data_poly = preprocess_data[starter_features]

# Generate polynomial and interaction terms up to degree 3, without the bias term
poly = PolynomialFeatures(include_bias=False, degree=3)
X_poly = poly.fit_transform(data_poly)

# Wrap the result in a DataFrame that reuses the source index, so the concat
# below aligns row by row instead of relying on a default 0..n-1 index.
poly_df = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(starter_features),
    index=data_poly.index,
)

# The degree-1 terms are exact copies of the starter columns that are already in
# preprocess_data; drop them so the final table has no duplicate column names.
poly_df = poly_df.drop(columns=starter_features)

# Append the higher-order terms to the original data
preprocess_data = pd.concat([preprocess_data, poly_df], axis=1)

### Save the file for model tuning

In [325]:
# Write the engineered feature table to CSV; index=False leaves the row index out of the file.
preprocess_data.to_csv('../Dataset/bangkok_preprocess.csv',index = False)