In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error, mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the raw listings, discard the two columns that are not used downstream, and
# convert 'Listing Area' from text carrying a ' m²' suffix into an int64 column.
df = (
    pd.read_csv("../data/Rumah.comdataset_v4.csv")
    .drop(columns=['Property Link', 'ID'])
    .assign(**{'Listing Area': lambda listings: listings['Listing Area'].str.replace(' m²', '')})
    .astype({'Listing Area': 'int64'})
)
# NOTE: the shape before outlier removal is not printed by this cell.

In [3]:
# Outlier removal: drop every row that has an IQR outlier in any numeric column,
# then apply two hard lower bounds on Price and Listing Area.

# Upper bound on the number of filtering passes (the original loop always ran 10).
MAX_OUTLIER_PASSES = 10
# Rows with Price at or below this value are discarded
# (currency unit is not shown in this notebook -- presumably IDR; confirm).
MIN_PRICE = 100000000
# Rows with 'Listing Area' at or below this value (m², per the suffix stripped on load)
# are discarded.
MIN_LISTING_AREA = 21

# Placeholder kept from the original cell; it is never populated.
outliers = pd.DataFrame()

# Every int64/float64 column of df is screened (this includes Price when it is numeric).
numerical_cols = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]

# Quartiles are recomputed on the already-filtered frame, so removing rows can expose
# new outliers. Repeat for at most MAX_OUTLIER_PASSES passes and stop as soon as a full
# pass removes nothing: from that point every further pass would see the same data and
# remove nothing either, so the result is identical to always running all passes.
for i in range(MAX_OUTLIER_PASSES):
    rows_before_pass = len(df)
    for col in numerical_cols:
        # Interquartile-range fences: keep values within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        df = df[(df[col] >= Q1 - 1.5*IQR) & (df[col] <= Q3 + 1.5*IQR)]
    if len(df) == rows_before_pass:
        break

# Hard lower bounds applied after the IQR filtering
index1 = df[df.Price <= MIN_PRICE].index
df = df.drop(index1)

index2 = df[df['Listing Area'] <= MIN_LISTING_AREA].index
df = df.drop(index2)

# Shape of the dataset after outlier removal and threshold filtering
print(df.shape)

(12886, 7)


In [4]:
# df.to_csv("../data/Rumah.comdataset_v4_modified.csv")

In [4]:
# Separate the regression target (Price) from the feature columns.
y = df['Price']
X = df.drop(columns=['Price'])

In [5]:
# Hold out 20% of the rows for validation. The seed is fixed so that the split --
# and therefore the exported train/test CSVs and the reported MAPE -- is the same
# on every Restart & Run All (0 matches the seed used for the random forest).
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [6]:
# Re-attach the target to the validation features, side by side (aligned on the index).
test_df = pd.concat(
    [X_valid_full, y_valid],
    axis="columns",
)

In [7]:
# Persist the validation rows; the index is written too (to_csv's default index=True).
test_df.to_csv(path_or_buf="../data/test_case.csv")

In [8]:
# Same export for the training split: features and target side by side, index included.
train_df = pd.concat(
    [X_train_full, y_train],
    axis="columns",
)
train_df.to_csv(path_or_buf="../data/train_case.csv")

In [9]:
# Partition the object-dtype feature columns by the number of distinct values they
# hold in the training split: fewer than 10 -> low cardinality, otherwise high.
low_cardinality_cols, high_cardinality_cols = [], []
for cname in X_train_full.columns:
    if X_train_full[cname].dtype != "object":
        continue  # only text columns are classified here
    if X_train_full[cname].nunique() < 10:
        low_cardinality_cols.append(cname)
    else:
        high_cardinality_cols.append(cname)
low_cardinality_cols, high_cardinality_cols

(['Jakarta Division'], ['Street Address', 'Certificate'])

In [10]:
# Feature columns stored as int64 or float64 in the training split.
numerical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

In [11]:
# Model inputs: categorical columns first (low then high cardinality), numeric last.
my_cols = [*low_cardinality_cols, *high_cardinality_cols, *numerical_cols]
# Copies, so later edits do not touch X_train_full / X_valid_full.
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

In [12]:
# Collect the names of the object-dtype (categorical) columns of X_train
object_cols = [
    cname
    for cname, is_object in (X_train.dtypes == 'object').items()
    if is_object
]

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['Jakarta Division', 'Street Address', 'Certificate']


In [13]:
# Baseline model: ordinal-encode the categorical columns, fit a random forest on the
# training split and report the validation MAPE.

# Make copies to avoid changing the original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Fit the encoder on the training split only. Categories that were not seen during
# fitting are encoded as -1 instead of raising at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

# random_state is fixed so the fitted forest is reproducible for a given split
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(label_X_train, y_train)
preds = model.predict(label_X_valid)
mape = mean_absolute_percentage_error(y_valid, preds)
print(mape)

0.32118974156460756


In [14]:
filename = "../data/random_forest.pickle"

# save model -- the context manager closes (and flushes) the file even if pickling
# raises, instead of leaving the handle open until garbage collection
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)


In [15]:
# load model -- only unpickle files you trust (this one is written by this notebook);
# pickle.load can execute arbitrary code from a tampered file
with open(filename, "rb") as model_file:
    loaded_model = pickle.load(model_file)

# the reloaded model should reproduce the MAPE of the in-memory model
y_predicted = loaded_model.predict(label_X_valid)
mape = mean_absolute_percentage_error(y_valid, y_predicted)
print(mape)

0.32118974156460756


In [16]:
# Sanity check: number of rows in the validation target
validation_shape = y_valid.shape
print(validation_shape)

(2582,)


In [17]:
filename1 = "../data/ordinal_encoder.pickle"

# save the fitted encoder so the same category -> code mapping can be reapplied later;
# the context manager guarantees the file is closed and flushed
with open(filename1, "wb") as encoder_file:
    pickle.dump(ordinal_encoder, encoder_file)