### Import Packages

In [9]:
import os
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint

### Google Drive (Optional)

In [10]:
# from google.colab import drive
# drive.mount('/content/drive')

### Global Variables

In [11]:
AUTOMODE = True

file_path = '../data_processing/data-1-year-normalized.csv'
df = pd.read_csv(file_path)
X_training, X_validation, Y_training, Y_validation = [], [], [], []

imputer = SimpleImputer(strategy='mean')   # deal with missing area

### Get Training and Validation Dataset

1. Preprocess data frame, convert all elements to int/float type, deal with NaN value in area column.
2. Split data frame into feature set and label set, and then seperate both into training dataset and validation dataset.

In [12]:
def DataPreprocessing(df):
    # 1.
    df = df.apply(pd.to_numeric, errors='coerce')
    df['area'] = imputer.fit_transform(df[['area']])

    # 2.
    features = df.drop('price', axis='columns')
    label = df['price']

    return train_test_split(features, label, test_size=0.3, random_state=3)

### Auto-Tuning

In [13]:
def AutoTuning(X_training, Y_training, num_iter):
    param = {
        'n_estimators': randint(100, 200),
        'max_depth': randint(10, 20),
        'min_samples_split': randint(2, 5),
        'min_samples_leaf': randint(1, 4),
        'random_state': randint(100, 200),
    }

    # automatically search for best parameters and the corresponding score
    random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param, n_iter=num_iter, cv=5, scoring='accuracy')
    print('Fitting ...')
    random_search.fit(X_training, Y_training)
    best_param = random_search.best_params_
    best_score = random_search.best_score_

    return best_param, round(best_score * 100, 3)

### Random Forest

In [14]:

def RandomForest(param, autoMode=False):

    def RandomForestClassifier_aux(param):
        classifier_auto = RandomForestClassifier(
            n_estimators=param['n_estimators'],
            criterion='entropy',
            max_depth=param['max_depth'],
            min_samples_split=param['min_samples_split'],
            min_samples_leaf=param['min_samples_leaf'],
            max_features='sqrt',
            class_weight='balanced',
            random_state=param['random_state'],
            n_jobs=-1
        )
        return classifier_auto
    
    print('Start building classifier ...')
    if autoMode is True:
        classifier = RandomForestClassifier_aux(param)
    else:   # 92.391 %
        classifier = RandomForestClassifier(
            n_estimators=173,
            criterion='entropy',
            max_depth=15,
            min_samples_split=3,
            min_samples_leaf=2,
            max_features='sqrt',
            class_weight='balanced',
            random_state=188,
            n_jobs=-1
        )
    print('[OK] Build classifier')

    classifier.fit(X_training, Y_training)

    return classifier

### Make Predictions

In [15]:
def MakePredictions(testing_dataset, classifier, validation_dataset=None):
    predictions = classifier.predict(testing_dataset)

    if validation_dataset is not None:
        # Specify the appropriate average parameter based on the multiclass problem
        average_type = 'weighted'  # 'micro', 'macro', 'weighted', or None

        # classification:
        acc = round((accuracy_score(validation_dataset, predictions) * 100), 3)
        cm = confusion_matrix(validation_dataset, predictions)
        precision = precision_score(validation_dataset, predictions, average=average_type)
        recall = recall_score(validation_dataset, predictions, average=average_type)
        f1 = f1_score(validation_dataset, predictions, average=average_type)
        # regression:
        mae = mean_absolute_error(validation_dataset, predictions)
        mse = mean_squared_error(validation_dataset, predictions)
        r2 = r2_score(validation_dataset, predictions)
        # output scores
        print(f'Accuracy score: {acc} %')
        print(f'Confusion Matrix: {cm}')
        print(f'Precision score: {precision}')
        print(f'Recall score: {recall}')
        print(f'F1 score: {f1}')
        print(f'Mean absolute error: {mae}')
        print(f'Mean squared error: {mse}')
        print(f'R2 score: {r2}')

    return predictions

In [16]:
X_training, X_validation, Y_training, Y_validation = DataPreprocessing(df)
print('[OK] DataPreprocessing')

final_param = None
if AUTOMODE is True:
    final_score = -1
    for _ in range(5):
        auto_param, auto_score = AutoTuning(X_training, Y_training, num_iter=10)
        if auto_score > final_score:
            final_param, final_score = auto_param, auto_score
        print(f'{_}/4 AutoTuning | Score: {final_score} | Param: {final_param}')
    print('[OK] AutoTuning')

# autoMode = False : 94.3% default
classifier = RandomForest(final_param, autoMode=AUTOMODE)
print('[OK] RandomForest')

Y_predictions_training = MakePredictions(X_training, classifier, Y_training)
print('[OK] MakePredictions w/ training dataset')

Y_predictions_validation = MakePredictions(X_validation, classifier)
print('[OK] MakePredictions w/ validation dataset')

[OK] DataPreprocessing
Fitting ...




0/4 AutoTuning | Score: 5.053 | Param: {'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 196, 'random_state': 194}
Fitting ...




1/4 AutoTuning | Score: 5.053 | Param: {'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 196, 'random_state': 194}
Fitting ...




2/4 AutoTuning | Score: 5.053 | Param: {'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 196, 'random_state': 194}
Fitting ...




3/4 AutoTuning | Score: 5.053 | Param: {'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 196, 'random_state': 194}
Fitting ...




4/4 AutoTuning | Score: 5.106 | Param: {'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 181, 'random_state': 112}
[OK] AutoTuning
Start building classifier ...
[OK] Build classifier
[OK] RandomForest
Accuracy score: 93.457 %
Confusion Matrix: [[2 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
Precision score: 0.9539508716137934
Recall score: 0.9345744680851064
F1 score: 0.9361072490766535
Mean absolute error: 53152.65957446808
Mean squared error: 139039315425.53192
R2 score: 0.9959387676302401
[OK] MakePredictions w/ training dataset
[OK] MakePredictions w/ validation dataset


  _warn_prf(average, modifier, msg_start, len(result))
