### Import Packages

In [14]:
import os
import numpy as np
import pandas as pd

from scipy.stats import randint
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV )
from sklearn.metrics import (
    accuracy_score,
    r2_score )

### Google Drive (Optional)

In [42]:
# from google.colab import drive
# drive.mount('/content/drive')

### Global Variables

In [15]:
# random into interval
# training,validation (early stopping), testing
# validation tuning parameters

AUTOMODE = True
imputer = SimpleImputer(strategy='mean')   # deal with missing area

train_file_path = '../data_processing/Final-Processed-Data/Train/train-data-1-year.csv'
test_file_path = '../data_processing/Final-Processed-Data/Test/test-data-1-year.csv'
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

X_training, X_validation, Y_training, Y_validation = [], [], [], []

### Get Training and Validation Dataset
- training: train model
- validation: early stopping
- testing: evaluate model

In [16]:
def DataPreprocessing(df):
    # Prepocess data frame, convert all elements to int/float type, deal with NaN value in area column.
    df = df.apply(pd.to_numeric, errors='coerce')
    df['area'] = imputer.fit_transform(df[['area']])

    # Split data frame into feature set and label set, then seperate both into training and validation dataset.
    features = df.drop('price', axis='columns')
    label = df['price']

    # split training dataset into training and validation dataset, for early stopping
    return train_test_split(features, label, test_size=0.3, random_state=3)

X_training, X_validation, Y_training, Y_validation = DataPreprocessing(train_df)
print('[OK] DataPreprocessing')
print(f'{type(X_training)} {X_training.shape}')
print(f'{type(Y_training)} {Y_training.shape}')
print(f'{type(X_validation)} {X_validation.shape}')
print(f'{type(Y_validation)} {Y_validation.shape}')

[OK] DataPreprocessing
<class 'pandas.core.frame.DataFrame'> (1765, 17)
<class 'pandas.core.series.Series'> (1765,)
<class 'pandas.core.frame.DataFrame'> (757, 17)
<class 'pandas.core.series.Series'> (757,)


In [17]:
# parameters
learning_rate = 0.001
num_iterations = 50
rf_parameters = {
    'n_estimators': 151,
    'criterion': 'entropy',
    'max_depth': 15,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'class_weight': 'balanced',
    'random_state': 156,
    'n_jobs': -1
}

### Auto-Tuning (Randomized Cross Validation)

In [30]:
def AutoTuningRandomized(X_training, Y_training):
    param = {
        'n_estimators': randint(100, 200),
        'max_depth': randint(10, 20),
        'min_samples_split': randint(2, 5),
        'min_samples_leaf': randint(1, 4),
        'random_state': randint(100, 200),
    }

    # automatically search for best parameters and the corresponding score
    result = RandomizedSearchCV (
        RandomForestClassifier(),
        param_distributions=param,
        scoring='r2',
        verbose=4,
        cv=24
    )

    result.fit(X_training, Y_training)
    best_param = result.best_params_
    best_score = result.best_score_
    print(f'Best param in rCV: {best_param}')
    print(f'Best score in rCV: {round(best_score * 100, 3)}')

    return best_param

params_randomized = AutoTuningRandomized(X_training, Y_training)

Fitting 24 folds for each of 10 candidates, totalling 240 fits




[CV 1/24] END max_depth=19, min_samples_leaf=2, min_samples_split=2, n_estimators=102, random_state=175;, score=0.770 total time=   0.7s
[CV 2/24] END max_depth=19, min_samples_leaf=2, min_samples_split=2, n_estimators=102, random_state=175;, score=0.737 total time=   0.7s
[CV 3/24] END max_depth=19, min_samples_leaf=2, min_samples_split=2, n_estimators=102, random_state=175;, score=0.740 total time=   0.7s
[CV 4/24] END max_depth=19, min_samples_leaf=2, min_samples_split=2, n_estimators=102, random_state=175;, score=0.472 total time=   0.7s
[CV 5/24] END max_depth=19, min_samples_leaf=2, min_samples_split=2, n_estimators=102, random_state=175;, score=0.602 total time=   0.7s
[CV 6/24] END max_depth=19, min_samples_leaf=2, min_samples_split=2, n_estimators=102, random_state=175;, score=0.711 total time=   0.7s
[CV 7/24] END max_depth=19, min_samples_leaf=2, min_samples_split=2, n_estimators=102, random_state=175;, score=0.603 total time=   0.7s
[CV 8/24] END max_depth=19, min_samples_l

### Auto-Tuning (Grid Cross Validation)

In [None]:
def AutoTuningGrid(X_training, Y_training):
    params = {
        'n_estimators': list(range(100, 201, 5)),
        'max_depth': list(range(10, 21, 2)),
        'min_samples_split': list(range(2, 5, 1)),
        'min_samples_leaf': list(range(1, 4, 1)),
        'random_state': list(range(100, 201, 5))
    }

    result = GridSearchCV (
        RandomForestClassifier(),
        param_grid=params,
        scoring='r2',
        verbose=4,
        cv=3
    )

    result.fit(X_training, Y_training)
    best_param = result.best_params_
    best_score = result.best_score_
    print(f'Best param in gCV: {best_param}')
    print(f'Best score in gCV: {round(best_score * 100, 3)}')

    return best_param

params_grid = AutoTuningGrid(X_training, Y_training)

### Random Forest

In [46]:

def RandomForest(param, autoMode=False):

    def RandomForestClassifier_aux(param):
        classifier_auto = RandomForestClassifier(
            n_estimators=param['n_estimators'],
            criterion='entropy',
            max_depth=param['max_depth'],
            min_samples_split=param['min_samples_split'],
            min_samples_leaf=param['min_samples_leaf'],
            max_features='sqrt',
            class_weight='balanced',
            random_state=param['random_state'],
            n_jobs=-1
        )
        return classifier_auto
    
    print('Start building classifier ...')
    if autoMode is True:
        classifier = RandomForestClassifier_aux(param)
    else:   # 92.391 %
        classifier = RandomForestClassifier(
            n_estimators=131,
            criterion='entropy',
            max_depth=15,
            min_samples_split=2,
            min_samples_leaf=3,
            max_features='sqrt',
            class_weight='balanced',
            random_state=156,
            n_jobs=-1
        )
    print('[OK] Build classifier')

    classifier.fit(X_training, Y_training)

    return classifier

### Make Predictions

In [47]:
def MakePredictions(testing_dataset, classifier, validation_dataset=None):
    predictions = classifier.predict(testing_dataset)

    if validation_dataset is not None:
        # Specify the appropriate average parameter based on the multiclass problem
        average_type = 'weighted'  # 'micro', 'macro', 'weighted', or None
        acc = round((accuracy_score(validation_dataset, predictions) * 100), 3)
        r2 = r2_score(validation_dataset, predictions)
        # output scores
        print(f'Accuracy score: {acc} %')
        print(f'R2 score: {r2}')

    return predictions

In [48]:
# autoMode = False : 94.3% default
classifier = RandomForest(params_randomized, autoMode=AUTOMODE)
print('[OK] RandomForest')

Y_predictions_training = MakePredictions(X_training, classifier, Y_training)
print('[OK] MakePredictions w/ training dataset')

Y_predictions_validation = MakePredictions(X_validation, classifier)
print('[OK] MakePredictions w/ validation dataset')

[OK] DataPreprocessing
Start building classifier ...
[OK] Build classifier
[OK] RandomForest
Accuracy score: 41.529 %
R2 score: 0.908331261556717
[OK] MakePredictions w/ training dataset
[OK] MakePredictions w/ validation dataset
