### Import Packages

In [5]:
import os
import numpy as np
import pandas as pd

from scipy.stats import randint
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV )
from sklearn.metrics import (
    accuracy_score,
    r2_score )

### Global Variables

In [6]:
# random into interval
# training,validation (early stopping), testing
# validation tuning parameters

AUTOMODE = True
imputer = SimpleImputer(strategy='mean')   # deal with missing area

train_file_path = '../data_processing/Unfiltered-Data/Train/train-data-10-year.csv'
test_file_path = '../data_processing/Unfiltered-Data/Test/test-data-10-year.csv'
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

X_training, X_validation, Y_training, Y_validation = [], [], [], []

### Get Training and Validation Dataset
- training: train model
- validation: early stopping
- testing: evaluate model

In [8]:
def DataPreprocessing(train_df, test_df):
    # Prepocess data frame, convert all elements to int/float type, deal with NaN value in area column.
    train_df = train_df.apply(pd.to_numeric, errors='coerce')
    train_df['area'] = imputer.fit_transform(train_df[['area']])

    # Split data frame into feature set and label set, then seperate both into training and validation dataset.
    features = train_df.drop('price', axis='columns')
    label = train_df['price']

    X_training, X_validation, Y_training, Y_validation = train_test_split(features, label, test_size=0.3, random_state=3)
    X_testing = test_df.drop('price', axis='columns')
    Y_testing = test_df['price']

    # split training dataset into training and validation dataset, for early stopping
    return X_training, X_validation, Y_training, Y_validation, X_testing, Y_testing

X_training, X_validation, Y_training, Y_validation, X_testing, Y_testing = DataPreprocessing(train_df, test_df)
print('[OK] DataPreprocessing')
print(f'{type(X_training)} {X_training.shape}')
print(f'{type(Y_training)} {Y_training.shape}')
print(f'{type(X_validation)} {X_validation.shape}')
print(f'{type(Y_validation)} {Y_validation.shape}')
print(f'{type(X_testing)} {X_testing.shape}')
print(f'{type(Y_testing)} {Y_testing.shape}')

[OK] DataPreprocessing
<class 'pandas.core.frame.DataFrame'> (38037, 24)
<class 'pandas.core.series.Series'> (38037,)
<class 'pandas.core.frame.DataFrame'> (16302, 24)
<class 'pandas.core.series.Series'> (16302,)
<class 'pandas.core.frame.DataFrame'> (6038, 24)
<class 'pandas.core.series.Series'> (6038,)


In [9]:
# parameters
num_iterations = 50
rf_parameters = {
    'n_estimators': 167,
    'criterion': 'entropy',
    'max_depth': 14,
    'min_samples_split': 3,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'class_weight': 'balanced',
    'random_state': 101,
    'n_jobs': -1
}

### Randomized Cross Validation

In [10]:
def AutoTuningRandomized(X_training, Y_training):
    param = {
        'n_estimators': randint(100, 200),
        'max_depth': randint(10, 20),
        'min_samples_split': randint(2, 5),
        'min_samples_leaf': randint(1, 4),
        'random_state': randint(100, 200),
    }

    # automatically search for best parameters and the corresponding score
    result = RandomizedSearchCV (
        RandomForestClassifier(),
        param_distributions=param,
        scoring='r2',
        verbose=4,
        cv=3,
        n_iter=3
    )

    result.fit(X_training, Y_training)
    best_param = result.best_params_
    best_score = result.best_score_
    print(f'Best param in rCV: {best_param}')
    print(f'Best score in rCV: {round(best_score * 100, 3)}')

    return best_param

params_randomized = AutoTuningRandomized(X_training, Y_training)

Fitting 3 folds for each of 3 candidates, totalling 9 fits




[CV 1/3] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=173, random_state=142;, score=0.816 total time=  38.3s
[CV 2/3] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=173, random_state=142;, score=0.812 total time=  37.0s
[CV 3/3] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=173, random_state=142;, score=0.817 total time=  37.7s
[CV 1/3] END max_depth=18, min_samples_leaf=3, min_samples_split=3, n_estimators=196, random_state=158;, score=0.824 total time=  48.7s
[CV 2/3] END max_depth=18, min_samples_leaf=3, min_samples_split=3, n_estimators=196, random_state=158;, score=0.829 total time=  48.1s
[CV 3/3] END max_depth=18, min_samples_leaf=3, min_samples_split=3, n_estimators=196, random_state=158;, score=0.827 total time=  48.8s
[CV 1/3] END max_depth=17, min_samples_leaf=1, min_samples_split=2, n_estimators=181, random_state=189;, score=0.832 total time= 1.0min
[CV 2/3] END max_depth=17, min_samples_leaf=1, m

### Grid Cross Validation

In [13]:
def AutoTuningGrid(X_training, Y_training):

    params = {
        'n_estimators': list(range(100, 201, 5)),
        'max_depth': [16],
        'min_samples_split': [3],
        'min_samples_leaf': [2],
        'random_state': [150]
    }

    # params = {
    #     'n_estimators': list(range(100, 201, 5)),
    #     'max_depth': list(range(10, 21, 2)),
    #     'min_samples_split': list(range(2, 5, 1)),
    #     'min_samples_leaf': list(range(1, 4, 1)),
    #     'random_state': list(range(100, 201, 5))
    # }

    result = GridSearchCV (
        RandomForestClassifier(),
        param_grid=params,
        scoring='r2',
        verbose=4,
        cv=3
    )

    result.fit(X_training, Y_training)
    best_param = result.best_params_
    best_score = result.best_score_
    print(f'Best param in gCV: {best_param}')
    print(f'Best score in gCV: {round(best_score * 100, 3)}')

    return best_param

params_grid = AutoTuningGrid(X_training, Y_training)

Fitting 3 folds for each of 21 candidates, totalling 63 fits




[CV 1/3] END max_depth=16, min_samples_leaf=2, min_samples_split=3, n_estimators=100, random_state=150;, score=0.824 total time=  25.5s
[CV 2/3] END max_depth=16, min_samples_leaf=2, min_samples_split=3, n_estimators=100, random_state=150;, score=0.820 total time=  24.4s
[CV 3/3] END max_depth=16, min_samples_leaf=2, min_samples_split=3, n_estimators=100, random_state=150;, score=0.813 total time=  24.5s
[CV 1/3] END max_depth=16, min_samples_leaf=2, min_samples_split=3, n_estimators=105, random_state=150;, score=0.825 total time=  25.9s
[CV 2/3] END max_depth=16, min_samples_leaf=2, min_samples_split=3, n_estimators=105, random_state=150;, score=0.822 total time=  25.6s
[CV 3/3] END max_depth=16, min_samples_leaf=2, min_samples_split=3, n_estimators=105, random_state=150;, score=0.815 total time=  25.9s
[CV 1/3] END max_depth=16, min_samples_leaf=2, min_samples_split=3, n_estimators=110, random_state=150;, score=0.827 total time=  27.1s
[CV 2/3] END max_depth=16, min_samples_leaf=2, m

### Random Forest

In [9]:

def RandomForest(param, autoMode=False):


    def RandomForestClassifier_aux(param):
        classifier_auto = RandomForestClassifier(
            n_estimators=param['n_estimators'],
            criterion='entropy',
            max_depth=param['max_depth'],
            min_samples_split=param['min_samples_split'],
            min_samples_leaf=param['min_samples_leaf'],
            max_features='sqrt',
            class_weight='balanced',
            random_state=param['random_state'],
            n_jobs=-1
        )
        return classifier_auto
    
    
    print('Start building classifier ...')
    if autoMode is True:
        classifier = RandomForestClassifier_aux(param)
    else:   # manually
        classifier = RandomForestClassifier(
            n_estimators=131,
            criterion='entropy',
            max_depth=15,
            min_samples_split=2,
            min_samples_leaf=3,
            max_features='sqrt',
            class_weight='balanced',
            random_state=156,
            n_jobs=-1
        )
    print('[OK] Build classifier')

    classifier.fit(X_training, Y_training)

    return classifier

classifier = RandomForest(params_grid, autoMode=AUTOMODE)
print('[OK] RandomForest')

Start building classifier ...
[OK] Build classifier
[OK] RandomForest


### Make Predictions and Print Out Scores

In [11]:
def MakePredictions(testing_dataset, classifier, validation_dataset=None):
    predictions = classifier.predict(testing_dataset)

    if validation_dataset is not None:
        acc = round((accuracy_score(validation_dataset, predictions) * 100), 3)
        r2 = r2_score(validation_dataset, predictions)
        # output scores
        print(f'Accuracy score: {acc} %')
        print(f'R2 score: {round(r2, 5)}')

    return predictions

print('--- Training Dataset ---')
Y_predictions_training = MakePredictions(X_training, classifier, Y_training)

Accuracy score: 59.305 %
R2 score: 0.97076
[OK] MakePredictions w/ training dataset


In [12]:
print('--- Validation Dataset ---')
Y_predictions_validation = MakePredictions(X_validation, classifier, Y_validation)
print()
print('--- Testing Dataset ---')
Y_predictions_testing = MakePredictions(X_testing, classifier, Y_testing)

Accuracy score: 2.613 %
R2 score: 0.79819
[OK] MakePredictions w/ validation dataset
