### Import Packages

In [81]:
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

### Google Drive (Optional)

In [82]:
# from google.colab import drive
# drive.mount('/content/drive')

### Global Variables

In [83]:
DEBUGMODE = True

file_path = 'data_processing/data-1-year.csv'
df = pd.read_csv(file_path)
X_training, X_validation, Y_training, Y_validation = [], [], [], []

imputer = SimpleImputer(strategy='mean')   # deal with missing area

### Get Training and Validation Dataset

1. Preprocess data frame, convert all elements to int/float type, deal with NaN value in area column.
2. Split data frame into feature set and label set, and then seperate both into training dataset and validation dataset.

In [84]:
def DataPreprocessing(df):
    # 1.
    df = df.apply(pd.to_numeric, errors='coerce')
    df['area'] = imputer.fit_transform(df[['area']])
    # if DEBUGMODE:
    #     df.info()

    # 2.
    features = df.drop('price', axis='columns')
    label = df['price']

    return train_test_split(features, label, test_size=0.7, random_state=100)

# def GetnSizeDataset(df, size=0.4):



### Auto-Tuning

In [85]:
def AutoTuning(X_training, Y_training, num_iter):
    param = {
        'n_estimators': randint(100, 200),
        'max_depth': randint(10, 20),
        'min_samples_split': randint(2, 5),
        'min_samples_leaf': randint(1, 4),
        'random_state': randint(100, 200),
    }

    # automatically search for best parameters and the corresponding accuracy rate
    random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param, n_iter=num_iter, cv=5, scoring='accuracy')
    print('[OK] fitting param...')
    random_search.fit(X_training, Y_training)
    best_param = random_search.best_params_
    best_score = random_search.best_score_

    return best_param, round(best_score * 100, 3)

### Random Forest

In [86]:

def RandomForest(param, autoMode=False):

    def RandomForestClassifier_aux(param):
        classifier_auto = RandomForestClassifier(
            n_estimators=param['n_estimators'],
            criterion='entropy',
            max_depth=param['max_depth'],
            min_samples_split=param['min_samples_split'],
            min_samples_leaf=param['min_samples_leaf'],
            max_features='sqrt',
            class_weight='balanced',
            random_state=param['random_state'],
            n_jobs=-1
        )
        return classifier_auto
    
    print('[ ] start build classifier')
    if autoMode is True:
        classifier = RandomForestClassifier_aux(param)
    else:   # manual
        classifier = RandomForestClassifier(
            n_estimators=300,
            criterion='entropy',
            max_depth=50,
            min_samples_split=10,
            min_samples_leaf=2,
            max_features='sqrt',
            class_weight='balanced',
            random_state=200,
            n_jobs=-1
        )
    print('[OK] Build classifier, going to fit')

    classifier.fit(X_training, Y_training)

    return classifier

### Make Predictions

In [87]:
def MakePredictions(testing_dataset, classifier):
    return classifier.predict(testing_dataset)

In [88]:
X_training, X_validation, Y_training, Y_validation = DataPreprocessing(df)
print('[OK] DataPreprocessing')

final_score = -1
for _ in range(5):
    auto_param, auto_score = AutoTuning(X_training, Y_training, num_iter=500)
    if auto_score > final_score:
        final_param, final_score = auto_param, auto_score
    print(f'{_}/4 AutoTuning | Score: {final_score} % | Param: {final_param}')
print('[OK] AutoTuning')

classifier = RandomForest(final_param, autoMode=True)
print('[OK] Build Forest')

Y_predictions_training = MakePredictions(X_training, classifier)
acc_training = round((accuracy_score(Y_training, Y_predictions_training) * 100), 3)

# Y_predictions_validation = MakePredictions(X_validation, classifier)
# acc_validation = round((accuracy_score(Y_validation, Y_predictions_validation) * 100), 3)

if DEBUGMODE:
    print(f'Training Accuracy: {acc_training} %')
    # print(f'Validation Accuracy: {acc_validation} %')

[OK] DataPreprocessing
[OK] fitting param...




0/4 AutoTuning | Score: 3.792 % | Param: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 124, 'random_state': 163}
[OK] fitting param...




1/4 AutoTuning | Score: 3.972 % | Param: {'max_depth': 16, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 102, 'random_state': 195}
[OK] fitting param...




2/4 AutoTuning | Score: 3.972 % | Param: {'max_depth': 16, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 102, 'random_state': 195}
[OK] fitting param...




3/4 AutoTuning | Score: 3.972 % | Param: {'max_depth': 16, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 102, 'random_state': 195}
[OK] fitting param...




4/4 AutoTuning | Score: 3.972 % | Param: {'max_depth': 16, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 102, 'random_state': 195}
[OK] AutoTuning
[ ] start build classifier
[OK] Bulid classifier, going to fit
[OK] Build Forest
Training Accuracy: 73.375 %


[TODO] log param and score, overfitting problem