In [4]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

from keras.utils import to_categorical

import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
priceData = pd.read_csv('melb_data.csv')


#Discretize the data based on house price
priceData['Label'] = pd.qcut(priceData['Price'],5,labels = False)

#drop the price column
priceData.drop(['Price'], axis = 1)

#encode the categorical variables

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
priceData['encoded_suburb'] = encoder.fit_transform(priceData['Suburb'])
priceData['encoded_type'] = encoder.fit_transform(priceData['Type'])
priceData['encoded_sellerG'] = encoder.fit_transform(priceData['SellerG'])
priceData['encoded_councilarea'] = encoder.fit_transform(priceData['CouncilArea'])
priceData['encoded_regionname'] = encoder.fit_transform(priceData['Regionname'])



#impute missing values for Car, Landsize, BuildingArea, YearBuilt, CouncilArea
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

imputer0 = KNNImputer()
imputer1 = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer2 = SimpleImputer(missing_values=np.nan, strategy='median')
imputer3 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

priceData[['Car']] = imputer2.fit_transform(priceData[['Car']])
priceData[['Landsize']] = imputer1.fit_transform(priceData[['Landsize']])
priceData[['BuildingArea']] = imputer0.fit_transform(priceData[['BuildingArea']])
priceData[['YearBuilt']] = imputer0.fit_transform(priceData[['YearBuilt']])
priceData[['CouncilArea']] = imputer3.fit_transform(priceData[['CouncilArea']])

#prepare data for the model based on selected features

x = priceData[['Rooms', 'Bedroom2', 'Bathroom','Car', 'YearBuilt', 'Lattitude', 'Longtitude', 'encoded_type']].copy()
y = priceData[['Label']].copy()

#scale the data
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

#labelencode the y_values
lab_enc = LabelEncoder()
encoded = lab_enc.fit_transform(y)

#split into train and test 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 1)
y = to_categorical(y)


Using TensorFlow backend.
  return f(*args, **kwargs)


In [5]:
#categorically encode the labels for multi-class classification problem 
train_y1 = to_categorical(y_train)
test_y1 = to_categorical(y_test)


from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_model(neurons=1, dropout_rate=0.0, learn_rate=0.01):
    dnn = Sequential()
    dnn.add(Dense(neurons ,input_shape=(x.shape[1],), activation = 'relu')) 
    dnn.add(Dropout(dropout_rate))
    dnn.add(Dense(5, activation = 'softmax'))
    dnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 
    return dnn

dnn = KerasClassifier(build_fn=create_model, verbose=0)




In [6]:
batch_size = [10, 20, 40]
epochs = [10, 50]
neurons = [16, 32, 40]
dropout_rate = [0.0, 0.1, 0.2]
learn_rate = [0.001, 0.01, 0.1, 0.2]

param_grid = dict(batch_size=batch_size, epochs=epochs, neurons = neurons,dropout_rate = dropout_rate, learn_rate=learn_rate)
search = RandomizedSearchCV(estimator=dnn, param_distributions=param_grid, n_jobs=-1, cv=3, scoring = "accuracy")
search_result = search.fit(x_train, train_y1)
print("Best: %f using %s" % (search_result.best_score_, search_result.best_params_))

means = search_result.cv_results_['mean_test_score']
stds = search_result.cv_results_['std_test_score']
params = search_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: nan using {'neurons': 40, 'learn_rate': 0.2, 'epochs': 50, 'dropout_rate': 0.0, 'batch_size': 20}
nan (nan) with: {'neurons': 40, 'learn_rate': 0.2, 'epochs': 50, 'dropout_rate': 0.0, 'batch_size': 20}
nan (nan) with: {'neurons': 40, 'learn_rate': 0.01, 'epochs': 10, 'dropout_rate': 0.1, 'batch_size': 10}
nan (nan) with: {'neurons': 32, 'learn_rate': 0.001, 'epochs': 10, 'dropout_rate': 0.1, 'batch_size': 40}
nan (nan) with: {'neurons': 32, 'learn_rate': 0.001, 'epochs': 50, 'dropout_rate': 0.0, 'batch_size': 10}
nan (nan) with: {'neurons': 40, 'learn_rate': 0.001, 'epochs': 10, 'dropout_rate': 0.0, 'batch_size': 20}
nan (nan) with: {'neurons': 40, 'learn_rate': 0.001, 'epochs': 10, 'dropout_rate': 0.1, 'batch_size': 40}
nan (nan) with: {'neurons': 40, 'learn_rate': 0.01, 'epochs': 50, 'dropout_rate': 0.1, 'batch_size': 20}
nan (nan) with: {'neurons': 40, 'learn_rate': 0.001, 'epochs': 50, 'dropout_rate': 0.1, 'batch_size': 20}
nan (nan) with: {'neurons': 40, 'learn_rate': 0.001,

In [7]:
search_result

RandomizedSearchCV(cv=3,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fd2315d0780>,
                   n_jobs=-1,
                   param_distributions={'batch_size': [10, 20, 40],
                                        'dropout_rate': [0.0, 0.1, 0.2],
                                        'epochs': [10, 50],
                                        'learn_rate': [0.001, 0.01, 0.1, 0.2],
                                        'neurons': [16, 32, 40]},
                   scoring='accuracy')

In [None]:

import joblib

#save your model or results
joblib.dump(search_result, 'dnn_regressor.pkl')



### Support Vector Machine


In [19]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train, y_train.values.ravel())
  
# print prediction results
svm_pred = svm.predict(x_test)
print(classification_report(y_test, svm_pred))

# defining parameter range
param_grid_svm = {'C': [0.1, 1, 10, 100], 
              'gamma': [1, 0.1, 0.01],
              'kernel': ['rbf', 'sigmoid', 'poly']} 
  
SVMgrid = GridSearchCV(SVC(), param_grid = param_grid_svm, refit = True, verbose = 3)
  
# fitting the model for grid search

SVMgridresult = SVMgrid.fit(x_train, y_train.values.ravel())

              precision    recall  f1-score   support

           0       0.71      0.71      0.71       550
           1       0.41      0.47      0.44       529
           2       0.42      0.34      0.37       554
           3       0.39      0.41      0.40       532
           4       0.63      0.64      0.64       551

    accuracy                           0.51      2716
   macro avg       0.51      0.51      0.51      2716
weighted avg       0.51      0.51      0.51      2716

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   7.4s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   7.8s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   7.5s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   7.3s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   7.4s
[CV 1/5] END .................C=0.1, gamma=1

[CV 1/5] END ..................C=10, gamma=1, kernel=sigmoid; total time=   3.7s
[CV 2/5] END ..................C=10, gamma=1, kernel=sigmoid; total time=   3.6s
[CV 3/5] END ..................C=10, gamma=1, kernel=sigmoid; total time=   3.7s
[CV 4/5] END ..................C=10, gamma=1, kernel=sigmoid; total time=   3.7s
[CV 5/5] END ..................C=10, gamma=1, kernel=sigmoid; total time=   3.5s
[CV 1/5] END .....................C=10, gamma=1, kernel=poly; total time=   4.2s
[CV 2/5] END .....................C=10, gamma=1, kernel=poly; total time=   4.2s
[CV 3/5] END .....................C=10, gamma=1, kernel=poly; total time=   4.2s
[CV 4/5] END .....................C=10, gamma=1, kernel=poly; total time=   4.2s
[CV 5/5] END .....................C=10, gamma=1, kernel=poly; total time=   4.2s
[CV 1/5] END ....................C=10, gamma=0.1, kernel=rbf; total time=   6.5s
[CV 2/5] END ....................C=10, gamma=0.1, kernel=rbf; total time=   6.4s
[CV 3/5] END ...............

In [21]:
print("Best: %f using %s" % (SVMgridresult.best_score_, SVMgridresult.best_params_))

means = SVMgridresult.cv_results_['mean_test_score']
stds = SVMgridresult.cv_results_['std_test_score']
params = SVMgridresult.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.560934 using {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
0.431700 (0.011629) with: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
0.350975 (0.012944) with: {'C': 0.1, 'gamma': 1, 'kernel': 'sigmoid'}
0.463918 (0.011534) with: {'C': 0.1, 'gamma': 1, 'kernel': 'poly'}
0.389267 (0.009825) with: {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
0.340298 (0.005637) with: {'C': 0.1, 'gamma': 0.1, 'kernel': 'sigmoid'}
0.319864 (0.005776) with: {'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
0.333671 (0.003773) with: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
0.319956 (0.005937) with: {'C': 0.1, 'gamma': 0.01, 'kernel': 'sigmoid'}
0.203608 (0.000213) with: {'C': 0.1, 'gamma': 0.01, 'kernel': 'poly'}
0.498710 (0.017046) with: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.322628 (0.014911) with: {'C': 1, 'gamma': 1, 'kernel': 'sigmoid'}
0.512058 (0.012426) with: {'C': 1, 'gamma': 1, 'kernel': 'poly'}
0.432713 (0.009207) with: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.419366 (0.008170) with: {'C': 1, 'gamma': 0.1, 'ke

In [8]:
SVMgridresult.best_score_

0.5613018130598276

In [22]:
#SVMgridresult.save('SVMclassifier.h5')  # creates a HDF5 file 'my_model.h5'

joblib.dump(SVMgridresult, 'SVMclassifier.pkl')

['SVMclassifier.pkl']

### Decision Tree Classifier

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree


# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()


param_dist = {"max_depth": [3, None],
              "max_features": [np.random.randint(1, 9)],
              "min_samples_leaf": [np.random.randint(1, 9)],
              "criterion": ["gini", "entropy"]}



# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)

# Train it on the data
tree_cv = tree_cv.fit(x_train,y_train)

#Predict the response for test dataset
y_pred = tree_cv.predict(x_test)


# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))



Tuned Decision Tree Parameters: {'min_samples_leaf': 5, 'max_features': 5, 'max_depth': None, 'criterion': 'gini'}
Best score is 0.5504412516240246


In [24]:
joblib.dump(tree_cv, 'DecisionTreeclassifier.pkl')

['DecisionTreeclassifier.pkl']

In [None]:

# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = [('DNN', search_result),('SVM', SVMgridresult), ('Decision Tree', tree_cv)]

# evaluate each model in turn
results = []
names = []

kfold5 = KFold(n_splits=5, random_state=1, shuffle=True)                        
scoring = [('Accuracy', 'accuracy'), 
           ('Recall', 'recall'), 
           ('Precision', 'precision'),
            ('F1', 'f1')]

In [25]:
#load your model for further usage
joblib.load('SVMclassifier.pkl')

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01],
                         'kernel': ['rbf', 'sigmoid', 'poly']},
             verbose=3)

In [None]:
for name, model in models:
    cv_results = cross_val_score(model, x, y, scoring='f1',cv = kfold5, n_jobs=-1)
    results.append(cv_results)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()    

### Classification report

In [None]:
import joblib
SVMgridresult = joblib.load('SVMclassifier.pkl')
search_result = joblib.load('dnn_regressor.pkl')
tree_cv = joblib.load('DecisionTreeclassifier.pkl')

In [17]:
dnn_pred = search_result.predict(x_test)
print(classification_report(y_test, dnn_pred))


              precision    recall  f1-score   support

           0       0.74      0.73      0.73       550
           1       0.46      0.55      0.50       529
           2       0.44      0.39      0.42       554
           3       0.46      0.42      0.44       532
           4       0.71      0.73      0.72       551

    accuracy                           0.56      2716
   macro avg       0.56      0.56      0.56      2716
weighted avg       0.56      0.56      0.56      2716





In [16]:
tree_pred = tree_cv.predict(x_test)
print(classification_report(y_test, tree_pred))


              precision    recall  f1-score   support

           0       0.72      0.75      0.73       550
           1       0.43      0.46      0.44       529
           2       0.42      0.40      0.41       554
           3       0.48      0.47      0.48       532
           4       0.75      0.72      0.73       551

    accuracy                           0.56      2716
   macro avg       0.56      0.56      0.56      2716
weighted avg       0.56      0.56      0.56      2716

