In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn import tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn import ensemble
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut 
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.pipeline import make_pipeline


from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
import graphviz



In [17]:
housingData = pd.read_table('property_prices.csv', ",")

#Removing noisy elements of dataset
housingData = housingData[(housingData[['price_bands']] != "0K-200K").all(axis=1)]
housingData = housingData[(housingData[['price_bands']] != "Unknown").all(axis=1)]


In [6]:
#Dropping columns that are independentant, or not useful
X = housingData

X = X.drop(columns="id")
X = X.drop(columns="lattitude")
X = X.drop(columns="longtitude")
X = X.drop(columns="address")
X = X.drop(columns="date")

In [7]:
#PreProcess Data
#Let's prepare the data with just replacing the missing data with the median


im_median = sklearn.preprocessing.Imputer(strategy='median', axis=0)
X[['rooms', 'bathrooms', 'car_parks', 'bedrooms', 'landsize', 'building_area', 'year_built']] = im_median.fit_transform(
    X[['rooms', 'bathrooms', 'car_parks', 'bedrooms', 'landsize', 'building_area', 'year_built']])



#Remove all rooms that are 0

X = X[(X[['rooms', 'bathrooms', 'bedrooms']] != 0).all(axis=1)]
Y = X['price_bands']
X = X.drop(columns='price_bands')

for col in X.columns:
    le = preprocessing.LabelEncoder()
    X[col] = le.fit_transform(X[col])


In [8]:
select = SelectKBest(k=15)
select.fit(X, Y)
newFeat = []
toRemove = []
mask = select.get_support(indices=True)
featNames = list(X.columns.values)
newX = X
for bool, feat in zip(mask, featNames):
    if bool:
        newX[feat] = X[feat]
        newFeat.append(feat)
            
newX = X[newFeat]

In [9]:
#Base Decision Tree
model = sklearn.tree.DecisionTreeClassifier(criterion="entropy")
prediction = cross_val_predict(model, newX, Y, cv=5)

print(metrics.classification_report(Y, prediction))

             precision    recall  f1-score   support

      1M-2M       0.57      0.56      0.57      8982
  200K-400K       0.45      0.46      0.45      1082
     2M-20M       0.40      0.41      0.41      1955
  400K-600K       0.47      0.45      0.46      4452
  600K-800K       0.41      0.41      0.41      6053
    800K-1M       0.25      0.27      0.26      4664

avg / total       0.45      0.45      0.45     27188



In [10]:
#Pre Prunded Decision Tree
model = sklearn.tree.DecisionTreeClassifier(criterion="entropy", max_depth=20)
prediction = cross_val_predict(model, newX, Y, cv=5)

print(metrics.classification_report(Y, prediction))

             precision    recall  f1-score   support

      1M-2M       0.58      0.57      0.57      8982
  200K-400K       0.45      0.44      0.45      1082
     2M-20M       0.40      0.40      0.40      1955
  400K-600K       0.47      0.46      0.46      4452
  600K-800K       0.40      0.40      0.40      6053
    800K-1M       0.26      0.27      0.26      4664

avg / total       0.45      0.45      0.45     27188



In [11]:
#Ensemble Bagging Decision Tree (Also pre pruned)
model = sklearn.ensemble.BaggingClassifier(sklearn.tree.DecisionTreeClassifier(criterion="entropy", max_depth=20), max_samples=1.0, max_features=0.9, n_estimators = 50)

prediction = cross_val_predict(model, newX, Y, cv=5)

print(metrics.classification_report(Y, prediction))

             precision    recall  f1-score   support

      1M-2M       0.62      0.68      0.65      8982
  200K-400K       0.61      0.43      0.50      1082
     2M-20M       0.52      0.46      0.49      1955
  400K-600K       0.53      0.53      0.53      4452
  600K-800K       0.46      0.48      0.47      6053
    800K-1M       0.30      0.26      0.28      4664

avg / total       0.51      0.51      0.51     27188



In [12]:
#Ensemble AdaBoost Decision Tree (Also pre pruned)
model = sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier(criterion="entropy", max_depth=20),
                                            n_estimators = 50)
prediction = cross_val_predict(model, newX, Y, cv=5)

print(metrics.classification_report(Y, prediction))

             precision    recall  f1-score   support

      1M-2M       0.60      0.65      0.63      8982
  200K-400K       0.55      0.33      0.41      1082
     2M-20M       0.35      0.43      0.38      1955
  400K-600K       0.51      0.49      0.50      4452
  600K-800K       0.46      0.48      0.47      6053
    800K-1M       0.29      0.24      0.26      4664

avg / total       0.48      0.49      0.48     27188



In [13]:
#Random Forest Ensemble Classifier
model = sklearn.ensemble.RandomForestClassifier(criterion="entropy", n_estimators = 50, min_samples_split=25, 
                                                                                  random_state = 0)

prediction = cross_val_predict(model, newX, Y, cv=5)

print(metrics.classification_report(Y, prediction))

             precision    recall  f1-score   support

      1M-2M       0.62      0.76      0.69      8982
  200K-400K       0.65      0.42      0.51      1082
     2M-20M       0.63      0.45      0.52      1955
  400K-600K       0.56      0.55      0.55      4452
  600K-800K       0.49      0.53      0.51      6053
    800K-1M       0.33      0.22      0.27      4664

avg / total       0.53      0.55      0.53     27188



In [14]:
#Random Forest Ensemble Classifier with Pre-Pruning
model = sklearn.ensemble.RandomForestClassifier(criterion="entropy", n_estimators = 50, max_depth=20, min_samples_split=25, 
                                                                                  random_state = 0)
prediction = cross_val_predict(model, newX, Y, cv=5)

print(metrics.classification_report(Y, prediction))

             precision    recall  f1-score   support

      1M-2M       0.62      0.77      0.69      8982
  200K-400K       0.66      0.43      0.52      1082
     2M-20M       0.62      0.44      0.52      1955
  400K-600K       0.56      0.55      0.55      4452
  600K-800K       0.49      0.53      0.51      6053
    800K-1M       0.33      0.22      0.26      4664

avg / total       0.53      0.55      0.53     27188



In [15]:
#Ensemble Bagging Random Forest (with Pre-Pruning)
model = sklearn.ensemble.BaggingClassifier(sklearn.ensemble.RandomForestClassifier(criterion="entropy", n_estimators = 20,
                                                                                 max_depth=50, min_samples_split=25, 
                                                                                  random_state = 0), n_estimators = 50)
prediction = cross_val_predict(model, newX, Y, cv=5)

print(metrics.classification_report(Y, prediction))

             precision    recall  f1-score   support

      1M-2M       0.63      0.79      0.70      8982
  200K-400K       0.67      0.41      0.51      1082
     2M-20M       0.67      0.43      0.52      1955
  400K-600K       0.56      0.54      0.55      4452
  600K-800K       0.50      0.57      0.53      6053
    800K-1M       0.35      0.20      0.26      4664

avg / total       0.54      0.56      0.54     27188



In [16]:
#Ensemble AdaBoost Random Forest
model = sklearn.ensemble.AdaBoostClassifier(sklearn.ensemble.RandomForestClassifier(criterion="entropy", n_estimators = 10,
                                                                                 max_depth=50, min_samples_split=25, 
                                                                                  random_state = 0), n_estimators = 50)
prediction = cross_val_predict(model, newX, Y, cv=5)

print(metrics.classification_report(Y, prediction))

             precision    recall  f1-score   support

      1M-2M       0.59      0.70      0.64      8982
  200K-400K       0.63      0.25      0.36      1082
     2M-20M       0.47      0.10      0.16      1955
  400K-600K       0.51      0.48      0.49      4452
  600K-800K       0.43      0.55      0.48      6053
    800K-1M       0.28      0.23      0.25      4664

avg / total       0.48      0.49      0.47     27188

