# Create Decision Tree Model

In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# make all output interactive
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import helper

In [2]:
# Directory layout constants
# ================================
# Every path hangs off the data directory, which is resolved relative to the
# kernel's current working directory.
DATA_PATH = os.path.join(os.getcwd(), "../data/")
RAW_DATA_PATH, DERIVED_DATA_PATH = (
    os.path.join(DATA_PATH, subdir) for subdir in ("raw", "derived")
)

In [3]:
# Read datasets
# ================================
# training_data
# Loaded through the project-local `helper` module; how it locates and parses
# the data is not visible in this notebook.
# NOTE(review): presumably a pandas DataFrame that includes a 'suburb' label
# column -- later cells access `.columns`, `.loc` and ['suburb'] on it. Confirm
# against helper.getTrainingData.
training_data = helper.getTrainingData()

## Decision Tree Model

In [7]:
# X -> features, y -> label
# 'suburb' is the class being predicted; every other column is used as a feature.
X = training_data.drop(columns='suburb')
y = training_data['suburb']

# Dividing X, y into train and test data (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Training a DecisionTreeClassifier
# random_state is pinned: scikit-learn permutes the candidate features at every
# split, so an unseeded tree (and therefore the saved model and its accuracy)
# can differ from run to run even on identical data.
dtree_model = DecisionTreeClassifier(random_state=0)

dtree_model.fit(X_train, y_train)
# Save the model
joblib.dump(dtree_model, '../src/dtree_model.pkl')

DecisionTreeClassifier()

['../src/dtree_model.pkl']

## Predict on test set (REMOVE LATER)

In [5]:
# Sanity check: score the fitted tree on the held-out test split.
# accuracy_score comes from the notebook's top import cell rather than being
# imported mid-notebook, so all dependencies are visible in one place.
dtree_predictions = dtree_model.predict(X_test)
print(accuracy_score(y_test, dtree_predictions))

0.56796


## Predict a suburb for a new user's preferences

In [8]:
# Load the decision tree model
dtree_model = joblib.load('../src/dtree_model.pkl')

# Example new user preference data
input_data = training_data.sample(1)
# Ensure order of columns in user preference data is same as training data
input_data = input_data[dtree_model.feature_names_in_]

# Predicted suburb with highest probability
predicted_suburb = dtree_model.predict(input_data)
print("Recommended Suburb:", predicted_suburb[0])

# List of predicted suburbs ranked by probability
predicted_classes = dtree_model.classes_
predicted_probabilities = dtree_model.predict_proba(input_data)

recommended_suburb_list = pd.DataFrame(list(zip(predicted_classes, predicted_probabilities[0])), columns=['suburb','probability']).sort_values(by='probability', ascending=False)
print(recommended_suburb_list)

Recommended Suburb: Essendon Fields
              suburb  probability
75   Essendon Fields     0.836053
104  Heathcote South     0.045843
31       Calder Park     0.039239
186        Ravenhall     0.012432
136    Lake Eppalock     0.010878
..               ...          ...
107  Heidelberg West     0.000000
108        High Camp     0.000000
109          Highett     0.000000
110         Hilldene     0.000000
0         Abbotsford     0.000000

[258 rows x 2 columns]
