# Assignment Decision Trees

Analyzing data for airbnb with a target of price_gte_150 to fit an SVM(with Poly kernel) and Decision Tree Model using RandomSearch Approach

## 1.0 Setup
Import modules


In [29]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC

np.random.seed(100)

## 2.0 Load data
Load data (it's already cleaned and preprocessed)


In [30]:
X_train = pd.read_csv('airbnb_train_X_price_gte_150.csv') 
y_train = pd.read_csv('airbnb_train_y_price_gte_150.csv') 
X_test = pd.read_csv('airbnb_test_X_price_gte_150.csv') 
y_test = pd.read_csv('airbnb_test_y_price_gte_150.csv') 

## 3.0 Model the data

In [31]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### 3.1SVM classification model using polynomial kernal (Random Search)

In [32]:
score_measure = "precision"
kfolds = 5

param_rand = {
    'C': np.arange(5,15),
    'degree': [3,4,5],
    'gamma': ['scale', 'auto'],
    'coef0': np.arange(1,3)
}

svm_poly_model = SVC(kernel="poly")
rand_search = RandomizedSearchCV(estimator = svm_poly_model, param_distributions=param_rand, cv=kfolds, n_iter=20,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train,np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_poly = rand_search.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
The best precision score is 0.8530059479356351
... with parameters: {'gamma': 'auto', 'degree': 5, 'coef0': 1, 'C': 7}


In [33]:
model_preds = rand_search.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Poly SVM Random", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### 3.1SVM classification model using polynomial kernal (Grid Search)

### 3.2 Decision Tree classification model (Random Search)

In [34]:
score_measure = "precision"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(7,20),  
    'min_samples_leaf': np.arange(7,20),
    'min_impurity_decrease': np.arange(0.0001, 0.01 , 0.005),
    'max_leaf_nodes': np.arange(5, 11), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
    
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=0, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_DTree = rand_search.best_estimator_

The best precision score is 0.8445275440461636
... with parameters: {'min_samples_split': 13, 'min_samples_leaf': 8, 'min_impurity_decrease': 0.0001, 'max_leaf_nodes': 10, 'max_depth': 33, 'criterion': 'gini'}


In [35]:
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Decision Tree Random ", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.3f} Precision={TP/(TP+FP):.3f} Recall={TP/(TP+FN):.3f} F1={2*TP/(2*TP+FP+FN):.3f}")

Accuracy=0.846 Precision=0.838 Recall=0.857 F1=0.847


## Summary

In [36]:
performance.sort_values(by=['Precision'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision Tree Random,0.846298,0.837937,0.856874,0.8473
0,Poly SVM Random,0.859419,0.843243,0.881356,0.861878


# 4.0 Analysis

As per the above summary after running the models for SVM Poly and Decision Tree through Random Search the best model turns out to be Poly SVM with Precision score 0.843243 with the best parameter as {'gamma': 'auto', 'degree': 5, 'coef0': 1, 'C': 7} on the other hand Decision Tree has the Precision Score 0.837937 with the best parameters as {'min_samples_split': 13, 'min_samples_leaf': 8, 'min_impurity_decrease': 0.0001, 'max_leaf_nodes': 10, 'max_depth': 33, 'criterion': 'gini'} both using Random Search.