# Building and Comparing Decision Trees

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [3]:
#Rebuild Original Table for Modeling

In [4]:
# Aisle lookup table; joined onto the line items via `aisle_id` in the merge cell below.
aisles = pd.read_csv('aisles.csv')

In [5]:
# Department lookup table; joined onto the line items via `department_id` in the merge cell below.
departments = pd.read_csv('departments.csv')

In [6]:
# Order line items; this is the base (left) table of the merge chain below.
order_products__prior = pd.read_csv('order_products__prior.csv')

In [7]:
# NOTE(review): loaded here but not referenced by any other cell in the visible
# part of this notebook -- confirm it is needed before paying the load cost.
order_products__train = pd.read_csv('order_products__train.csv')

In [8]:
# Order-level table; joined onto the line items via `order_id` in the merge cell below.
orders = pd.read_csv('orders.csv')

In [9]:
# Product lookup table; joined onto the line items via `product_id` in the merge cell below.
products = pd.read_csv('products.csv')

In [10]:
# Enrich every prior-order line item with its product, aisle, department and
# order-level attributes. Left joins are used at every step, so a line item is
# kept even when a lookup key has no match in the right-hand table.
order_products__prior_df = (
    order_products__prior
    .merge(products, on='product_id', how='left')
    .merge(aisles, on='aisle_id', how='left')
    .merge(departments, on='department_id', how='left')
    .merge(orders, on='order_id', how='left')
)
order_products__prior_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,Organic Egg Whites,86,16,eggs,dairy eggs,202279,prior,3,5,9,8.0
1,2,28985,2,1,Michigan Organic Kale,83,4,fresh vegetables,produce,202279,prior,3,5,9,8.0
2,2,9327,3,0,Garlic Powder,104,13,spices seasonings,pantry,202279,prior,3,5,9,8.0
3,2,45918,4,1,Coconut Butter,19,13,oils vinegars,pantry,202279,prior,3,5,9,8.0
4,2,30035,5,0,Natural Sweetener,17,13,baking ingredients,pantry,202279,prior,3,5,9,8.0


In [12]:
# Keep only the numerical columns of the merged table by removing the
# string/object-typed ones (product, aisle and department names, eval_set).
order_products__prior_df_mod = order_products__prior_df.drop(
    columns=['product_name', 'aisle', 'department', 'eval_set']
)
order_products__prior_df_mod.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,aisle_id,department_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,86,16,202279,3,5,9,8.0
1,2,28985,2,1,83,4,202279,3,5,9,8.0
2,2,9327,3,0,104,13,202279,3,5,9,8.0
3,2,45918,4,1,19,13,202279,3,5,9,8.0
4,2,30035,5,0,17,13,202279,3,5,9,8.0


In [13]:
# Discard every row that contains at least one missing value.
order_products__prior_df_mod2 = order_products__prior_df_mod.dropna(axis=0, how='any')

In [18]:
# Sanity check: per-column count of missing values (all zeros expected after dropna).
order_products__prior_df_mod2.isna().sum()

order_id                  0
product_id                0
add_to_cart_order         0
reordered                 0
aisle_id                  0
department_id             0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

In [19]:
# First modeling table for the decision tree: the identifier columns are
# removed so that only the behavioural features and the `reordered` target remain.
final = order_products__prior_df_mod2.drop(
    columns=['order_id', 'product_id', 'aisle_id', 'department_id', 'user_id']
)

final.head()

Unnamed: 0,add_to_cart_order,reordered,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,1,3,5,9,8.0
1,2,1,3,5,9,8.0
2,3,0,3,5,9,8.0
3,4,1,3,5,9,8.0
4,5,0,3,5,9,8.0


# Data Set-Up to make Model 1

In [21]:
# `reordered` is the target value; every other column of `final` is a feature.
y = final['reordered']
X = final.drop(columns=['reordered'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [22]:
# Tune the tree depth with 5-fold cross-validation on the training split.
# This took around 30 minutes to run...
param_grid = [{'max_depth': [3, 5, 7, 9, 11]}]

# Fixed seed so that tie-breaking between equally good splits is reproducible.
tree_classififer = DecisionTreeClassifier(random_state=42)

# Score with a classification metric. The previous 'neg_mean_squared_error' is a
# regression metric; on 0/1 labels it equals (accuracy - 1), so it ranks the
# candidates identically but reports scores that are hard to interpret.
grid_search = GridSearchCV(tree_classififer, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

# Getting best estimator directly. Inputting results in refit Decision Tree model.
# NOTE(review): the recorded best max_depth (11) is the largest value in the
# grid, so deeper trees may score better still -- consider extending the grid.
grid_search.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Training Decision Tree Model 1

In [25]:
### MAKE SURE TO CHECK THAT MAX_DEPTH MATCHES GRID SEARCH BEST ESTIMATOR

# Model 1: a depth-limited tree fitted on the training split. `fit` returns the
# estimator itself, so construction and training are chained.
tree_classifier = DecisionTreeClassifier(max_depth=11).fit(X_train, y_train)

# Display the fitted estimator and its hyper-parameters.
tree_classifier

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Predicting with Model 1

In [36]:
#Predicting on test data
# Class labels predicted by Model 1 for the held-out split; these are what the
# accuracy, confusion-matrix and classification-report cells below evaluate.
predictions1 = tree_classifier.predict(X_test)
predictions1

array([1, 1, 0, ..., 1, 1, 1])

# Assessing Model 1 Performance

In order to determine the accuracy of a model, we need to look at how many **correct predictions were made out of all predictions made**.

In [48]:
#Accuracy report
# NOTE(review): this import sits mid-notebook; the Model 2 cells further down
# rely on it having been executed.
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label equals the observed label.
pred1_accuracy = accuracy_score(y_test, predictions1)
pred1_accuracy

0.6982017383571885

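**69.82%** of the time, the model predicted the correct class for a given sample. For reference, always predicting a reorder would already be right about 63% of the time (6,309,263 of the 10,017,619 test rows are reorders), so this is only a moderate improvement. It leaves about 30% of the predictions **(3,023,300)** misclassified. As the confusion matrix below shows, these are a mix of false positives (2,217,460 predictions of a reorder when there really was not one) and false negatives (805,840 reorders that the model missed). In the context of our problem, the false positives in particular would lead to a huge amount of misdirected business funds/investments. 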

Let us find out more about how accurate our model is by assessing it with several other performance metrics.

In [53]:
#Confusion matrix/contingency table
from sklearn.metrics import confusion_matrix

# Rows are observed classes and columns are predicted classes, both ordered
# 0 then 1: the row sums of the printed matrix equal the per-class support
# shown in the classification report.
matrix = confusion_matrix(y_test, predictions1)
print(matrix)

[[1490896 2217460]
 [ 805840 5503423]]


In [55]:
# True negatives (observed 0, predicted 0) as a share of all test predictions.
# Read from `matrix` rather than hard-coded so the value tracks the model.
matrix[0, 0] / matrix.sum()

0.1488273810373503

In [56]:
# True positives (observed 1, predicted 1) as a share of all test predictions.
# Read from `matrix` rather than hard-coded so the value tracks the model.
matrix[1, 1] / matrix.sum()

0.5493743573198382

In [57]:
# False positives / Type 1 errors (observed 0, predicted 1) as a share of all
# test predictions. Read from `matrix` rather than hard-coded.
matrix[0, 1] / matrix.sum()

0.2213559928761515

In [58]:
# False negatives / Type 2 errors (observed 1, predicted 0) as a share of all
# test predictions. Read from `matrix` rather than hard-coded.
matrix[1, 0] / matrix.sum()

0.08044226876666002

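This is a binary classification problem. In the matrix printed by scikit-learn's `confusion_matrix(y_test, predictions1)`, each row is an observed (true) class and each column is a predicted class, with the classes in sorted order: 0 (F, not reordered) first, then 1 (T, reordered). Each cell is the number of predictions made by the classifier that fall into that category. (The row sums, 3,708,356 and 6,309,263, match the per-class support in the classification report further down.)

So these would be the matrix column labels (across the top):

pred F, pred T

And the matrix row labels (down the side):

obs F
obs T

So ideally, you want all predictions to fall into obs F, pred F (true negatives) and obs T, pred T (true positives). So you want a strong diagonal from top left to bottom right and nothing in the other diagonal from top right to bottom left. Otherwise, you have Type 1 (obs F, pred T) and/or Type 2 (obs T, pred F) errors mixed in with all predictions.

From the output above: 
**True negatives (obs F, pred F) make up 15% of total predictions, true positives (obs T, pred T) 55%, Type 1 errors 22% and Type 2 errors 8%.**

So the 70% of predictions that are correct are broken up in this way. The model predicts a reorder far more often than not, which makes sense since about 63% of the test rows (6,309,263 of 10,017,619) are actual reorders.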

With a confusion matrix, you get a good idea of the class breakdown, the predicted class breakdown, and the breakdown of error types.

In [61]:
#Classification Report
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for Model 1 on the held-out split.
report = classification_report(y_test, predictions1)
print(report)

             precision    recall  f1-score   support

          0       0.65      0.40      0.50   3708356
          1       0.71      0.87      0.78   6309263

avg / total       0.69      0.70      0.68  10017619



Precision (Positive Predictive Value, PPV): ratio of tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. 

**The precision is the accuracy of the positive predictions.**

The best value is 1 and the worst value is 0.

Recall (Sensitivity/True Positive Rate): ratio of tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. 

**The recall is intuitively the ability of the classifier to find all the positive samples from test data.**

The best value is 1 and the worst value is 0.

F-1 Score: the harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. Put another way, it is the balance between precision and recall.

The formula for the F1 score is:

F1 = 2 X (precision X recall) / (precision + recall)

In the multi-class and multi-label case, this is the weighted average of the F1 score of each class.

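Based on the report output, the support-weighted average precision is .69 (.71 for the reordered class and .65 for the not-reordered class). Precision for the reordered class could be higher if it were not for the high number of false positives. The weighted average recall is .70: recall for the reordered class is high at .87, but recall for the not-reordered class is only .40, so the model misses most of the items that were not reordered. The weighted average F1 score is .68, which needs to be compared to the following model's performance metrics to see if this first model is indeed a good model for predicting customer reorders.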

Next, we build a second table with additional engineered features and compare the tree's prediction performance on it with its performance on the first table.

In [16]:
# Interaction feature: order number multiplied by days since the prior order.
# NOTE(review): this and the following cells add columns to `final` in place,
# so re-running the Model 1 set-up cell afterwards would pull them into X too.
final['order_number*days_since_prior_order'] = final['order_number']*final['days_since_prior_order']

In [17]:
# NOTE(review): target leakage -- this column is computed from `reordered`,
# which is the label being predicted. It must be excluded from the feature
# matrix before any model is fitted, otherwise the evaluation is meaningless.
final['order_number*reordered'] = final['order_number']*final['reordered']

In [18]:
# Copy of add_to_cart_order shifted by a constant 15.
# NOTE(review): a constant shift preserves ordering, so a tree presumably gains
# nothing from it beyond the original column -- confirm this is intended.
final['add_to_cart_order2'] = final['add_to_cart_order'] + 15

In [19]:
# Copy of order_dow shifted by a constant 8.
# NOTE(review): a constant shift preserves ordering, so a tree presumably gains
# nothing from it beyond the original column -- confirm this is intended.
final['order_dow2'] = final['order_dow'] + 8

In [20]:
# Copy of order_hour_of_day scaled by one half.
# NOTE(review): positive scaling preserves ordering, so a tree presumably gains
# nothing from it beyond the original column -- confirm this is intended.
final['order_hour_of_day2'] = final['order_hour_of_day']*.5
final.head()

Unnamed: 0,add_to_cart_order,reordered,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_number*days_since_prior_order,order_number*reordered,add_to_cart_order2,order_dow2,order_hour_of_day2
0,1,1,3,5,9,8.0,24.0,3,16,13,4.5
1,2,1,3,5,9,8.0,24.0,3,17,13,4.5
2,3,0,3,5,9,8.0,24.0,0,18,13,4.5
3,4,1,3,5,9,8.0,24.0,3,19,13,4.5
4,5,0,3,5,9,8.0,24.0,0,20,13,4.5


# Data Set-Up to make Model 2

In [None]:
# `reordered` is the target value. `order_number*reordered` is computed from the
# target, so it is excluded as well: leaving it in would leak the label into the
# features (for any order_number > 0 it is non-zero exactly when reordered == 1).
X2 = final.drop(columns=['reordered', 'order_number*reordered'])
y2 = final['reordered']

# Same split parameters as Model 1 so that the two models are comparable.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33, random_state=42)

In [None]:
#This took around __ minutes to run...

# Re-fits the `grid_search` object created for Model 1 (same estimator,
# param_grid, cv and scoring) on the Model 2 training split.
# NOTE(review): the same object is reused, so the Model 1 search results are
# presumably no longer available from it after this cell runs.
grid_search.fit(X2_train, y2_train)

#Getting best estimator directly. Inputting results in refit Decision Tree model.
grid_search.best_estimator_

# Training Decision Tree Model 2

In [None]:
### MAKE SURE TO CHECK THAT MAX_DEPTH MATCHES GRID SEARCH BEST ESTIMATOR

# Model 2: a depth-limited tree fitted on the second training split. `fit`
# returns the estimator itself, so construction and training are chained.
tree_classifier = DecisionTreeClassifier(max_depth=11).fit(X2_train, y2_train)

# Display the fitted estimator and its hyper-parameters.
tree_classifier

# Predicting with Model 2

In [None]:
#Predicting on test data
# Class labels predicted for the Model 2 held-out split by whichever estimator
# `tree_classifier` currently refers to (the Model 2 tree when run in order).
predictions2 = tree_classifier.predict(X2_test)
predictions2

# Assessing Model 2 Performance

In [None]:
#Accuracy report

# Stored under its own name so the Model 1 score in `pred1_accuracy` is not
# overwritten and the two models can be compared side by side.
pred2_accuracy = accuracy_score(y2_test, predictions2)
pred2_accuracy

In [None]:
#Confusion matrix/contingency table

# Confusion matrix for Model 2 on its held-out split.
# NOTE(review): rebinds `matrix`, so the Model 1 confusion matrix is no longer
# available under this name after this cell runs.
matrix = confusion_matrix(y2_test, predictions2)
print(matrix)

In [None]:
#Classification Report
from sklearn.metrics import classification_report

# Evaluate Model 2 on its own held-out labels and predictions. The previous
# version passed `y_test, predictions1` and so re-printed the Model 1 report.
report = classification_report(y2_test, predictions2)
print(report)