In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC # not stochastic
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import export_graphviz
import re
from sqlalchemy import create_engine
import pickle
import warnings
# import requests
# from bs4 import BeautifulSoup
# import time
# from fake_useragent import UserAgent
# import sys, os
import flask
import json
from collections import defaultdict

# Suppress every warning for the rest of the session (applies to all libraries).
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Notebook-wide constants. NOTE(review): neither is referenced by the cells
# visible here — presumably meant for seeding estimators and for
# train_test_split; confirm or remove.
RANDOM_STATE = 1
TEST_SIZE = 0.3

### Load the cleaned data

In [2]:
# Columns that are kept out of the feature matrix (the last one is the target).
NON_FEATURE_COLS = ('q_id', 'q_user', 'a_id', 'a_user', 'label_col')

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by a trusted source.
with open("cleaned2_data.pkl", mode='rb') as fh:
    df = pickle.load(fh)

# Preserve the DataFrame's own column order for the feature list.
X_cols = [name for name in df.columns if name not in NON_FEATURE_COLS]
y = df['label_col']
X = df[X_cols]

In [65]:
# Fit a one-tree, depth-3 forest purely to obtain a small tree to inspect.
# Seeding with RANDOM_STATE fixes the bootstrap sample and the per-split feature
# sampling, so the node arrays inspected in the following cells are reproducible
# after a kernel restart.
rf = RandomForestClassifier(n_estimators = 1, max_depth = 3, random_state = RANDOM_STATE)
rf.fit(X, y)
# estimators_ is the list of fitted decision trees; tree_ is the array-based
# structure (children_left, feature, threshold, ...) of the first one.
one_tree = rf.estimators_[0].tree_

In [66]:
# export_graphviz is documented to take the fitted decision-tree estimator, not
# its low-level tree_ object, so pass the forest's single estimator.
# Depending on the scikit-learn version, the default out_file either writes a
# .dot file or returns the DOT source as a string.
export_graphviz(rf.estimators_[0])

In [67]:
# Per-node id of the left child; -1 means the node has no children (a leaf).
one_tree.children_left

array([ 1,  2,  3, -1, -1,  6, -1, -1,  9, 10, -1, -1, 13, -1, -1])

In [68]:
# Per-node id of the right child; -1 appears at the same leaf positions as in children_left.
one_tree.children_right

array([ 8,  5,  4, -1, -1,  7, -1, -1, 12, 11, -1, -1, 14, -1, -1])

In [69]:
# Per-node column index of the feature tested at that split; leaf slots hold -2 in the output.
one_tree.feature

array([ 3,  3, 32, -2, -2, 41, -2, -2, 10,  0, -2, -2, 36, -2, -2])

In [76]:
# Per-node split threshold (the test is "feature <= threshold"); leaf slots hold -2 in the output.
one_tree.threshold

array([  1.18500000e+04,   1.09700000e+03,   5.50000000e+00,
        -2.00000000e+00,  -2.00000000e+00,   4.92500000e+02,
        -2.00000000e+00,  -2.00000000e+00,   6.02500000e+02,
         7.50000000e+00,  -2.00000000e+00,  -2.00000000e+00,
         9.50000000e+00,  -2.00000000e+00,  -2.00000000e+00])

In [77]:
# Per-node sample count; in the output each pair of children sums to its parent.
one_tree.n_node_samples

array([9008, 6917, 2776,  447, 2329, 4141,  683, 3458, 2091, 1633, 1387,
        246,  458,  307,  151])

In [34]:
# Translate a feature index (as stored in one_tree.feature) into its column name.
# Only the last expression of a cell is displayed, so the bare
# `one_tree.feature` that used to precede this line had no effect.
X.columns[35]

'a_year'

### Turn the random forest’s single tree object into a nested dict so json can use it

In [83]:
def tree_to_li(one_tree, featname_li, parent_index = None, direction = None):
    """Convert an array-based decision tree into a nested, JSON-friendly dict.

    Parameters
    ----------
    one_tree : object
        Tree structure exposing the parallel, node-indexed sequences
        ``children_left``, ``children_right``, ``feature``, ``threshold`` and
        ``n_node_samples``. A left-child id of -1 marks a leaf.
    featname_li : sequence
        Feature labels, indexed by the values stored in ``one_tree.feature``.
        Labels are converted with ``str()``, so non-string labels (for example
        integer column labels) are accepted.
    parent_index : int, optional
        Id of the parent node. Only used together with ``direction``.
    direction : {"l", "r"}, optional
        Which child of ``parent_index`` to describe. Any other value (including
        the default ``None``) describes the root, node 0.

    Returns
    -------
    dict
        ``{'name': node_id, 'feat': label}`` for a leaf, with an additional
        ``'children': [left, right]`` entry for a split node. ``label`` is
        ``"<feature> <= <threshold> (n = <samples>)"`` for splits and
        ``"(n = <samples>)"`` for leaves.
    """
    # Work out which node this call describes.
    if direction == "l":
        node_id = int(one_tree.children_left[parent_index])
    elif direction == "r":
        node_id = int(one_tree.children_right[parent_index])
    else:
        node_id = 0

    # int() turns numpy integers into plain ints so the dict is JSON-serialisable.
    n = int(one_tree.n_node_samples[node_id])
    node = {'name': node_id}

    # Leaf: no split to describe, only the sample count.
    if one_tree.children_left[node_id] == -1:
        node['feat'] = "(n = " + str(n) + ")"
        return node

    # str() guards against non-string feature labels, which would otherwise
    # raise TypeError in the concatenation below.
    feat_label = str(featname_li[one_tree.feature[node_id]])
    thres = one_tree.threshold[node_id]
    node['feat'] = feat_label + " <= " + str(thres) + " (n = " + str(n) + ")"
    node['children'] = [tree_to_li(one_tree, featname_li, node_id, 'l'),
                        tree_to_li(one_tree, featname_li, node_id, 'r')]
    return node

In [84]:
# Convert the fitted tree into a nested dict, labelling each split with the DataFrame's column names.
tree_to_li(one_tree, X.columns)

{'children': [{'children': [{'children': [{'feat': '(n = 447)', 'name': 3},
      {'feat': '(n = 2329)', 'name': 4}],
     'feat': 'q_day <= 5.5 (n = 2776)',
     'name': 2},
    {'children': [{'feat': '(n = 683)', 'name': 6},
      {'feat': '(n = 3458)', 'name': 7}],
     'feat': 'q_body_len <= 492.5 (n = 4141)',
     'name': 5}],
   'feat': 'q_rep <= 1097.0 (n = 6917)',
   'name': 1},
  {'children': [{'children': [{'feat': '(n = 1387)', 'name': 10},
      {'feat': '(n = 246)', 'name': 11}],
     'feat': 'q_score <= 7.5 (n = 1633)',
     'name': 9},
    {'children': [{'feat': '(n = 307)', 'name': 13},
      {'feat': '(n = 151)', 'name': 14}],
     'feat': 'a_mon <= 9.5 (n = 458)',
     'name': 12}],
   'feat': 'a_bronze <= 602.5 (n = 2091)',
   'name': 8}],
 'feat': 'q_rep <= 11850.0 (n = 9008)',
 'name': 0}

In [37]:
# Keep the nested dict in `tree` so it can be serialised later.
# NOTE(review): the TypeError recorded under this cell has execution count 37,
# older than the tree_to_li definition (83) — it looks stale; re-run to confirm.
tree = tree_to_li(one_tree, X.columns)
tree

TypeError: 'int' object is not subscriptable

In [21]:
#flask.jsonify(tree)

## Stuff I ended up not using

In [None]:
# Source: sf17_ds6/class_lectures/week01-benson/03-python_jupyter/Python2.ipynb
# def tree(): 
#     return defaultdict(tree)

In [None]:
# Source: http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
# node_depth = np.zeros(shape=n_nodes)
# is_leaves = np.zeros(shape=n_nodes, dtype=bool)
# stack = [(0, -1)]  # seed is the root node id and its parent depth
# while len(stack) > 0:
#     node_id, parent_depth = stack.pop()
#     node_depth[node_id] = parent_depth + 1

#     # If we have a test node
#     if (children_left[node_id] != children_right[node_id]):
#         stack.append((children_left[node_id], parent_depth + 1))
#         stack.append((children_right[node_id], parent_depth + 1))
#     else:
#         is_leaves[node_id] = True