In [1]:
%matplotlib inline
import pandas as pd

# Purpose

In [2]:
# Identify groups of observations that are different than others, then generate a list of these groups with their characteristics

In [5]:
def decision_tree(X,y,max_depth,min_samples_leaf):

    import numpy as np

    from sklearn.tree import DecisionTreeClassifier

    from sklearn.tree import DecisionTreeRegressor

    import pandas as pd

#     X=pd.get_dummies(X)

 

# Make a decision tree (classifier or regression)

    if y.dtype.name in ('float32','float64'):

        estimator = DecisionTreeRegressor(max_depth =max_depth,min_samples_leaf=min_samples_leaf)

    else:

        estimator = DecisionTreeClassifier(max_depth =max_depth,min_samples_leaf=min_samples_leaf)

    estimator.fit(X,y)

    return estimator

def leaf_aggregates(y,leave_id):

    import pandas as pd

    import numpy as np

    leaves_y=pd.DataFrame({'y': y ,'leaf_id':leave_id,'count':1})

    leaf_df =leaves_y.groupby("leaf_id").agg({"y": "mean","count":"count"})

    comparison_y= np.average(y)

    leaf_df["divergence"]=np.abs(leaf_df["y"]-comparison_y)

    leaf_df.reset_index(inplace=True)

    leaf_df = leaf_df.sort_values(by=['divergence'],ascending=False)

    return leaf_df

    leaf_df.plot.scatter("count","divergence")

def get_rules(estimator,X,target_id,leave_id):

    import pandas as pd

    # A matrix with an indicator for sample i whether node j is in its path

    node_indicator = estimator.decision_path(X)

    # Find the first sample that is in that leaf

    for i in range(len(X)):

        if node_indicator[i,target_id]==1:

            sample_id=i

            break

    # feature and threshold for every node

    feature = estimator.tree_.feature

    threshold = estimator.tree_.threshold

 

    # All of the nodes leading to the leaf

    node_index = node_indicator.indices[node_indicator.indptr[sample_id]:

                                        node_indicator.indptr[sample_id + 1]]

    # dataframe of rules

    rules = pd.DataFrame(columns = ['node_id','feature','sign','threshold'])

    for node_id in node_index:

        if leave_id[sample_id] == node_id:

            continue

        if (X.iloc[sample_id, feature[node_id]] <= threshold[node_id]):

            threshold_sign = "<="

        else:

            threshold_sign = ">"

 

        rules = rules.append({'node_id':node_id,'feature':X.columns[feature      [node_id]],'sign':threshold_sign,'threshold':threshold[node_id]}, ignore_index=True)   

    # organize rules

#     rules =rules[rules.sign=='>'].groupby(["feature","sign"]).agg({"threshold":"max"}).append(rules[rules.sign=='<='].groupby(["feature","sign"]).agg({"threshold":"min"}) ).reset_index().sort_values("feature")

    return rules

# for a given depth, returns all of the leaves

def decision_tree_analyzer(X,y,max_depth=5,min_samples_leaf = 1000):

    estimator = decision_tree(X,y,max_depth,min_samples_leaf)

    leave_id = estimator.apply(X)

    leaf_df = leaf_aggregates(y,leave_id)

    target_id = int(leaf_df.loc[leaf_df["divergence"].idxmax()].leaf_id)

    for target_id in leaf_df.leaf_id:

        rules = get_rules(estimator,X,target_id,leave_id)

        print(leaf_df[leaf_df.leaf_id==target_id])

        print(rules)

        print('')

 

# gives the most divergent leaf for every tree from depth 1 to 'deepest'       

def decision_tree_analyzer2(X,y,deepest=5,min_samples_leaf = 1000):

    for max_depth in range(1,deepest):

        estimator = decision_tree(X,y,max_depth,min_samples_leaf)

        leave_id = estimator.apply(X)

        leaf_df = leaf_aggregates(y,leave_id)

        target_id = int(leaf_df.loc[leaf_df["divergence"].idxmax()].leaf_id)

        rules = get_rules(estimator,X,target_id,leave_id)

        print(leaf_df[leaf_df.leaf_id==target_id])

        print(rules)

        print('')

        
def divergence_table(data,y_column,count_column,min_count=1000):

    import numpy as np
    avg_y=np.mean(data[y_column])

    big_var_table=pd.DataFrame(columns=[y_column, count_column, 'divergence', 'value',

           'variable'])

    for col in data.columns.drop(y_column):

        var_table = data.groupby(col).agg({y_column:"mean",count_column:"count"})

        var_table["divergence"]=np.abs(var_table[y_column]-avg_y)

        var_table=var_table[var_table[count_column]>min_count]

        var_table["value"]=var_table.index

        var_table = var_table.reset_index(drop=True)

        var_table = var_table.sort_values(by=['divergence'],ascending=False)

        var_table["variable"]=col

    #     var_table=var_table.rename(index=str, columns={col: "value"})

        big_var_table=big_var_table.append(var_table,ignore_index=True)

    return big_var_table.sort_values(by=['divergence'],ascending=False)

#todo: output the graph of leaves: count vs average
#todo: use out of sample count and average
#todo: prettify rules - smooth out >2.5 as >=3 if this is an int, use = if there is only one option


# Examples

# Gender Survey

In [34]:
# https://www.kaggle.com/hb20007/gender-classification
data=pd.read_csv('gender survey.csv')

In [38]:
data["count"]=1

In [40]:
X=pd.get_dummies(data,drop_first=True).fillna(0)

## Divergence Table

In [43]:
divergence_table(X,"Gender_M","count",min_count=10)

Unnamed: 0,Gender_M,count,divergence,value,variable
7,0.235294,17,0.264706,1,Favorite Music Genre_Pop
12,0.642857,14,0.142857,1,Favorite Beverage_Doesn't drink
14,0.363636,11,0.136364,1,Favorite Beverage_Other
8,0.591837,49,0.091837,0,Favorite Music Genre_Pop
2,0.409091,22,0.090909,1,Favorite Color_Warm
21,0.571429,14,0.071429,1,Favorite Soft Drink_Fanta
5,0.448276,58,0.051724,0,Favorite Music Genre_Hip hop
3,0.545455,44,0.045455,0,Favorite Color_Warm
13,0.461538,52,0.038462,0,Favorite Beverage_Doesn't drink
19,0.46875,32,0.03125,1,Favorite Soft Drink_Coca Cola/Pepsi


## Decision Tree Analyzer 

In [46]:
decision_tree_analyzer(X.drop("Gender_M",axis=1),X.Gender_M,max_depth=5,min_samples_leaf = 20)

   leaf_id         y  count  divergence
0        2  0.590909     22    0.090909
  node_id                              feature sign  threshold
0       0                  Favorite Color_Warm   <=        0.5
1       1  Favorite Soft Drink_Coca Cola/Pepsi   <=        0.5

   leaf_id         y  count  divergence
2        4  0.409091     22    0.090909
  node_id              feature sign  threshold
0       0  Favorite Color_Warm    >        0.5

   leaf_id    y  count  divergence
1        3  0.5     22         0.0
  node_id                              feature sign  threshold
0       0                  Favorite Color_Warm   <=        0.5
1       1  Favorite Soft Drink_Coca Cola/Pepsi    >        0.5



# Ames Housing Data

In [49]:
# https://www.kaggle.com/samaxtech/ames-housing-data
data=pd.read_csv('AmesHousing/AmesHousing.tsv',sep='\t')

In [67]:
data.shape

(2930, 83)

In [50]:
data.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [51]:
data["count"]=1

## Divergence Table

In [54]:
divergence_table(data,"SalePrice","count",min_count=100)

Unnamed: 0,SalePrice,count,divergence,value,variable
93,377918.616822,107,197122.556754,Ex,Exter Qual
54,368336.766355,107,187540.706287,9,Overall Qual
146,337339.341463,205,156543.281395,Ex,Kitchen Qual
101,333369.313953,258,152573.253885,Ex,Bsmt Qual
30,322018.265060,166,141222.204992,NridgHt,Neighborhood
170,310304.622995,374,129508.562926,3,Garage Cars
207,275751.309623,239,94955.249555,New,Sale Type
209,273374.371429,245,92578.311360,Partial,Sale Condition
55,270913.594286,350,90117.534217,8,Overall Qual
107,267507.644366,284,86711.584298,Gd,Bsmt Exposure


In [55]:
X=pd.get_dummies(data,drop_first=True).fillna(0)

## Decision Tree Analyzer 

In [56]:
decision_tree_analyzer(X.drop("SalePrice",axis=1),X.SalePrice,max_depth=5,min_samples_leaf = 20)

    leaf_id              y  count     divergence
26       52  395179.015625     64  214382.955557
  node_id       feature sign  threshold
0       0     Full Bath    >        1.5
1      28  Overall Qual    >        7.5
2      40    Year Built    >     1998.5
3      46    1st Flr SF    >     1684.5
4      50  Mas Vnr Area    >      346.0

    leaf_id              y  count     divergence
25       51  337405.785714     70  156609.725646
  node_id       feature sign  threshold
0       0     Full Bath    >        1.5
1      28  Overall Qual    >        7.5
2      40    Year Built    >     1998.5
3      46    1st Flr SF    >     1684.5
4      50  Mas Vnr Area   <=      346.0

    leaf_id              y  count     divergence
22       45  329658.823529     34  148862.763461
  node_id        feature sign  threshold
0       0      Full Bath    >        1.5
1      28   Overall Qual    >        7.5
2      40     Year Built   <=     1998.5
3      41  Garage Yr Blt    >     1994.5

    leaf_id       

## Decision Tree Analyzer For Remainders of Regression

In [60]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X.drop("SalePrice",axis=1),data.SalePrice)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [62]:
y_pred = lr.predict(X.drop("SalePrice",axis=1))
y_error = data.SalePrice-y_pred

In [63]:
decision_tree_analyzer(X.drop("SalePrice",axis=1),y_error,max_depth=5,min_samples_leaf = 20)

    leaf_id             y  count    divergence
11       23  47637.164789     34  47637.164789
  node_id        feature sign  threshold
0       0  Total Bsmt SF    >     1924.0
1      20        Mo Sold   <=        7.5
2      21    Garage Area    >      739.0

   leaf_id             y  count    divergence
9       19  29182.563646     21  29182.563646
  node_id        feature sign  threshold
0       0  Total Bsmt SF   <=     1924.0
1       1    Garage Area    >      434.5
2      11      Full Bath    >        2.5
3      17  Total Bsmt SF    >     1268.5

   leaf_id             y  count    divergence
2        7  16382.579543     26  16382.579543
  node_id        feature sign  threshold
0       0  Total Bsmt SF   <=     1924.0
1       1    Garage Area   <=      434.5
2       2          Order    >       88.5
3       4   Overall Qual   <=        3.5
4       5       Lot Area    >     8121.0

    leaf_id             y  count    divergence
12       24 -15060.067835     25  15060.067835
  node_id 

## Decision Analyzer 2: Gives the most divergent leaf of every tree up to X deep

In [64]:
decision_tree_analyzer2(X.drop("SalePrice",axis=1),y_error,deepest=5,min_samples_leaf = 20)

   leaf_id             y  count    divergence
1        2  16790.644426     80  16790.644426
  node_id        feature sign  threshold
0       0  Total Bsmt SF    >     1924.0

   leaf_id             y  count    divergence
2        5  31268.240908     55  31268.240908
  node_id        feature sign  threshold
0       0  Total Bsmt SF    >     1924.0
1       4        Mo Sold   <=        7.5

   leaf_id             y  count    divergence
5       11  47637.164789     34  47637.164789
  node_id        feature sign  threshold
0       0  Total Bsmt SF    >     1924.0
1       8        Mo Sold   <=        7.5
2       9    Garage Area    >      739.0

   leaf_id             y  count    divergence
8       17  47637.164789     34  47637.164789
  node_id        feature sign  threshold
0       0  Total Bsmt SF    >     1924.0
1      14        Mo Sold   <=        7.5
2      15    Garage Area    >      739.0



## Boston Housing Data

In [65]:
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
boston = load_boston()
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(boston.data, boston.target)


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [66]:
X=pd.DataFrame(boston.data)
decision_tree_analyzer(X,boston.target,max_depth=5,min_samples_leaf = 10)

    leaf_id       y  count  divergence
18       37  47.975     16   25.442194
  node_id feature sign  threshold
0       0       5    >      6.941
1      28       5    >      7.437
2      36      10   <=     15.400

    leaf_id          y  count  divergence
19       38  41.807143     14   19.274337
  node_id feature sign  threshold
0       0       5    >      6.941
1      28       5    >      7.437
2      36      10    >     15.400

    leaf_id      y  count  divergence
14       31  37.19     10   14.657194
  node_id feature sign  threshold
0       0       5    >     6.9410
1      28       5   <=     7.4370
2      29      12   <=     5.4950
3      30       7   <=     5.3229

    leaf_id        y  count  divergence
13       27  9.81087     46   12.721937
  node_id feature sign  threshold
0       0       5   <=      6.941
1       1      12    >     14.400
2      13       4    >      0.607
3      21      12    >     19.645
4      25       9    >    551.500

   leaf_id      y  count  diverg