#### Notebook for testing proximity matrix

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

In [3]:
from scipy.spatial.distance import pdist, squareform

#### Read in data and preprocess

In [4]:
# Load the iris bunch and wrap its measurements in a DataFrame whose
# columns are named after the features.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

In [5]:
# Index of the feature column names, taken from the dataset itself rather
# than from a hard-coded column count, so it does not depend on how many
# columns `df` currently has.
features = pd.Index(iris.feature_names)

In [6]:
# Attach the species name of every sample as a categorical column.
# This is the target we are going to try to predict.
df['species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)

In [7]:
# df['species'] holds the actual species names. Before we can use it as a
# target, each name has to be converted into an integer code; with three
# species the codes are 0, 1 and 2.
y = pd.factorize(values=df['species'])[0]

#### Fit random forest on full dataset

In [8]:
# Random forest classifier (by convention `clf` means 'classifier').
# oob_score=True keeps the out-of-bag score available after fitting and
# random_state=0 makes the run reproducible.
clf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=0,
)

# Fit on the full dataset: learn how the feature columns relate to the
# encoded species in y.
clf.fit(X=df[features], y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

In [9]:
# Out-of-bag score of the fitted forest (available because the classifier
# was created with oob_score=True).
clf.oob_score_

0.9533333333333334

#### Build a function for the proximity matrix

In [10]:
def proximityMatrix(model, df, normalize=True, dist=True):

    '''
    Takes in a model and the model data matrix and calculates the proximity matrix.

    The proximity of two observations is the number of trees in which they
    end up in the same terminal node (have the same leaf index according to
    ``model.apply``). Optionally the proximities are turned into distances
    and/or scaled by the number of trees.

    Parameters
    ----------
    model : model object
        A trained sklearn model with apply method

    df : Pandas DataFrame
        Model Matrix for calculating proximity matrix.
        Must have the same features as matrix used to train the model

    normalize : bool, optional (default=True)
        Divides the entries by the number of trees, giving values
        between 0 and 1

    dist : bool, optional (default=True)
        Converts proximity matrix into a distance matrix by taking
        (number of trees - proximity), i.e. 1 - proximity when normalized.
        The smaller the value, the closer the distance.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples)
        Symmetric float matrix. With ``dist=False`` the diagonal holds the
        maximum proximity (number of trees, or 1 when normalized); with
        ``dist=True`` the diagonal is 0.
    '''

    leaf_indices = np.asarray(model.apply(df))

    # `apply` gives a 1-D array for a single tree and can give more than two
    # dimensions for some ensembles; use one column per tree in every case.
    if leaf_indices.ndim != 2:
        leaf_indices = leaf_indices.reshape(leaf_indices.shape[0], -1)
    n_trees = leaf_indices.shape[1]

    # The built-in Hamming metric is the fraction of trees in which a pair of
    # observations lands in different leaves. It runs in compiled code instead
    # of calling a Python function once per pair. Scaling by n_trees and
    # rounding recovers the exact integer number of mismatching trees.
    mismatches = np.rint(pdist(leaf_indices, metric='hamming') * n_trees)

    # Number of trees for which a pair of observations share a leaf.
    prox_mat = squareform(n_trees - mismatches)

    # squareform leaves the diagonal at zero, but every observation shares
    # a leaf with itself in all trees.
    np.fill_diagonal(prox_mat, n_trees)

    if dist:
        prox_mat = n_trees - prox_mat

    if normalize:
        prox_mat = prox_mat / n_trees

    return prox_mat

In [11]:
# Normalized distance matrix: entries between 0 and 1, 0 on the diagonal.
pd.DataFrame(
    proximityMatrix(model=clf, df=df[features], normalize=True, dist=True)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.02,0.09,0.42,0.05,0.03,0.00,0.30,0.03,0.12,0.40
146,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.30,0.25,0.38,0.31,0.31,0.30,0.00,0.27,0.30,0.42
147,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.04,0.10,0.39,0.05,0.05,0.03,0.27,0.00,0.09,0.37
148,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.11,0.17,0.35,0.10,0.10,0.12,0.30,0.09,0.00,0.33


In [12]:
# Unnormalized distance matrix: number of trees in which a pair of
# observations does NOT share a leaf.
pd.DataFrame(
    proximityMatrix(model=clf, df=df[features], normalize=False, dist=True)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,2.0,9.0,42.0,5.0,3.0,0.0,30.0,3.0,12.0,40.0
146,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,30.0,25.0,38.0,31.0,31.0,30.0,0.0,27.0,30.0,42.0
147,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,4.0,10.0,39.0,5.0,5.0,3.0,27.0,0.0,9.0,37.0
148,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,11.0,17.0,35.0,10.0,10.0,12.0,30.0,9.0,0.0,33.0


In [13]:
# Normalized proximity matrix: fraction of trees in which a pair of
# observations shares a leaf (1 on the diagonal).
pd.DataFrame(
    proximityMatrix(model=clf, df=df[features], normalize=True, dist=False)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.98,0.91,0.58,0.95,0.97,1.00,0.70,0.97,0.88,0.60
146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.70,0.75,0.62,0.69,0.69,0.70,1.00,0.73,0.70,0.58
147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.96,0.90,0.61,0.95,0.95,0.97,0.73,1.00,0.91,0.63
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.89,0.83,0.65,0.90,0.90,0.88,0.70,0.91,1.00,0.67


In [14]:
# Unnormalized proximity matrix: number of trees in which a pair of
# observations shares a leaf.
pd.DataFrame(
    proximityMatrix(model=clf, df=df[features], normalize=False, dist=False)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,98.0,91.0,58.0,95.0,97.0,100.0,70.0,97.0,88.0,60.0
146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,70.0,75.0,62.0,69.0,69.0,70.0,100.0,73.0,70.0,58.0
147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,90.0,61.0,95.0,95.0,97.0,73.0,100.0,91.0,63.0
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,89.0,83.0,65.0,90.0,90.0,88.0,70.0,91.0,100.0,67.0
