In [1]:
import os
import sys
# Put the parent directory on sys.path so the `src` package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
from pathlib import Path

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from joblib import dump, load
import pickle

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
from src.features.proximityMatrix import proximityMatrix

In [6]:
# Project root: the parent of the (symlink-resolved) current working directory.
project_dir = Path.cwd().resolve().parents[0]

#### Load processed data

In [7]:
# Load the pickled dataset dictionary from <project_dir>/data/processed/datasets.
# The context manager closes the file handle even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(project_dir / 'data' / 'processed' / 'datasets', 'rb') as infile:
    new_dict = pickle.load(infile)

In [8]:
# Inspect which entries the loaded dictionary provides.
new_dict.keys()

dict_keys(['X_train', 'X_test', 'y_train', 'y_test'])

In [9]:
# Unpack the four train/test splits from the loaded dictionary into individual names.
X_train, X_test, y_train, y_test = (
    new_dict[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
)

#### Load model

In [10]:
# Load the previously saved classifier from <project_dir>/models/clf.
# The context manager closes the file handle even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(project_dir / 'models' / 'clf', 'rb') as infile:
    clf_old = pickle.load(infile)

#### Retrain model on full dataset to get proximity matrix

In [11]:
# Stack the train and test feature rows back into a single frame (row-wise).
X = pd.concat((X_train, X_test), axis='index')

In [12]:
# Stack the train and test targets in the same order as the feature rows.
y = pd.concat((y_train, y_test), axis='index')

In [13]:
# Random forest with out-of-bag scoring enabled; the fixed seed keeps the fit reproducible.
clf = RandomForestClassifier(
    oob_score=True,
    random_state=0,
    n_estimators=100,
)

In [14]:
# Fit the forest on the combined train + test rows (X, y).
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
# Out-of-bag score of the fitted forest (populated because oob_score=True was passed at construction).
clf.oob_score_

0.9945615227736234

#### Get proximity matrix

In [16]:
# Subsample to reduce computation time: keep only the 10% hold-out part of the split.
# train_test_split returns [X_rest, X_sub, y_rest, y_sub]; slicing [1::2] picks the two
# hold-out pieces without assigning to `_`, which IPython reserves for the last cell
# output (a user assignment to `_` stops IPython from updating it).
X_sub, y_sub = train_test_split(X, y, test_size=0.1, random_state=42)[1::2]

In [17]:
# Sanity-check the subsample size: (rows, feature columns).
X_sub.shape

(3972, 92)

In [18]:
# Pairwise matrix for the subsample, computed by the project helper
# src.features.proximityMatrix using the forest fitted on the full dataset.
# NOTE(review): the displayed result has 0.00 on the diagonal, so the helper appears to
# return a dissimilarity (1 - proximity) rather than a raw proximity — confirm in the helper.
prox_matrix = proximityMatrix(clf, X_sub)

In [23]:
# Show the matrix side by side with the labels; the label index is reset so it lines up
# positionally with the matrix rows instead of by the original row index.
pd.concat(
    [
        pd.DataFrame(prox_matrix),
        y_sub.reset_index(drop=True),
    ],
    axis="columns",
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3963,3964,3965,3966,3967,3968,3969,3970,3971,loan_status
0,0.00,0.99,0.96,0.99,0.95,0.99,1.00,0.99,0.96,0.99,...,0.95,0.92,0.98,0.96,0.77,1.0,1.00,0.98,0.99,0
1,0.99,0.00,0.99,1.00,1.00,0.99,1.00,1.00,0.99,0.76,...,0.98,1.00,1.00,1.00,0.99,1.0,1.00,1.00,0.95,0
2,0.96,0.99,0.00,0.95,0.84,0.89,1.00,0.95,1.00,1.00,...,1.00,0.85,0.85,0.85,0.90,1.0,0.97,0.92,1.00,0
3,0.99,1.00,0.95,0.00,0.93,0.92,0.99,0.94,1.00,1.00,...,1.00,0.98,0.92,0.83,0.99,1.0,0.99,0.95,1.00,0
4,0.95,1.00,0.84,0.93,0.00,0.68,1.00,1.00,1.00,1.00,...,1.00,0.80,0.91,0.88,0.84,1.0,1.00,0.98,1.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3967,0.77,0.99,0.90,0.99,0.84,0.95,1.00,0.98,0.96,1.00,...,0.84,0.88,0.95,0.94,0.00,1.0,1.00,0.97,0.99,0
3968,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,1.00,1.00,1.00,0.0,1.00,1.00,1.00,1
3969,1.00,1.00,0.97,0.99,1.00,0.99,0.91,0.99,1.00,1.00,...,1.00,0.96,1.00,0.98,1.00,1.0,0.00,0.86,1.00,0
3970,0.98,1.00,0.92,0.95,0.98,1.00,1.00,0.94,1.00,1.00,...,1.00,0.95,0.98,0.96,0.97,1.0,0.86,0.00,1.00,0
