In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (only once) so the
# project's `src` package can be imported further down.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
from pathlib import Path

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from joblib import dump, load
import pickle

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
from src.features.proximityMatrix import proximityMatrix

In [6]:
# Project directory = parent of the (resolved) current working directory.
project_dir = Path.cwd().resolve().parents[0]

#### Load processed data

In [7]:
# Load the pickled dictionary of processed datasets.
# The context manager closes the file handle as soon as loading finishes
# (the previous bare open() left it open for the lifetime of the kernel).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(project_dir / 'data' / 'processed' / 'datasets', 'rb') as infile:
    new_dict = pickle.load(infile)

In [8]:
# Inspect which entries the loaded dictionary contains.
new_dict.keys()

dict_keys(['X_train', 'X_test', 'y_train', 'y_test'])

In [9]:
# Pull the four train/test pieces out of the loaded dictionary in one step.
X_train, X_test, y_train, y_test = (
    new_dict[split_name]
    for split_name in ('X_train', 'X_test', 'y_train', 'y_test')
)

#### Load model

In [10]:
# Load the previously saved classifier.
# The context manager closes the file handle once the model has been read
# (the previous bare open() never closed it).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(project_dir / 'models' / 'clf', 'rb') as infile:
    clf_old = pickle.load(infile)

#### Retrain model on full dataset to get proximity matrix

In [11]:
# Stack the train and test feature frames row-wise into one dataset.
X = pd.concat(objs=[X_train, X_test], axis='index')

In [12]:
# Stack the train and test targets row-wise, in the same order as the features.
y = pd.concat(objs=[y_train, y_test], axis='index')

In [13]:
# 100-tree random forest with a fixed seed; oob_score=True makes the fitted
# model expose an out-of-bag score via `oob_score_`.
clf = RandomForestClassifier(
    oob_score=True,
    random_state=0,
    n_estimators=100,
)

In [14]:
# Fit the forest on the combined train + test data.
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
# Out-of-bag score of the fitted forest (available because oob_score=True was set).
clf.oob_score_

0.9945615227736234

#### Get proximity matrix

In [16]:
# Keep only the 25% "test" side of a seeded split as a subsample; the full
# dataset would make the proximity computation too slow.
_, X_sub, _, y_sub = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [17]:
# Size of the subsample as (rows, columns).
X_sub.shape

(9930, 92)

In [18]:
# Compute the proximity matrix for the subsample using the fitted forest.
# NOTE(review): the helper lives in src.features.proximityMatrix and is not
# shown here; the zero diagonal in the displayed result suggests it returns
# dissimilarities (1 - proximity) — confirm against the helper.
prox_matrix = proximityMatrix(clf, X_sub)

In [19]:
# Wrap the raw matrix in a DataFrame (default integer row/column labels).
prox_matrix_df = pd.DataFrame(data=prox_matrix)

In [20]:
# Display the matrix (pandas truncates the rendering of large frames).
prox_matrix_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9920,9921,9922,9923,9924,9925,9926,9927,9928,9929
0,0.00,0.99,0.96,0.99,0.95,0.99,1.00,0.99,0.96,0.99,...,0.99,0.99,1.00,1.00,0.95,1.00,0.94,0.94,1.00,1.00
1,0.99,0.00,0.99,1.00,1.00,0.99,1.00,1.00,0.99,0.76,...,1.00,1.00,1.00,1.00,1.00,1.00,0.99,1.00,1.00,1.00
2,0.96,0.99,0.00,0.95,0.84,0.89,1.00,0.95,1.00,1.00,...,0.98,1.00,0.98,0.88,0.89,1.00,0.89,0.87,0.98,1.00
3,0.99,1.00,0.95,0.00,0.93,0.92,0.99,0.94,1.00,1.00,...,1.00,1.00,1.00,1.00,0.94,1.00,0.98,0.98,0.90,1.00
4,0.95,1.00,0.84,0.93,0.00,0.68,1.00,1.00,1.00,1.00,...,0.99,1.00,0.99,0.98,0.80,1.00,0.72,0.76,0.99,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9925,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,1.00,1.00,1.00,0.00,1.00,1.00,1.00,0.47
9926,0.94,0.99,0.89,0.98,0.72,0.84,1.00,1.00,0.99,1.00,...,0.99,1.00,1.00,1.00,0.89,1.00,0.00,0.86,0.99,1.00
9927,0.94,1.00,0.87,0.98,0.76,0.85,1.00,0.97,1.00,1.00,...,0.98,1.00,0.97,0.97,0.80,1.00,0.86,0.00,0.99,1.00
9928,1.00,1.00,0.98,0.90,0.99,0.98,0.99,1.00,1.00,0.99,...,0.92,1.00,0.94,1.00,0.95,1.00,0.99,0.99,0.00,1.00


#### Output as pickle

In [22]:
# Bundle the subsample and its proximity matrix into a single dictionary.
data = {
    'X_sub': X_sub,
    'y_sub': y_sub,
    'prox_matrix_df': prox_matrix_df,
}

# Write the bundle inside a context manager so the file is flushed and closed
# immediately; the previous bare open() left the handle open, so buffered
# bytes might not reach disk while the kernel was still running.
with open(project_dir / 'data' / 'processed' / "prox_datasets", "wb") as out_file:
    pickle.dump(data, out_file)

In [23]:
# Persist the forest trained on the full dataset.
# The context manager flushes and closes the file right after the dump
# (the previous bare open() never closed the handle).
with open(project_dir / 'models' / "clf_full", "wb") as out_file:
    pickle.dump(clf, out_file)