In [19]:
import os
import sys
import json
import argparse
from time import time
from functools import partial

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.preprocessing import normalize
from graspologic.cluster import GaussianCluster as GMM

from rsq.samplers import *
from rsq.samplers import _Sampler
from rsq.helpers import set_seeds

from pulearn import BaggingPuClassifier
from pulearn import ElkanotoPuClassifier

from sklearn.random_projection import GaussianRandomProjection
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import ClassifierMixin

import pickle

from joblib import Parallel, delayed

def stratified_sample(y, p=0.67, replace=False):
    unique_y, counts = np.unique(y, return_counts=True)
    n_per_class = np.array([int(np.math.floor(p*c)) for c in counts])
    n_per_class = np.array([max([npc, 1]) for npc in n_per_class])
    
    inds = [np.random.choice(np.where(y == unique_y[i])[0], size=npc, replace=replace) for i, npc in enumerate(n_per_class)]
    
    return np.concatenate(inds)

In [20]:
class SemiSupervisedTreeClassifier(ClassifierMixin):
    def __init__(self, max_depth=10, induce=True, 
                 induce_class=GMM, induce_kwargs={'min_components':100, 'max_components':100},
                 random_projection=False,
                 projection_kwargs={}):
        
        self.max_depth=max_depth
        self.fitted=False
        
        self.induce=induce
        self.induce_class=induce_class
        self.induce_kwargs=induce_kwargs
        
        self.random_projection=random_projection
        self.projection_kwargs=projection_kwargs
        self.projector=None
        
    def get_params(self, deep=True):
    # suppose this estimator has parameters "alpha" and "recursive"
    
        params_dict = {"max_deth": self.max_depth, "fitted": self.fitted, 
            'induce': self.induce, 'induce_class': self.induce_class, 'induce_kwargs':self.induce_kwargs,
            'random_projection': self.random_projection, 'projection_kwargs': self.projection_kwargs,
           }
        return params_dict

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
        
    def fit(self, X, y, y_induced=None):
        self.classes_ = np.array([i for i in np.unique(y) if i != -1])
        
        if y_induced is None and self.induce:
            if self.random_projection:
                self.projector = GaussianRandomProjection(**self.projection_kwargs)
                self.projector.fit(X)
                X = self.projector.transform(X.copy())
                
            y_induced = self.induce_class(**self.induce_kwargs).fit_predict(X)
            
        self.tree = DecisionTreeClassifier(max_depth=self.max_depth).fit(X, y_induced)
        decision_paths = self.tree.decision_path(X)
        self.decision_paths = [dp.nonzero()[1] for dp in decision_paths]
        
        labeled_indices = np.where(y != -1)[0]
        self.labeled_decision_paths = [self.decision_paths[i] for i in labeled_indices]
        
        self.labels = y[labeled_indices]
            
        self.nodes_with_labeled_data = np.unique(np.concatenate(self.labeled_decision_paths))
        self.projection_matrix = None
        
        self._get_mapping()
        
        self.fitted=True
    
    def _get_mapping(self):
        self.mapping = {}
                
        for dp in self.decision_paths:
            leaf_node = dp[-1]
                            
            if leaf_node in list(self.mapping.keys()):
                continue
            
            for i in range(1, len(dp)+1):
                temp_node = dp[-i]
                
                if temp_node in self.nodes_with_labeled_data:        
                    temp_counts = np.zeros(len(self.classes_))

                    tree_distance_to_labeled_data = np.zeros(len(self.labeled_decision_paths))
                    for j, labeled_dp in enumerate(self.labeled_decision_paths):
                        if temp_node in labeled_dp:
                            tree_distance_to_labeled_data[j] = i + len(labeled_dp) - 1 - np.where(labeled_dp == temp_node)[0][0] - 1
                        else:
                            tree_distance_to_labeled_data[j] = 100
                                                        
                    min_tree_distance = np.min(tree_distance_to_labeled_data)
                    argmins = np.where(tree_distance_to_labeled_data == min_tree_distance)[0]

                    for index in argmins.astype(int):
                        temp_counts[int(self.labels[index])] += 1
                    
                    self.mapping[leaf_node] = temp_counts / np.sum(temp_counts)
                    break
    
    def predict_proba(self, X):
        if not self.fitted:
            raise ValueError('Not fitted')
            
        if self.random_projection:
            X = self.projector.transform(X.copy())
            
        leaf_nodes = self.tree.apply(X)
        posteriors = np.zeros((len(leaf_nodes), len(self.classes_)))
        
        for i, leaf_node in enumerate(leaf_nodes):
            posteriors[i] = self.mapping[leaf_node]
            
        return posteriors
        
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
        
class SSRF(ClassifierMixin):
    def __init__(self, n_trees=100, supervised_max_depth=10, semi_supervised_max_depth=10, tree_split=None,
                 induce=True, induce_class=GMM, induce_kwargs={'min_components':100, 'max_components':100},
                 random_projection=False, projection_kwargs={}
                ):
        
        self.n_trees=n_trees
        self.sdepth=supervised_max_depth
        self.ssdepth=semi_supervised_max_depth
        self.tree_split = tree_split
        
        self.induce=induce
        self.induce_class=induce_class
        self.induce_kwargs=induce_kwargs
        
        self.random_projection=random_projection
        self.projection_kwargs=projection_kwargs
        
        self.forest = []
        
    def get_params(self, deep=True):
    # suppose this estimator has parameters "alpha" and "recursive"
    
        params_dict = {'n_trees': self.n_trees,
            'supervised_max_depth': self.sdepth, 'semi_supervised_max_depth': self.ssdepth,
            'tree_split': self.tree_split,
            'induce': self.induce, 'induce_class': self.induce_class, 'induce_kwargs':self.induce_kwargs,
            'random_projection': self.random_projection, 'projection_kwargs': self.projection_kwargs,
           }
        return params_dict

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
        
    def _induce_labels(self, X):
        return self.induce_class(**self.induce_kwargs).fit_predict(X)
        
    def fit(self, X, y, y_induced=None, n_cores=1):
        self.classes_ = np.array([i for i in np.unique(y) if i != -1])
        n = len(y)
        
        labeled_indices = np.where(y != -1)[0]
        
        if self.tree_split is None:
            self.tree_split = len(np.where(y != -1)[0]) / n
                    
        n_supervised_trees = int(self.n_trees * self.tree_split)
        n_semi_supervised_trees = self.n_trees - n_supervised_trees
        
        condensed_func = lambda x: self._build_tree(X, y, y_induced, x, stratified=True)
        func_tuples = np.concatenate((np.ones(n_supervised_trees), np.zeros(n_semi_supervised_trees))).astype(int)
        
        self.forest = Parallel(n_jobs=n_cores)(delayed(condensed_func)(tuple_) for tuple_ in func_tuples)
                                
    def predict_proba(self, X):
        posteriors = np.zeros((X.shape[0], len(self.classes_)))
                
        for i, tree in enumerate(self.forest):
            temp = tree.predict_proba(X)
            posteriors += temp
            
        return posteriors / len(self.forest)
    
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
            
    def _build_tree(self, X, y, y_induced, supervised=True, stratified=True):        
        labeled_indices = np.where(y != -1)[0]
        unlabeled_indices = np.where(y == -1)[0]
        
        if len(labeled_indices) == len(y):
            all_supervised=True
        else:
            all_supervised=False
            
        if supervised:
            bag_inds = stratified_sample(y[labeled_indices], p=0.67, replace=False)
            tree = DecisionTreeClassifier(max_depth=self.sdepth)
            tree.fit(X[labeled_indices[bag_inds]], y[labeled_indices[bag_inds]])
            
        else:
            sbag_inds = stratified_sample(y[labeled_indices], p=0.67, replace=False)
            if all_supervised:
                bag_inds = sbag_inds
            else:
                ssbag_inds = np.random.choice(len(unlabeled_indices), size=int(X.shape[0]*0.67), replace=True)
                bag_inds = np.concatenate((labeled_indices[sbag_inds], ssbag_inds))
            
            tree = SemiSupervisedTreeClassifier(max_depth=self.ssdepth, 
                                                induce=self.induce, induce_class=self.induce_class, induce_kwargs=self.induce_kwargs,
                                               random_projection=self.random_projection, projection_kwargs=self.projection_kwargs)
            if y_induced is None:
                tree.fit(X[bag_inds], y[bag_inds])
            else:
                tree.fit(X[bag_inds], y[bag_inds], y_induced[bag_inds])
                
        return tree

In [21]:
data = pickle.load(open('../../../data/train_frontal_Bit_m-r101x1_with_labels.p', 'rb'))

In [43]:
X_all = np.array([vec for vec in data['vector']])

# category_indices = np.array([6,8,10,11,12,13,14,15,16,17,18])
category_indices = np.array([6,8,10,11,12,13,14,15,16,18])

conditions = np.array(list(data.iloc[0, category_indices].keys()))

competition_conditions = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
competition_conditions_indices = [np.where(conditions == c)[0][0] for c in conditions]


labels = np.arange(len(category_indices))

n, d = X_all.shape
y_all = np.zeros(n)

fly_list = []
for i in range(n):
    temp_conditions = data.iloc[i, category_indices]
    positive_conditions = np.where(temp_conditions.values.astype(int) > 0)[0]
    
    if len(positive_conditions) > 1:
        temp_competition_condition_indices = []
        for pc in positive_conditions:
            if pc in competition_conditions_indices:
                temp_competition_condition_indices.append(pc)
        if len(temp_competition_condition_indices) == 1:
            y_all[i] = temp_competition_condition_indices[0]
            fly_list.append(i)
    elif len(positive_conditions) == 1:
        y_all[i] = positive_conditions[0]
        fly_list.append(i)
        
fly_list = np.array(fly_list)
X = X_all[fly_list]
y = y_all[fly_list]

In [44]:
idx_by_label = [np.where(y == c)[0] for c in np.unique(y)]

[print(len(ibl), conditions[i]) for i, ibl in enumerate(idx_by_label)]

16974 No Finding
5591 Cardiomegaly
2535 Lung Lesion
13836 Edema
3378 Consolidation
1622 Pneumonia
7590 Atelectasis
7708 Pneumothorax
27420 Pleural Effusion
3079 Fracture


[None, None, None, None, None, None, None, None, None, None]

In [None]:
np.random.seed(1)

prop_labeled=[0, 0.1, 0.2, 0.5, 0.8, 1]
n_cores=90
n_mc=int(n_cores / len(prop_labeled))

experiment_tuples = []

for i, p in enumerate(prop_labeled):
    for _ in range(n_mc):
        all_inds = stratified_sample(y, p=0.1, replace=False)
        n=len(all_inds)

        inds_sup = stratified_sample(y[all_inds], p, replace=False)

        X_ = X[all_inds]
        y_ = -1 * np.ones(n)
        y_[inds_sup] = y[all_inds[inds_sup]]

        experiment_tuples.append((X_, y_))
    

condensed_func = lambda x: random_forest_exp(x[0], x[1])
start_time = time()
try:
    accuracies = Parallel(n_jobs=n_cores)(delayed(condensed_func)(tupl) for tupl in experiment_tuples)
    print("finished in %1.1f"%(time() - start_time))
except:
    print("error after %1.1f"%(time() - start_time))

In [46]:
def random_forest_exp(X, y, acorn=None, algorithm_indices=np.arange(4)):
    accs = np.zeros(len(algorithm_indices))
        
    gmm=GMM(min_components=30, max_components=30)
    gmm.fit(X)
    y_induced = gmm.predict(X)
    
    n = len(y)
    
    test_inds = np.where(y == -1)[0]
    
    if 0 in algorithm_indices:
        urf = SSRF(n_trees=100, supervised_max_depth=None, semi_supervised_max_depth=None, tree_split=0)
        urf.fit(X, y, y_induced)
        accs[np.where(algorithm_indices == 0)[0][0]] = np.mean(urf.predict(X[test_inds]) == y[test_inds])

    if 1 in algorithm_indices:
        halfrf = SSRF(n_trees=100, supervised_max_depth=None, semi_supervised_max_depth=None, tree_split=0.5)
        halfrf.fit(X, y, y_induced)
        accs[np.where(algorithm_indices == 1)[0][0]] = np.mean(halfrf.predict(X[test_inds]) == y[test_inds])

    if 2 in algorithm_indices:
        ratiorf = SSRF(n_trees=100, supervised_max_depth=None, semi_supervised_max_depth=None, tree_split=None)
        ratiorf.fit(X, y, y_induced)
        accs[np.where(algorithm_indices == 2)[0][0]] = np.mean(ratiorf.predict(X[test_inds]) == y[test_inds])

    if 3 in algorithm_indices:
        rf = SSRF(n_trees=100, supervised_max_depth=None, semi_supervised_max_depth=None, tree_split=1)
        rf.fit(X, y, y_induced)
        accs[np.where(algorithm_indices == 3)[0][0]] = np.mean(rf.predict(X[test_inds]) == y[test_inds])
    
    if 4 in algorithm_indices:
        randrf = SSRF(n_trees=100, supervised_max_depth=None, semi_supervised_max_depth=None, tree_split=0.5, 
                      induce=True, induce_class=GMM, induce_kwargs={'min_components': 30, 'max_components':30},
                      random_projection=True, projection_kwargs={'n_components': 5})
        randrf.fit(X, y, y_induced=None)
        accs[np.where(algorithm_indices == 4)[0][0]] = np.mean(randrf.predict(X[test_inds]) == y[test_inds])
        
    
    return accs