In [20]:
# Status - As of 4/19/2019 QDA > HHRF > HHKNN > RF > KNN
# Having trouble getting RF to perform well.

from covariates_gclass import *
import seaborn as sns
import networkx as nx
sns.set()
import _pickle as pickle
truth = pickle.load(open('LL1_truth.pkl', 'rb'))
train = pickle.load(open('LL1_train.pkl', 'rb'))
test = pickle.load(open('LL1_test.pkl', 'rb'))

In [12]:
def classify(X, Z, normal_params, fitted_model, m = None):
    """
    Classifies vertices.

    X - n x p; Normally distributed random variables.
    Z - n x d; Not normally distributed random variables.
    fitted_model - A sklearn model that can return posterior estimates.
    m - number of training data used to train fitted_model.
    """

    n, p = X.shape
    m, d = Z.shape
    
    K = len(normal_params)
    
    if n != m:
        raise ValueError('different number of samples for X, Z')
    
    if p == 1:
        norm_pdf = norm.pdf
        X = X.reshape((1, -1))[0]
    else:
        norm_pdf = mvn.pdf
        
    posteriors = fitted_model.predict_proba(Z)
    
    predictions=-1*np.zeros(n)

    for i in range(n):
        print(posteriors[i])
        if m is None:
            smoothed_posterior = posteriors[i]
        else:
            posterior_plus = posteriors[i] + np.ones(K)/m
            smoothed_posterior = posterior_plus / np.sum(posterior_plus)
        temp_pdfs = np.array([norm_pdf(X[i], normal_params[j][0], normal_params[j][1]) for j in range(K)])
        posterior_pdf_prod = temp_pdfs * smoothed_posterior
        predictions[i] = int(np.argmax(posterior_pdf_prod))
        
    return predictions

In [17]:
true_labels = truth['learningData']['classLabel'].values.astype(int)
G = train['0']
n = len(G)

A = nx.to_numpy_array(G)
train_labels = train['learningData']['classLabel'].values.astype(int)
training_idx = train['learningData']['d3mIndex'].values.astype(int)

unique_labels, n_seeds = np.unique(train_labels, return_counts=True)
K = len(unique_labels)
class_train_idx = [np.where(train_labels == i)[0] for i in unique_labels]

train_idx = np.concatenate((class_train_idx)).astype(int)
test_idx = [k for k in range(n) if k not in train_idx]
labels = true_labels[true_labels[test_idx]]

MORE_ATTR = True
attr_number = 1
attrs = []
while MORE_ATTR:
    attr = 'attr'
    temp_attr = list(nx.get_node_attributes(G, 'attr' + str(attr_number)).values())
    if len(temp_attr) == 0:
        MORE_ATTR = False
    else:
        attrs.append(temp_attr)
        attr_number += 1
attrs = np.array(attrs).T
for i in range(attrs.shape[1]):
    attrs[:, i] = attrs[:, i]/max(attrs[:, i])

In [18]:
#- Total number of seeds
m = np.sum(n_seeds)

#- estimate class probabilities
pi_hats = n_seeds / m

ase_obj = ASE(n_elbows=1)
X = ase_obj.fit_transform(A)

Z = attrs

XZ = np.concatenate((X, Z), axis=1)

#- Store mvn samples corresponding to seeds
seeds_norm = X[train_idx]

#- Estimate normal parameters using seeds
mu1, cov1 = estimate_normal_parameters(X[class_train_idx[0]])
params1 = [mu1, cov1]

mu2, cov2 = estimate_normal_parameters(X[class_train_idx[1]])
params2 = [mu2, cov2]

mu3, cov3 = estimate_normal_parameters(X[class_train_idx[2]])
params3 = [mu3, cov3]

#- Convenient way to store
params=[params1, params2, params3]

#- Store uniform samples corresponding to seeds
seeds_beta = Z[train_idx]

#- Using conditional indendence assumption (RF, KNN used for posterior estimates)
# if errors is None:
errors = [[] for i in range(5)]

temp_pred = QDA(X[test_idx], pi_hats, params)
temp_error = 1 - np.sum(temp_pred == true_labels[test_idx])/len(test_idx)
errors[0].append(temp_error)

rf1 = RF(n_estimators=100, max_depth=int(np.round(np.log(seeds_beta.shape[0]/2))))
rf1.fit(seeds_beta, true_labels[train_idx])

knn1 = KNN(n_neighbors=int(np.round(np.log(seeds_beta.shape[0]))))
knn1.fit(seeds_beta, true_labels[train_idx])

smooth = True

if smooth:
    temp_pred = classify(X[test_idx], Z[test_idx], params, rf1, m = m)
    temp_error = 1 - np.sum(temp_pred == true_labels[test_idx])/len(test_idx)
    errors[1].append(temp_error)

    temp_pred = classify(X[test_idx], Z[test_idx], params, knn1, m = m)
    temp_error = 1 - np.sum(temp_pred == true_labels[test_idx])/len(test_idx)
    errors[2].append(temp_error)
else:
    temp_pred = classify(X[test_idx], Z[test_idx], params, rf1)
    temp_error = 1 - np.sum(temp_pred == true_labels[test_idx])/len(test_idx)
    errors[1].append(temp_error)

    temp_pred = classify(X[test_idx], Z[test_idx], params, knn1)
    temp_error = 1 - np.sum(temp_pred == true_labels[test_idx])/len(test_idx)
    errors[2].append(temp_error)

#- Not using conditional independence assumption (RF, KNN used for classification)
XZseeds = np.concatenate((seeds_norm, seeds_beta), axis=1)

rf2 = RF(n_estimators=10, max_depth=int(np.round(np.log(m))))
rf2.fit(XZseeds, true_labels[train_idx])
temp_pred = rf2.predict(XZ[test_idx])
temp_error = 1 - np.sum(temp_pred == true_labels[test_idx])/len(test_idx)
errors[3].append(temp_error)

knn2 = KNN(n_neighbors=int(np.round(np.log(m))))
knn2.fit(XZseeds, true_labels[train_idx])

temp_pred = knn2.predict(XZ[test_idx])
temp_error = 1 - np.sum(temp_pred == true_labels[test_idx])/len(test_idx)
errors[4].append(temp_error)

errors

[0.20192491 0.54788931 0.25018579]
[0.03785714 0.23812404 0.72401881]
[0.12462762 0.25061783 0.62475455]
[0.25798477 0.60726068 0.13475455]
[0.29382831 0.49088231 0.21528938]
[0.03833333 0.18337279 0.77829387]
[0.03833333 0.19337279 0.76829387]
[0.05457207 0.15611508 0.78931285]
[0.21368407 0.59896379 0.18735215]
[0.05607143 0.26473119 0.67919739]
[0.67389835 0.27069689 0.05540476]
[0.18389835 0.63690426 0.17919739]
[0.03785714 0.22812404 0.73401881]
[0.15514835 0.60565426 0.23919739]
[0.24389835 0.62240426 0.13369739]
[0.12272677 0.34708744 0.53018579]
[0.0990252  0.15568543 0.74528938]
[0.28825733 0.58407093 0.12767174]
[0.25531777 0.56449645 0.18018579]
[0.03212762 0.22113889 0.74673349]
[0.03833333 0.19337279 0.76829387]
[0.68416026 0.21019689 0.10564286]
[0.18368407 0.60229712 0.21401881]
[0.0990252  0.15568543 0.74528938]
[0.20472835 0.6171044  0.17816725]
[0.05607143 0.26473119 0.67919739]
[0.2611511  0.57866311 0.16018579]
[0.28531777 0.56116311 0.15351912]
[0.03833333 0.183372

[[0.5375],
 [0.42500000000000004],
 [0.38749999999999996],
 [0.3125],
 [0.17500000000000004]]