In [None]:
import importlib
import numpy as np
from matplotlib import pyplot as plt
import sklearn
from sklearn.mixture import GaussianMixture
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array, check_random_state
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import mahalanobis
import gmm_agent
from sklearn.datasets import make_classification
from random import choices
importlib.reload(gmm_agent)

In [None]:
# function to generate classification problems of varying difficulty
def problem_maker(n_samples=2000, n_classes=2, n_features=2, class_sep=1.0, flip_y=0.00, random_state=8):
    """Build a synthetic classification dataset with one cluster per class.

    Thin wrapper around ``make_classification`` that pins the options this
    notebook never varies: every feature is informative (``n_informative`` is
    set to ``n_features``), there are no redundant or repeated features, each
    class has a single cluster, and ``hypercube`` is enabled.

    Parameters
    ----------
    n_samples, n_classes, n_features, class_sep, flip_y, random_state
        Forwarded unchanged to ``make_classification`` under the same names.

    Returns
    -------
    tuple
        The ``(X, Y)`` pair produced by ``make_classification``.
    """
    # Options fixed for every problem generated in this notebook.
    fixed_options = dict(
        n_redundant=0,
        n_repeated=0,
        n_clusters_per_class=1,
        hypercube=True,
    )
    # Options the caller controls; all features are made informative.
    caller_options = dict(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features,
        n_classes=n_classes,
        class_sep=class_sep,
        flip_y=flip_y,
        random_state=random_state,
    )
    features, labels = make_classification(**caller_options, **fixed_options)
    return features, labels

In [None]:
# Critical number of novel samples needed before the agent's component count
# changes, measured while the novel cluster is shifted further along the first
# feature axis (offset 1 + i for each i in `separations`).
np.random.seed(1)
critical_samples = []  # one entry per separation; -1 marks "not enough detections"
separations = [1, 2, 3, 4, 5, 6, 7, 8, 9]
num_averaging_runs = 10      # successful detections to average per separation
max_total_runs = 100         # give up on a separation after this many attempts
starting_num_components = 2
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4)
fig.suptitle('Class separation')
axis = [ax1, ax2, ax3, ax4]
axis_index = 0  # next free panel; only the first four successful separations are drawn
cov = np.array([[0.0, -0.4], [1.7, 0.4]])  # linear map applied to the raw normal draws
for i in separations:
    print("processing separation " + str(i))
    X, Y = problem_maker(class_sep=1, n_classes=2, random_state=1)
    X_novel = np.random.normal(loc=[0.0, -1], scale=0.4, size=(500, 2))
    X_novel = np.dot(X_novel, cov)
    X_novel = X_novel + np.array([1+i, 0])

    # Find the critical number of samples many times, using different draws
    # from the novel distribution, and average the results.
    critical_sample_list = []
    averaging_run = 0   # runs in which the component count changed
    failure_count = 0   # runs in which the component count never changed
    total_runs = 0
    while averaging_run < num_averaging_runs and total_runs < max_total_runs:
        a = gmm_agent.Agent(train_data=X, num_components=starting_num_components, recluster_limit=1)

        # Count the samples actually shown to the agent. Starting at 0 makes the
        # recorded value the true number of classify() calls (starting at 1
        # over-reported every run by one).
        samples_shown = 0
        while a.num_components == starting_num_components and samples_shown < len(X_novel):
            # Randomly select each novel sample with replacement. Passing the
            # integer length draws the same index as passing range(length),
            # without rebuilding an array on every draw.
            a.classify(X_novel[np.random.choice(len(X_novel))])
            samples_shown += 1

        total_runs += 1
        # Decide success from the agent's state rather than from the counter, so
        # that a detection on the very last allowed sample is not misread as a
        # failure.
        if a.num_components != starting_num_components:
            critical_sample_list.append(samples_shown)
            averaging_run += 1
        else:
            failure_count += 1

    # Test the success count directly: `total_runs < max_total_runs` would
    # discard a separation whose final required success arrived on the last
    # permitted run.
    if averaging_run >= num_averaging_runs:
        critical_sample_value = np.mean(critical_sample_list)

        print("For class_sep=" + str(i) + ", critical sample for detection: " + str(critical_sample_value))
        print("Failure rate: " + str(failure_count / total_runs))
        critical_samples.append(critical_sample_value)
        print("class_sep: " + str(i))
        if axis_index < 4:
            axis[axis_index].plot(X[:,0], X[:,1], ls='none', marker='.', color='blue')
            axis[axis_index].plot(X_novel[:,0], X_novel[:,1], ls='none', marker='.', color='green')
            axis_index += 1
    else:
        print()
        print("*** " + str(max_total_runs) + " runs complete without reaching " + str(num_averaging_runs) + " successes  ***")
        print()
        # Keep critical_samples aligned with separations (same sentinel as the
        # other experiment cells) so a summary plot over both lists still works.
        critical_samples.append(-1)

# Optional summary plot, left disabled:
# plt.plot(separations, critical_samples, ls='none', marker='o')
# plt.title("Class separation vs. critical number of samples")
# plt.xlabel("Class Separation")
# plt.ylabel("Critical number of samples")
# plt.savefig("img_critcal_sample_vs_class_separation", format='pdf')

# plt.show() rather than fig.show(): Figure.show() only works with a GUI
# backend and emits a warning under the notebook's inline backend.
plt.show()

In [None]:
# Critical number of novel samples needed before the agent's component count
# changes, measured while the spread of the novel cluster grows. The value
# passed to np.random.normal is scale = 0.1 * i for each i in `separations`.
np.random.seed(1)
critical_samples = []  # one entry per scale; -1 marks "not enough detections"
separations = [2, 3, 4, 5, 6, 7, 8]  # multipliers of 0.1 used as the normal `scale`
num_averaging_runs = 25      # successful detections to average per scale
max_total_runs = 100         # give up on a scale after this many attempts
starting_num_components = 2
cov = np.array([[0.0, -0.4], [1.7, 0.4]])  # linear map applied to the raw normal draws
for i in separations:
    print()
    X, Y = problem_maker(class_sep=1, n_classes=2, random_state=1)
    X_novel = np.random.normal(loc=[0.0, -1], scale=0.1*i, size=(500, 2))
    X_novel = np.dot(X_novel, cov)
    X_novel = X_novel + np.array([2,0])

    # Find the critical number of samples many times, using different draws
    # from the novel distribution, and average the results.
    critical_sample_list = []
    averaging_run = 0   # runs in which the component count changed
    failure_count = 0   # runs in which the component count never changed
    total_runs = 0
    while averaging_run < num_averaging_runs and total_runs < max_total_runs:
        a = gmm_agent.Agent(train_data=X, num_components=starting_num_components, recluster_limit=1)

        # Count the samples actually shown to the agent. Starting at 0 makes the
        # recorded value the true number of classify() calls (starting at 1
        # over-reported every run by one).
        samples_shown = 0
        while a.num_components == starting_num_components and samples_shown < len(X_novel):
            # Randomly select each novel sample with replacement. Passing the
            # integer length draws the same index as passing range(length),
            # without rebuilding an array on every draw.
            a.classify(X_novel[np.random.choice(len(X_novel))])
            samples_shown += 1

        total_runs += 1
        # Decide success from the agent's state rather than from the counter, so
        # that a detection on the very last allowed sample is not misread as a
        # failure.
        if a.num_components != starting_num_components:
            critical_sample_list.append(samples_shown)
            averaging_run += 1
        else:
            failure_count += 1

    # Test the success count directly: `total_runs < max_total_runs` would
    # discard a scale whose final required success arrived on the last
    # permitted run.
    if averaging_run >= num_averaging_runs:
        critical_sample_value = np.mean(critical_sample_list)

        print("For scale=" + str(0.1*i) + ", critical sample for detection: " + str(critical_sample_value))
        print("Failure rate: " + str(failure_count / total_runs))
        critical_samples.append(critical_sample_value)
        plt.plot(X[:,0], X[:,1], ls='none', marker='.', color='blue')
        plt.plot(X_novel[:,0], X_novel[:,1], ls='none', marker='.', color='green')
        plt.show()
    else:
        print()
        print("*** " + str(max_total_runs) + " runs complete without reaching " + str(num_averaging_runs) + " successes  ***")
        print()
        critical_samples.append(-1) # so the plot will still work, but will be obviously weird

# Summary: one point per scale. The x values are the `scale` (standard
# deviation) handed to np.random.normal, so the axis is labelled "scale".
plt.plot([0.1*i for i in separations], critical_samples, ls='none', marker='o')
plt.title("Critical number of samples vs. variance")
plt.xlabel("scale")
plt.ylabel("critical number of samples")
# plt.savefig("critical_number_vs_scale.pdf", format='pdf')
plt.show()

In [None]:
# Detecting novel distributions in "open space" (novel cluster shifted by 12
# along the first feature axis) for different values of the normal `scale`.
np.random.seed(1)
critical_samples = []  # one entry per scale; -1 marks "not enough detections"
scales = [0.1, 0.5, 1, 2, 3, 4, 5]
num_averaging_runs = 25      # successful detections to average per scale
max_total_runs = 100         # give up on a scale after this many attempts
starting_num_components = 2
cov = np.array([[0.0, -0.4], [1.7, 0.4]])  # linear map applied to the raw normal draws
for i in scales:
    X, Y = problem_maker(class_sep=1, n_classes=2, random_state=1)
    X_novel = np.random.normal(loc=[0.0, -1], scale=i, size=(500, 2))
    X_novel = np.dot(X_novel, cov)
    X_novel = X_novel + np.array([12, 0])

    # Find the critical number of samples many times, using different draws
    # from the novel distribution, and average the results.
    critical_sample_list = []
    averaging_run = 0   # runs in which the component count changed
    failure_count = 0   # runs in which the component count never changed
    total_runs = 0
    while averaging_run < num_averaging_runs and total_runs < max_total_runs:
        a = gmm_agent.Agent(train_data=X, num_components=starting_num_components, recluster_limit=1)

        # Count the samples actually shown to the agent. Starting at 0 makes the
        # recorded value the true number of classify() calls (starting at 1
        # over-reported every run by one).
        samples_shown = 0
        while a.num_components == starting_num_components and samples_shown < len(X_novel):
            # Randomly select each novel sample with replacement. Passing the
            # integer length draws the same index as passing range(length),
            # without rebuilding an array on every draw.
            a.classify(X_novel[np.random.choice(len(X_novel))])
            samples_shown += 1

        total_runs += 1
        # Decide success from the agent's state rather than from the counter, so
        # that a detection on the very last allowed sample is not misread as a
        # failure.
        if a.num_components != starting_num_components:
            critical_sample_list.append(samples_shown)
            averaging_run += 1
        else:
            failure_count += 1

    # Test the success count directly: `total_runs < max_total_runs` would
    # discard a scale whose final required success arrived on the last
    # permitted run.
    if averaging_run >= num_averaging_runs:
        critical_sample_value = np.mean(critical_sample_list)

        print("For scale=" + str(i) + ", critical sample for detection: " + str(critical_sample_value))
        print("Failure rate: " + str(failure_count / total_runs))
        critical_samples.append(critical_sample_value)
        plt.plot(X[:,0], X[:,1], ls='none', marker='.', color='blue')
        plt.plot(X_novel[:,0], X_novel[:,1], ls='none', marker='.', color='green')
        plt.show()
    else:
        print()
        print("*** " + str(max_total_runs) + " runs complete without reaching " + str(num_averaging_runs) + " successes  ***")
        print()
        critical_samples.append(-1) # so the plot will still work, but will be obviously weird

# Summary: one point per scale.
plt.plot(scales, critical_samples, ls='none', marker='o')
plt.title("Critical number of samples vs. variance - open space")
plt.xlabel("scale")
plt.ylabel("critical number of samples")
# plt.savefig("critical_number_variance_open.pdf", format='pdf')
plt.show()