In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, HDBSCAN
from umap import UMAP
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K
from tensorflow.keras.losses import KLDivergence
import random
import os
import time


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
%matplotlib inline
plt.figure(figsize=(10,6))


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [4]:
def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax=None):
    """Draw 2D points coloured by cluster label, with noise points marked separately.

    Parameters
    ----------
    X : array-like
        Point coordinates; column 0 is used as x and column 1 as y.
    labels : array-like or None
        One label per point. The label ``-1`` is drawn as a black "x" and is not
        counted as a cluster. ``None`` puts every point in a single cluster.
    probabilities : array-like or None
        One value per point; non-noise markers are drawn with size
        ``1 + 5 * probability``. ``None`` is treated as all ones.
    parameters : dict or None
        Optional ``name -> value`` pairs appended to the title as ``name=value``.
    ground_truth : bool
        Selects the title prefix: "True" when set, otherwise "Estimated".
    ax : matplotlib Axes or None
        Axes to draw on. A new 10x4 figure is created when omitted.
    """
    # Coerce to arrays so lists, Series and DataFrames are indexed positionally
    # and the element-wise comparison `labels == k` below always yields a mask.
    X = np.asarray(X)
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 4))
    n_points = X.shape[0]
    labels = np.ones(n_points) if labels is None else np.asarray(labels)
    probabilities = np.ones(n_points) if probabilities is None else np.asarray(probabilities)

    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        is_noise = k == -1
        if is_noise:
            # Noise is always black, regardless of its slot in the colour map.
            col = [0, 0, 0, 1]
        class_index = (labels == k).nonzero()[0]
        # Points are drawn one at a time because each marker has its own size.
        for ci in class_index:
            ax.plot(
                X[ci, 0],
                X[ci, 1],
                "x" if is_noise else "o",
                markerfacecolor=tuple(col),
                markeredgecolor="k",
                markersize=4 if is_noise else 1 + 5 * probabilities[ci],
            )

    n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)
    preamble = "True" if ground_truth else "Estimated"
    title = f"{preamble} number of clusters: {n_clusters_}"
    if parameters is not None:
        parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items())
        title += f" | {parameters_str}"
    ax.set_title(title)
    # Lay out the figure that owns `ax`; plt.tight_layout() would act on whatever
    # figure is current, which may be a different one when `ax` is passed in.
    ax.figure.tight_layout()


In [None]:
# Prepare data: load the cleaned table and build the numeric feature frame
# that the clustering functions consume.

# Columns kept for clustering.
features = [
    "beam_id",
    "ra_hours",
    "dec_degrees",
    "signal_frequency",
    "signal_beam",
    "signal_drift_rate",
    "signal_snr",
    "signal_power",
    "signal_incoherent_power",
]

# Do not truncate columns when a frame is printed.
pd.set_option("display.max_columns", None)

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("../data/clean_df.pkl")
print(df.head(50))

# Drop the rows whose source_name is 'Incoherent'.
df_without_incoherent = df.loc[df["source_name"] != "Incoherent"]

# Keep numeric columns only and replace missing values with 0.
df_numeric = (
    df_without_incoherent
    .select_dtypes(include=[np.number])
    .fillna(0)
)

# Restrict to the clustering features, in the order listed above.
filtered_column_df = df_numeric.loc[:, features]

                                              file_uri  observation_id  \
0    /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
5    /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
6    /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
7    /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
8    /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
11   /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
12   /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
13   /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
14   /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
15   /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
16   /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
17   /mnt/cosmic-storage-1/data3/vlass_target/VLASS...           33232   
23   /mnt/cosmic-storage-1/data3/vlass

In [29]:
def cluster(start_index, end_index, dataframe, seed, graph=True):
    """Cluster a slice of ``dataframe`` with an autoencoder + HDBSCAN and count outliers.

    The slice is min-max scaled, compressed to 4 dimensions by a small dense
    autoencoder trained for 100 epochs, and the encoded points are clustered
    with HDBSCAN (``min_cluster_size=10``).

    Parameters
    ----------
    start_index, end_index :
        Bounds of the slice ``dataframe[start_index:end_index]`` to cluster.
    dataframe :
        Feature table; the slice is passed directly to ``MinMaxScaler``.
    seed : int
        Seed applied to ``random``, NumPy, TensorFlow and UMAP.
    graph : bool, default True
        When truthy, project the encoded points to 2D with UMAP and plot them.

    Returns
    -------
    int
        Number of points HDBSCAN labelled ``-1`` (noise).
    """
    # set random seeds for consistency
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
    # here can only influence processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    K.clear_session()

    # take a slice of the original dataframe for clustering (3000 recommended)
    df_subset = dataframe[start_index:end_index]

    # scale the subset dataframe
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(df_subset)

    # Build autoencoder
    input_dim = X_scaled.shape[1]
    encoding_dim = 4

    input_layer = layers.Input(shape=(input_dim,))
    encoded = layers.Dense(16, activation='relu')(input_layer)
    encoded = layers.Dense(encoding_dim, activation='relu')(encoded)
    decoded = layers.Dense(16, activation='relu')(encoded)
    # Sigmoid output matches the [0, 1] range produced by MinMaxScaler.
    decoded = layers.Dense(input_dim, activation='sigmoid')(decoded)

    autoencoder = models.Model(inputs=input_layer, outputs=decoded)
    # The encoder shares its layers with the autoencoder, so it is trained by the fit below.
    encoder = models.Model(inputs=input_layer, outputs=encoded)

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=32, shuffle=True, verbose=0)

    # encode the dataframe subset (silent, like the training call above)
    X_encoded = encoder.predict(X_scaled, verbose=0)

    clusterer = HDBSCAN(
        min_cluster_size=10,
        n_jobs=-1,
    )

    # Fit HDBSCAN
    clusterer.fit(X_encoded)
    labels = clusterer.labels_
    probabilities = getattr(clusterer, "probabilities_", None)
    # Count the noise mask directly. Counting the non-zero entries of
    # np.where(...) counts non-zero *indices* and misses an outlier at row 0.
    num_outliers = np.count_nonzero(labels == -1)

    if graph:
        # Project to 2D for visualization
        X_2d = UMAP(n_components=2, random_state=seed).fit_transform(X_encoded)

        # Plot clusters
        plot(X_2d, labels, probabilities=probabilities, parameters={"min_cluster_size":10, "num_outliers":num_outliers})
        plt.show()

    return num_outliers



In [37]:
def cluster_with_idec(start_index, end_index, dataframe, seed, graph=True, n_clusters=10, encoding_dim=4, update_interval=5):
    """Cluster a slice of ``dataframe`` after jointly training an autoencoder and a clustering head.

    Steps:
      1. Min-max scale ``dataframe[start_index:end_index]``.
      2. Pre-train a dense autoencoder (50 epochs, MSE loss).
      3. Run k-means on the bottleneck features and use its centres to
         initialise a softmax "clustering" layer attached to the bottleneck.
      4. Train both outputs together for 1500 mini-batch steps (KL divergence on
         the cluster assignments weighted 0.1, reconstruction MSE weighted 1),
         recomputing the target distribution every ``update_interval`` steps.
      5. Cluster the final bottleneck features with HDBSCAN (``min_cluster_size=10``).

    Parameters
    ----------
    start_index, end_index :
        Bounds of the slice to cluster.
    dataframe :
        Feature table; the slice is passed directly to ``MinMaxScaler``.
    seed : int
        Seed applied to ``random``, NumPy, TensorFlow, k-means and UMAP.
    graph : bool, default True
        When truthy, project the encoded points to 2D with UMAP and plot them.
    n_clusters : int, default 10
        Number of k-means clusters / width of the clustering layer.
    encoding_dim : int, default 4
        Width of the bottleneck layer.
    update_interval : int, default 5
        Number of training steps between refreshes of the target distribution.

    Returns
    -------
    int
        Number of points HDBSCAN labelled ``-1`` (noise).
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
    # here can only influence processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    K.clear_session()

    df_subset = dataframe[start_index:end_index]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(df_subset)
    n_samples, input_dim = X_scaled.shape

    input_layer = layers.Input(shape=(input_dim,))
    encoded = layers.Dense(16, activation='relu')(input_layer)
    bottleneck = layers.Dense(encoding_dim, activation='relu', name='bottleneck')(encoded)
    decoded = layers.Dense(16, activation='relu')(bottleneck)
    decoded = layers.Dense(input_dim, activation='sigmoid')(decoded)

    autoencoder = models.Model(inputs=input_layer, outputs=decoded)
    encoder = models.Model(inputs=input_layer, outputs=bottleneck)

    # NOTE(review): the clustering head is a softmax Dense layer seeded with the
    # k-means centres, not a distance-based soft assignment - confirm this is intended.
    clustering_layer = layers.Dense(n_clusters, activation='softmax', name='clustering')
    clustering_output = clustering_layer(bottleneck)

    # All three models share the same layers, so training one updates the others.
    idec_model = models.Model(inputs=input_layer, outputs=[clustering_output, decoded])

    # Stage 1: pre-train the autoencoder on reconstruction only.
    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

    # Stage 2: initialise the clustering layer from k-means on the bottleneck features.
    # (Named so it does not shadow the notebook-level `features` column list.)
    bottleneck_features = encoder.predict(X_scaled, verbose=0)
    kmeans = KMeans(n_clusters=n_clusters, n_init=20, random_state=seed)
    kmeans.fit(bottleneck_features)
    # Dense kernels have shape (inputs, units), hence the transpose of the centres.
    clustering_layer.set_weights([kmeans.cluster_centers_.T, np.zeros(n_clusters)])

    idec_model.compile(
        loss=[KLDivergence(), 'mse'],
        loss_weights=[0.1, 1],
        optimizer='adam'
    )

    # Sampling without replacement cannot draw more rows than exist, so cap the
    # batch size for slices shorter than 128 rows instead of raising ValueError.
    batch_size = min(128, n_samples)
    maxiter = 1500
    index_array = np.arange(n_samples)

    # Stage 3: joint training on random mini-batches.
    for ite in range(maxiter):
        if ite % update_interval == 0:
            # Refresh the target distribution from the current soft assignments q:
            # square them, divide by each cluster's column total, then
            # renormalise every row to sum to 1.
            q, _ = idec_model.predict(X_scaled, verbose=0)
            weight = q ** 2 / q.sum(0)
            p = (weight.T / weight.sum(1)).T

        idx = np.random.choice(index_array, batch_size, replace=False)
        X_batch = X_scaled[idx]
        p_batch = p[idx]
        idec_model.train_on_batch(X_batch, [p_batch, X_batch])

    # Stage 4: density-based clustering of the refined embedding.
    X_encoded = encoder.predict(X_scaled, verbose=0)
    clusterer = HDBSCAN(min_cluster_size=10, n_jobs=-1)
    clusterer.fit(X_encoded)

    labels = clusterer.labels_
    probabilities = getattr(clusterer, "probabilities_", None)
    num_outliers = np.count_nonzero(labels == -1)

    if graph:
        X_2d = UMAP(n_components=2, random_state=seed).fit_transform(X_encoded)

        plot(X_2d, labels, probabilities=probabilities, parameters={
            "min_cluster_size": 10,
            "num_outliers": num_outliers
        })
        plt.show()

    return num_outliers


# Tracking clustering results (number of outliers per batch) by random seed, for points 0 - 21000 in batches of 3000
Seed = 100:
 - 356
 - 248
 - 566
 - 3
 - 577
 - 1120
 - 1

Seed = 42:
 - 481
 - 280
 - 486
 - 8
 - 412
 - 0
 - 552


In [38]:
# Compare clustering the first 30000 rows in 3000-row batches (10 runs)
# against clustering all 30000 rows in a single run.
N_ROWS_TOTAL = 30000
N_ROWS_PER_BATCH = 3000
COMPARISON_SEED = 50

# Batched run: accumulate the outlier count over every batch.
start_time = time.perf_counter()
three_thousand_data_outliers = 0
for i in range(0, N_ROWS_TOTAL, N_ROWS_PER_BATCH):
    print(f'Clustering rows {i} to {i + N_ROWS_PER_BATCH}...')
    three_thousand_data_outliers += cluster_with_idec(
        i, i + N_ROWS_PER_BATCH, filtered_column_df, COMPARISON_SEED, graph=False
    )
    print('Finished')
end_time = time.perf_counter()
three_thousand_time = end_time - start_time

# Single run over the same rows.
start_time = time.perf_counter()
print(f'Clustering rows 0 to {N_ROWS_TOTAL}...')
thirty_thousand_rows_outliers = cluster_with_idec(0, N_ROWS_TOTAL, filtered_column_df, COMPARISON_SEED, graph=False)
print('Finished')
end_time = time.perf_counter()
thirty_thousand_time = end_time - start_time

print(f'{N_ROWS_TOTAL} points clustered {N_ROWS_PER_BATCH} points at a time with idec number of outliers: {three_thousand_data_outliers} -> took {three_thousand_time} seconds')
print(f'{N_ROWS_TOTAL} points clustered together with idec number of outliers: {thirty_thousand_rows_outliers} -> took {thirty_thousand_time} seconds')

# Earlier experiments, kept for reference (disabled):
#
# # testing how random seeds affect clustering performance
# print("Same data different random seed (seed=100 -> seed=42)")
# cluster(15000, 18000, filtered_column_df, 100)
# cluster(15000, 18000, filtered_column_df, 42)
#
# # testing the idec autoencoder
# print("Same data different random seed with idec (seed=100 -> seed=42)")
# cluster_with_idec(0, 30000, filtered_column_df, 100)
# cluster_with_idec(0, 30000, filtered_column_df, 42)

Clustering rows 0 to 3000...
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 993us/step
Finished
Clustering rows 3000 to 6000...
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 902us/step
Finished
Clustering rows 6000 to 9000...
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 906us/step
Finished
Clustering rows 9000 to 12000...
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 966us/step
Finished
Clustering rows 12000 to 15000...
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 996us/step
Finished
Clustering rows 15000 to 18000...
[1m94/94[0m [3