# In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

import sys
# cartopy is an optional dependency: Clusterer.plot_clusters uses it for the
# map projection, coastlines and gridlines, and falls back to plain matplotlib
# axes when get_cartopy is False.
try:
    import cartopy.crs as ccrs
except ImportError:
    # Best effort: report the missing module and carry on without map features.
    get_cartopy = False
    print('You have not imported the {} module'.format('cartopy.crs'))
else:
    get_cartopy = True

# NameError: name 'modulename' is not defined  (stale cell output; the cell above no longer references `modulename`)

# In [6]:
class Clusterer:
    """
    Container for methods of selecting physically homogeneous regions of the map to emulate as though they observe the
    same physical processes.    
    """

    def __init__(
        self,
        emulator,
        fetch_parameters_job=None,
        lengthScales=None
    ):
        """
        After having obtained emulators for each point, we use a clustering method in the parameter space to define the
        above-mentioned regions of the map. We assume that the first round of emulator training has been performed and
        therefore expect a table of length scale parameters on which to perform clustering.


        Arguments
        
        emulator : Emulator
            All clustering choices will require a fixed pixelwise emulation whose length scale parameters are assumed to
            have some physically meaningful interpretation.
        fetch_parameters_job : C3 type
            A map-reduce job the completion of which is required to build the "lengthscales" table.


        Value

        None
        """

        self.emulator = emulator
        
        if lengthScales is not None:
            lat1 = emulator.lat1
            lat2 = emulator.lat2
            lon1 = emulator.lon1
            lon2 = emulator.lon2
            
            self.lengthscales = lengthScales.loc[
                (lengthScales.latitude >= lat1) &
                (lengthScales.latitude <= lat2) &
                (lengthScales.longitude >= lon1) &
                (lengthScales.longitude <= lon2)
            ].reset_index(drop=True)

            times = [datetime.strptime(time, '%Y-%m-%dT%H:%M:%S').timetuple() for time in self.lengthscales.time]
            self.lengthscales["hours"] = [
                timedelta(
                    days=time.tm_yday,
                    minutes=time.tm_min,
                    hours=time.tm_hour
                ).total_seconds() / 3600
                for time in times
            ]
        elif fetch_parameters_job is not None:
            self.lengthscales = self.emulator.__retrieve_pixelwise_parameters__(fetch_parameters_job)
        else:
            return
        
        return


    """
    Perform clustering
    """


    def coarsen_grid(
        self,
        radius
    ):
        return
    
    
    def cluster_pixels(
        self,
        clusteringMethod="kmeans", # or "gmm" or "tsne"
        k=5,
        handleSmallClusters=True,
        handleLargeClusters=True
    ):
        """
        Learn clusters in the parameter space and label pixels according to the cluster to which they best belong.


        Arguments

        clusteringMethod : str
            Default is "kmeans." The name of the method with which to perform clustering in the parameter space.
        k : int
            The number of clusters to learn.
        handleSmallClusters : bool
            If true, then reassign members of clusters which occupy less than 10% of all points in the spatiotemporal
            mesh by majority vote of neighboring points. If else, do not reassign members of these small clusters.


        Value

        None
        """
        
        inv_length_scales = self.lengthscales.drop(
            ["modelId", "latitude", "longitude", "time"],
            axis=1
        ).applymap(
            lambda x: 1/x
        )

        if clusteringMethod == "kmeans":
            init_labels, init_centers = self.__k_means__(inv_length_scales, k=k)
        elif clusteringMethod == "gmm":
            init_labels, init_centers = self.__mixture__(inv_length_scales, k=k)

        invert = lambda x: 1/x
        vfunc = np.vectorize(invert)
        init_centers = vfunc(init_centers)
        
        if handleSmallClusters:
            self.labels, self.centers = self.__handle_small_clusters__(
                init_labels,
                init_centers
            )
        if handleLargeClusters:
            self.labels, self.centers = self.__handle_large_clusters__(
                init_labels,
                init_centers
            )
        else:
            self.labels, self.centers = init_labels, init_centers

        self.lengthscales['region'] = self.labels
        self.lengthscales = self.lengthscales.sort_values('region').reset_index(drop=True)
        self.labels = np.array(self.lengthscales['region'])

        return

    
    def parameter_importance(
        self
    ):
        """
        Overview of which parameters are deemed important in each cluster, where smaller lengthscales indicate higher
        importance.


        Arguments

        None


        Value

        list
            List of pandas DataFrames, each of which report the cluster centers in ascending order of length scale.
        """

        import pandas as pd
        
        data = []
        
        for center, which_center in zip(self.centers, range(len(self.centers))):
            param_values = dict(zip([x for x in self.emulator.__get_input_names__() if x in self.emulator.inputs], center))
            param_values = dict(sorted(param_values.items(), key=lambda item: item[1]))
            print("Center of cluster " + str(which_center))
            data.append(pd.DataFrame(
                param_values.items(),
                columns=["name", "value"]))
            print(data[which_center])
            print('\n')
        
        return data
    

    def __k_means__(
        self,
        lengthscales,
        k=5
    ):
        """
        Fit sklearn's KMeans with k clusters on the given table and label every row of that same table.


        Arguments

        lengthscales : pandas DataFrame
            The table of values to cluster, one row per point.
        k : int
            The number of clusters to learn.


        Value

        list
            Two elements: the predicted cluster label for each row, and the fitted cluster centers.
        """
        from sklearn.cluster import KMeans

        model = KMeans(n_clusters=k)
        model.fit(lengthscales)
        assignments = model.predict(lengthscales)

        return [assignments, model.cluster_centers_]
    
    
    def __mixture__(
        self,
        lengthscales,
        k=5
    ):
        """
        Fit sklearn's GaussianMixture with k components on the given table and label every row of that same table.


        Arguments

        lengthscales : pandas DataFrame
            The table of values to cluster, one row per point.
        k : int
            The number of mixture components (clusters) to learn.


        Value

        list
            Two elements: the predicted component label for each row, and the fitted component means.
        """

        from sklearn.mixture import GaussianMixture

        # GaussianMixture names this parameter n_components (not n_clusters, as KMeans does).
        my_gmm = GaussianMixture(n_components=k).fit(lengthscales)

        return [my_gmm.predict(lengthscales), my_gmm.means_]


    def __handle_small_clusters__(
        self,
        init_labels,
        init_centers
    ):
        """
        Reassign members of clusters which occupy less than 10% of all points in the spatiotemporal mesh by majority vote
        of neighboring points.


        Arguments

        init_labels : list
            The labels which were aquired from an above described clustering method without intervening with relatively
            small clusters.
        init_centers : list
            The centers of the clusters which were aquired from an above described clustering method.


        Value

        list
            List of lists. The first is of updated labels after reassigning members of small clusters; the second, of
            updated centers.
        """

        # Check if any clusters are very small
        small_clusters = np.unique(init_labels)[
            np.unique(init_labels, return_counts=True)[1] <= 0.01 * len(init_labels)
        ]
        try_count = 0

        while len(small_clusters) > 0 or try_count < 10:
            # Reassign members of small clusters by vote of neighbors
            for label, row in zip(init_labels, range(len(init_labels))):
                if label in small_clusters:
                    point = self.lengthscales.loc[row, ["latitude", "longitude"]]
                    neighbors = list(set(self.lengthscales[
                        (self.lengthscales.latitude >= point.latitude - 3) &
                        (self.lengthscales.latitude <= point.latitude + 3) &
                        (self.lengthscales.longitude >= point.longitude - 3) &
                        (self.lengthscales.longitude <= point.longitude + 3)
                    ].index) - set([row]))
                    neighbor_labels = list(init_labels[neighbors])
                    init_labels[row] = max(set(neighbor_labels), key=neighbor_labels.count)

            new_labels = pd.Series(init_labels)
            new_labels = np.array(new_labels.map(dict(zip(np.unique(init_labels), range(len(np.unique(init_labels)))))))
            new_centers = init_centers[np.unique(init_labels)]
            
            small_clusters = np.unique(new_labels)[
                np.unique(new_labels, return_counts=True)[1] <= 0.01 * len(new_labels)
            ]
            
            try_count += 1
        
        return new_labels, new_centers


    def __handle_large_clusters__(
        self,
        init_labels,
        init_centers
    ):
        max_label = np.max(np.unique(init_labels))
        
        new_lengthscales_table = self.lengthscales.copy()

        # Reassign labels for large clusters
        for cluster in np.unique(init_labels):
            len_cluster = sum(init_labels==cluster)

            if len_cluster > 50:
                new_labs = list(range(max_label + 1, max_label + int(len_cluster/30) + 1))
                max_label += int(len_cluster/30) + 1

                new_lengthscales_table.loc[init_labels==cluster, "region"] = np.random.choice(
                    new_labs,
                    size=len_cluster,
                    replace=True
                )

            else:
                
                new_lengthscales_table.loc[init_labels==cluster, "region"] = cluster

        return new_lengthscales_table.region, None # Compute centers for regions



    """
    Visualize
    """


    def plot_clusters(
        self,
        folder_title,
        save=False
    ):
        """
        Visualize where geographic clusters are defined, drawing one figure per distinct time in self.lengthscales.
        When cartopy is available the points are drawn on a PlateCarree map with coastlines and gridlines;
        otherwise on plain matplotlib axes. No legend is drawn.


        Arguments

        folder_title : string
            Prefix of the output path. The figure for each time is saved to folder_title + time (no separator is
            inserted).
        save : bool
            If true, save the figure(s). If else, just display -- don't save.


        Value

        None
        """

        # Set colors for all clusters so that they are the same between frames
        levels, categories = pd.factorize(self.labels)

        # The first three clusters use hand-picked Set2 colors; any further cluster falls back to the tab20
        # palette (cycled), so that an arbitrary number of clusters can be drawn.
        preferred_colors = [plt.cm.Set2(0), plt.cm.Set2(6), plt.cm.Set2(4)]
        colors_dict = {
            level: preferred_colors[level] if level < len(preferred_colors) else plt.cm.tab20(level % 20)
            for level in range(len(categories))
        }
        colors = [colors_dict[i] for i in levels]

        df = pd.concat([self.lengthscales.copy(), pd.DataFrame({'colors':colors})], axis=1)

        # Frame-independent settings are built once.
        projection = ccrs.PlateCarree(central_longitude=0) if get_cartopy else None
        BBox = [self.emulator.lon1, self.emulator.lon2, self.emulator.lat1, self.emulator.lat2 - 0.5]

        # Plot each timeframe
        for time in np.unique(df.time):
            my_data = df[np.array(df.time==time)]

            fig = plt.figure(figsize=(10,10), facecolor='yellow')

            if get_cartopy:
                ax = fig.add_subplot(1, 1, 1, projection=projection)
                ax.coastlines()
                ax.set_extent(BBox, ccrs.PlateCarree())
                ax.gridlines(draw_labels=True, crs=projection)

            plt.scatter(
                my_data.longitude,
                my_data.latitude,
                c=my_data.colors,
                marker='s',
                s=150,
                alpha=0.7
            )
            plt.title(time)
            plt.xlabel('longitude')
            plt.ylabel('latitude')
            if save:
                # str() keeps the path concatenation valid when the time values are not strings.
                plt.savefig(folder_title + str(time))
            plt.show()
            # Release the figure so that figures do not pile up across frames.
            plt.close(fig)

        return
    
    
    def plot_length_scales(
        self,
        inputs_of_interest=None
    ):
        """
        Draw, for each requested input, a longitude/latitude scatter plot colored by the natural log of that
        input's length scale.


        Arguments

        inputs_of_interest : list
            Names of the inputs whose length scales to plot. If None, every entry of self.emulator.inputs is
            plotted.


        Value

        None
        """

        import matplotlib.pyplot as plt

        if inputs_of_interest is not None:
            names_to_plot = inputs_of_interest
        else:
            names_to_plot = self.emulator.inputs

        for input_name in names_to_plot:
            figure = plt.figure(figsize=(13, 6))
            plt.title(input_name, fontsize="x-large")

            log_scales = np.log(self.lengthscales[input_name])
            plt.scatter(
                self.lengthscales.longitude,
                self.lengthscales.latitude,
                c=log_scales
            )
            plt.colorbar()

            figure.show()

    
    def clustering_accuracy(
        self,
        inputs_of_interest=None
    ):
        """
        For each requested input, draw one histogram per cluster of the natural log of that input's length scale,
        side by side and with a common upper y-limit, so that the clusters' distributions can be compared.

        Cluster i is taken to be the rows whose label equals i, for i in range(len(self.centers)); self.centers must
        therefore not be None.


        Arguments

        inputs_of_interest : list
            Names of the inputs to plot. If None, every entry of self.emulator.inputs is plotted.


        Value

        None
        """

        import matplotlib.pyplot as plt
        import numpy as np

        n_clusters = len(self.centers)
        cluster_masks = [np.array(self.labels==which_center) for which_center in range(n_clusters)]

        for input_name in inputs_of_interest if inputs_of_interest is not None else self.emulator.inputs:

            # squeeze=False always yields a 2-D array of axes, so a single cluster works like several.
            fig, axs = plt.subplots(1, n_clusters, figsize=(12, 5), squeeze=False)
            axes_row = axs[0]
            fig.tight_layout(h_pad=2)
            st = fig.suptitle(input_name, fontsize="x-large")

            max_y = 0

            for which_center, ax in enumerate(axes_row):
                values = self.lengthscales.loc[cluster_masks[which_center], input_name]
                counts = ax.hist(np.log(values), bins=25)[0]
                ax.set_title("Cluster " + str(which_center))
                max_y = max(max_y, counts.max())

            # Share the tallest bar's height (plus 5% headroom) across the panels.
            for ax in axes_row:
                ax.set_ylim(top=max_y*1.05)

            st.set_y(0.95)
            fig.subplots_adjust(top=0.85)

            plt.show()

        return


    def t_sne_plot(
        self,
        learningRate=100,
        perplexity=30
    ):
        """
        Visualize the well-definedness of the clusters: embed all points (emulator inputs plus latitude and
        longitude) in two dimensions with t-SNE, then draw one scatter plot per distinct time, colored by cluster.


        Arguments

        learningRate : int
            Default is 100. A tuning parameter for the TSNE algorithm, which is optimized using gradient descent.
        perplexity : int
            Default is 30. A tuning parameter for the TSNE algorithm.


        Value

        None
        """
        import numpy as np
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt
        import matplotlib as mpl
        import pandas as pd

        # Set colors for all clusters so that they are the same between frames
        levels, categories = pd.factorize(self.labels)
        colors = [plt.cm.tab20(i) for i in levels]
        handles = [mpl.patches.Patch(color=plt.cm.tab20(i), label=c) for i, c in enumerate(categories)]

        # Embed all points at once so that every frame shares the same 2-D coordinates.
        X = np.array(
            self.lengthscales[list(self.emulator.inputs) + ["latitude", "longitude"]]
        )

        X_embedded = TSNE(n_components=2,
                          learning_rate=learningRate,
                          init='random',
                          perplexity=perplexity).fit_transform(X)

        # Plot each timeframe, in sorted order so that the sequence of figures is reproducible.
        times = self.lengthscales.time.astype(str).to_numpy()
        for time in sorted(set(times)):
            # Row positions of this frame; the colors are picked by position so they stay aligned with the points.
            in_frame = np.flatnonzero(times == time)

            fig = plt.figure(figsize=(10,10))

            plt.scatter(X_embedded[in_frame, 0],
                        X_embedded[in_frame, 1],
                        c=[colors[i] for i in in_frame])
            plt.legend(
                bbox_to_anchor=(1.02, 1),
                loc='upper left',
                handles=handles,
                title='Cluster')
            plt.show()

        return
