In [1]:
%matplotlib qt
import numpy as np
import matplotlib.pyplot as plt

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import LassoSelector
from matplotlib.path import Path
from collections import Counter

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import LassoSelector
from matplotlib.path import Path
from collections import Counter

def colors_from_lbs(lbs, colors=None):
    """Map integer labels to palette entries, cycling through the palette.

    Parameters
    ----------
    lbs : array-like of int
        Labels to colour. Each label is reduced modulo the palette length, so
        labels past the end wrap around and ``-1`` maps to the last entry.
    colors : sequence, optional
        Palette to pick from. When omitted, a built-in list of 20 hex colours
        is used.

    Returns
    -------
    numpy.ndarray
        One palette entry per label. Empty ``lbs`` yields an empty result.
    """
    mpl_20 = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
              '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
              '#3397dc', '#ff993e', '#3fca3f', '#df5152', '#a985ca',
              '#ad7165', '#e992ce', '#999999', '#dbdc3c', '#35d8e9']

    palette = np.array(mpl_20 if colors is None else colors)

    labels = np.asarray(lbs)
    if labels.size == 0:
        # An empty list converts to a float64 array, which NumPy refuses to
        # use as an index; force an integer dtype so empty in -> empty out.
        labels = labels.astype(np.intp)

    return palette[labels % len(palette)]

def _update_histogram(ax, features, feature_name):
    ax.clear()
    feature_counts = Counter(features)
    sorted_counts = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)
    ax.bar([item[0] for item in sorted_counts], [item[1] for item in sorted_counts], color='red')
    ax.set_ylabel('Count')
    ax.set_title(f'Histogram of {feature_name}', fontsize=8)
    ax.tick_params(axis='x', labelrotation=45, labelsize=8)

def _update_histograms(axs, features, feature_names):
    """Refresh one histogram per feature column, then re-pack the layout.

    ``features`` is transposed so that each of its columns is paired, in order,
    with an axes from ``axs`` and a title from ``feature_names``. Pairing stops
    at the shortest of the three sequences.
    """
    columns = features.T
    for panel, column, title in zip(axs, columns, feature_names):
        _update_histogram(panel, column, title)
    # Applies to pyplot's current figure.
    plt.tight_layout()

class InteractiveCluster:
    """Manual cluster labelling with a lasso on a scatter plot.

    The first axes of ``fig`` shows the points; the remaining axes show one
    value-count histogram per feature column. Drawing a lasso selects points
    and refreshes the histograms for that selection; pressing Enter assigns the
    selected points the next cluster id in ``self.lbs``.

    Attributes set by the constructor:
        lbs_          -- labels used only to colour the scatter plot.
        lbs           -- cluster id per point assigned via Enter, -1 = unassigned.
        ind           -- indices of the currently selected points.
        pts_selected  -- coordinates of the last non-empty selection.
        num_clusters  -- number of clusters confirmed so far.
    """

    def __init__(self, fig, pts, ps, features, feature_names, lbs=None, **kwargs):
        """Draw the scatter plot and initial histograms and hook up the events.

        Parameters
        ----------
        fig : matplotlib figure
            Must already contain the axes: ``fig.axes[0]`` for the scatter plot
            and ``fig.axes[1:]`` for the histograms.
        pts : array, indexed as ``pts[:, 0]`` / ``pts[:, 1]``
            Point coordinates.
        ps
            Stored on the instance as ``self.ps``; not read by this class.
        features : 2-D array
            One row per point, one column per feature.
        feature_names : sequence of str
            Histogram titles, one per feature column.
        lbs : array-like of int, optional
            Labels used to colour the points. Defaults to all zeros.
        **kwargs
            Forwarded to ``Axes.scatter``.
        """
        self.fig = fig
        self.ax_cluster = fig.axes[0]
        self.ax_histograms = fig.axes[1:]

        if lbs is None:
            # Explicit int dtype: with zero points, np.array([]) would be
            # float64 and could not be used to index the colour palette.
            self.lbs_ = np.zeros(len(pts), dtype=int)
        else:
            self.lbs_ = lbs
        self.colors = colors_from_lbs(self.lbs_)

        self.path_collection = self.ax_cluster.scatter(pts[:, 0], pts[:, 1], c=self.colors, **kwargs)
        self.ax_cluster.axis('equal')

        self.pts = pts
        self.ps = ps
        self.features = features
        self.feature_names = feature_names

        self.ind = np.arange(len(pts))  # Initially, all points are selected
        self.pts_selected = self.pts

        # -1 marks a point that has not been assigned to any cluster yet.
        self.lbs = np.full(len(pts), -1, dtype=int)

        self.num_clusters = 0

        self.lasso = LassoSelector(self.ax_cluster, onselect=self.onselect)
        self.press = self.fig.canvas.mpl_connect("key_press_event", self.press_key)

        # Initialize histograms for all data points
        self.plot_initial_histograms()

    def plot_initial_histograms(self):
        """Draw every feature histogram using all data points."""
        for i, feature_name in enumerate(self.feature_names):
            _update_histogram(self.ax_histograms[i], self.features[:, i], feature_name)

    def onselect(self, verts):
        """Lasso callback: select the points inside the drawn polygon.

        ``self.ind`` is always replaced by the new selection, even when it is
        empty. The histograms and ``self.pts_selected`` are refreshed only for
        a non-empty selection.
        """
        path = Path(verts)
        self.ind = np.nonzero(path.contains_points(self.pts))[0]
        if self.ind.size != 0:
            self.pts_selected = self.pts[self.ind]

            # get the histograms of features
            selected_features = self.features[self.ind]
            _update_histograms(self.ax_histograms, selected_features, self.feature_names)
            self.fig.canvas.draw_idle()

    def press_key(self, event):
        """Key callback: on Enter, store the current selection as a new cluster."""
        # Test the number of selected indices, not their truthiness:
        # ``ind.any()`` is False when the selection is exactly point 0, which
        # made that selection impossible to label.
        if event.key == "enter" and self.ind.size:
            self.lbs[self.ind] = self.num_clusters
            self.num_clusters += 1
            print("One cluster has been selected.")

def interactive_clusters(pts, ps, features, feature_names, lbs=None, **kwargs):
    """Build the figure for interactive clustering and start the application.

    The figure holds one scatter panel followed by one histogram panel per
    feature column. With more than two features the panels are arranged in a
    two-column grid on an 8x8 figure; otherwise they sit side by side in a
    single row, 6 inches per panel.

    Parameters
    ----------
    pts, ps, features, feature_names, lbs, **kwargs
        Passed through unchanged to ``InteractiveCluster``. ``features`` must
        be 2-D; ``features.shape[1]`` determines the number of histograms.

    Returns
    -------
    InteractiveCluster
        The application object. Keep a reference to it so the lasso and key
        callbacks stay alive.
    """
    num_features = features.shape[1]
    num_panels = num_features + 1  # scatter plot + one histogram per feature

    if num_features > 2:
        # Two columns, with just enough rows to hold every panel.
        n_rows, n_cols = (num_panels + 1) // 2, 2
        figsize = (8, 8)
    else:
        n_rows, n_cols = 1, num_panels
        figsize = (6 * num_panels, 6)

    # squeeze=False always returns a 2-D array of axes, so both layouts share
    # one code path (including the single-panel case).
    fig, ax = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    panels = ax.ravel()

    # An odd panel count on the two-column grid leaves one unused cell; drop it
    # so no blank axes is shown and fig.axes[1:] holds only histogram axes.
    for spare in panels[num_panels:]:
        fig.delaxes(spare)

    for hist_ax, column, feature_name in zip(panels[1:num_panels], features.T, feature_names):
        _update_histogram(hist_ax, column, feature_name)

    app = InteractiveCluster(fig, pts, ps, features, feature_names, lbs, **kwargs)
    return app


In [None]:
# Seeded generator so the demo data (and therefore the figure) is reproducible
# across kernel restarts.
rng = np.random.default_rng(0)
n_points = 1000

# generate random points
pts = rng.random((n_points, 2))

# generate random patches
ps = rng.random((n_points, 45, 45))

# generate random non-numeric features
md = np.column_stack([
    rng.choice(['10', '20', '30'], size=n_points),
    rng.choice(['M', 'F', 'NB'], size=n_points),
    rng.choice(['A', 'B'], size=n_points),
])
md_names = ['Age', 'Sex', 'Stage']

# Create the interactive clusters application
app = interactive_clusters(pts, ps, md, md_names)
plt.show()

In [31]:
# Print the number of feature columns in `md`.
print(md.shape[1])

3


In [26]:
import pandas as pd

# Build a 1000-row trial table: categorical Sex and Stage, integer Age in
# [0, 50), and uniform X / Y in [0, 1). The dict entries are evaluated in the
# order Sex, Stage, Age, X, Y; `columns=` then fixes the column order.
Data = pd.DataFrame(
    {
        'Sex': np.random.choice(['M', 'F', 'NB'], size=(1000,)),
        'Stage': np.random.choice(['A', 'B'], size=(1000,)),
        'Age': np.random.randint(0, 50, size=1000),
        'X': np.random.uniform(0, 1, size=1000),
        'Y': np.random.uniform(0, 1, size=1000),
    },
    columns=['Age', 'Sex', 'Stage', 'X', 'Y'],
)

# NOTE(review): absolute, user-specific path; a later cell reads the same
# literal path, so both must be changed together.
Data.to_csv('/Users/IndrajitWadgaonkar/Desktop/TrialData.csv', index=False)

In [38]:
# This is the plotter class
import pandas as pd
class InteractiveCluster:
    """Lasso-based manual cluster labelling driven by a CSV of projected data.

    The CSV must contain ``X`` and ``Y`` columns, which are used as the 2-D
    point coordinates. Every other column is treated as a feature and gets its
    own value-count histogram. The class creates its own figure: the first
    panel is the scatter plot, the following panels are the histograms.

    Drawing a lasso selects points and refreshes the histograms for that
    selection; pressing Enter assigns the selected points the next cluster id
    in ``self.lbs`` (-1 means unassigned).
    """

    def __init__(self, data, projectedDataPath, colors=None, lbs=None, **kwargs):
        """Load the CSV, build the figure and connect the interaction callbacks.

        Parameters
        ----------
        data
            Not used by this class; kept so the constructor signature is
            unchanged.
        projectedDataPath : str or path-like
            CSV file read with ``pandas.read_csv``; needs ``X`` and ``Y`` columns.
        colors : sequence, optional
            Palette for the scatter plot. Defaults to a built-in list of 20
            hex colours.
        lbs : array-like of int, optional
            Labels used to colour the points (wrapped modulo the palette
            length). Defaults to all zeros.
        **kwargs
            Forwarded to ``Axes.scatter``.
        """
        mpl_20 = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
              '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
              '#3397dc', '#ff993e', '#3fca3f', '#df5152', '#a985ca',
              '#ad7165', '#e992ce', '#999999', '#dbdc3c', '#35d8e9']

        self.projData = pd.read_csv(projectedDataPath)
        n_points = len(self.projData)

        palette = np.array(mpl_20 if colors is None else colors)
        if lbs is None:
            colour_labels = np.zeros(n_points, dtype=int)
        else:
            colour_labels = np.asarray(lbs)
        # Wrap every label so ids beyond the palette length still get a colour.
        self.colors = palette[colour_labels % len(palette)]

        # Coordinates come from X/Y; all remaining columns are features.
        coord_columns = ['X', 'Y']
        self.pts = self.projData[coord_columns].to_numpy()
        self.feature_names = [name for name in self.projData.columns
                              if name not in coord_columns]
        self.features = self.projData[self.feature_names].to_numpy()

        # One scatter panel plus one histogram per feature, on a two-column
        # grid with just enough rows; any unused trailing cell is removed.
        n_panels = len(self.feature_names) + 1
        n_rows = (n_panels + 1) // 2
        self.fig, axes = plt.subplots(n_rows, 2, figsize=(8, 8), squeeze=False)
        panels = axes.ravel()
        for spare in panels[n_panels:]:
            self.fig.delaxes(spare)
        self.ax_cluster = panels[0]
        self.ax_histograms = list(panels[1:n_panels])

        self.path_collection = self.ax_cluster.scatter(
            self.pts[:, 0], self.pts[:, 1], c=self.colors, **kwargs)
        self.ax_cluster.axis('equal')

        self.ind = np.arange(n_points)  # Initially, all points are selected
        self.pts_selected = self.pts

        # -1 marks a point that has not been assigned to any cluster yet.
        self.lbs = np.full(n_points, -1, dtype=int)
        self.num_clusters = 0

        self.lasso = LassoSelector(self.ax_cluster, onselect=self.onselect)
        self.press = self.fig.canvas.mpl_connect("key_press_event", self.press_key)

        # Initialize histograms for all data points
        self.plot_initial_histograms()

    def plot_initial_histograms(self):
        """Draw every feature histogram using all data points."""
        self._update_histograms(self.ax_histograms, self.features, self.feature_names)

    @staticmethod
    def _update_histogram(ax, features, feature_name):
        """Redraw ``ax`` as a bar chart of value counts, most frequent first."""
        ax.clear()
        feature_counts = Counter(features)
        sorted_counts = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)
        ax.bar([item[0] for item in sorted_counts], [item[1] for item in sorted_counts], color='red')
        ax.set_ylabel('Count')
        ax.set_title(f'Histogram of {feature_name}', fontsize=8)
        ax.tick_params(axis='x', labelrotation=45, labelsize=8)

    @staticmethod
    def _update_histograms(axs, features, feature_names):
        """Refresh one histogram per column of ``features``, then tighten the layout."""
        for ax, feature, name in zip(axs, features.T, feature_names):
            InteractiveCluster._update_histogram(ax, feature, name)
        plt.tight_layout()

    def onselect(self, verts):
        """Lasso callback: select the points inside the drawn polygon.

        ``self.ind`` is always replaced by the new selection, even when it is
        empty. The histograms and ``self.pts_selected`` are refreshed only for
        a non-empty selection.
        """
        path = Path(verts)
        self.ind = np.nonzero(path.contains_points(self.pts))[0]
        if self.ind.size != 0:
            self.pts_selected = self.pts[self.ind]

            # get the histograms of features
            selected_features = self.features[self.ind]
            self._update_histograms(self.ax_histograms, selected_features, self.feature_names)
            self.fig.canvas.draw_idle()

    def press_key(self, event):
        """Key callback: on Enter, store the current selection as a new cluster."""
        # Test the number of selected indices, not their truthiness:
        # ``ind.any()`` is False when the selection is exactly point 0.
        if event.key == "enter" and self.ind.size:
            self.lbs[self.ind] = self.num_clusters
            self.num_clusters += 1
            print("One cluster has been selected.")
    

In [27]:
# Read the trial CSV back from disk and print the resulting DataFrame.
# NOTE(review): absolute, user-specific path; it is the same literal the
# data-generation cell writes to, so this cell needs that file to exist.
projectedDataPath='/Users/IndrajitWadgaonkar/Desktop/TrialData.csv'
PData=pd.read_csv(projectedDataPath)
print(PData)

     Age Sex Stage         X         Y
0     48   F     A  0.901135  0.048055
1     48   F     B  0.377591  0.830606
2     20   F     B  0.478594  0.916629
3     12  NB     A  0.441247  0.811421
4      1   F     A  0.636957  0.765922
..   ...  ..   ...       ...       ...
995   26  NB     A  0.420665  0.386563
996   23   M     A  0.042848  0.568193
997   46   M     A  0.632939  0.167379
998   38  NB     A  0.719621  0.190171
999   18   M     B  0.212246  0.179544

[1000 rows x 5 columns]


In [35]:
# list() over a DataFrame yields its column labels, so this prints the column names.
print(list(PData))

['Age', 'Sex', 'Stage', 'X', 'Y']


In [34]:
# Select only the X and Y columns (a two-column DataFrame) and print them.
p=PData[['X','Y']]
print(p)

            X         Y
0    0.901135  0.048055
1    0.377591  0.830606
2    0.478594  0.916629
3    0.441247  0.811421
4    0.636957  0.765922
..        ...       ...
995  0.420665  0.386563
996  0.042848  0.568193
997  0.632939  0.167379
998  0.719621  0.190171
999  0.212246  0.179544

[1000 rows x 2 columns]
