In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import cm
from matplotlib.colors import LogNorm
from matplotlib.ticker import FuncFormatter
from scipy.ndimage.interpolation import geometric_transform
import statsmodels.api as sm
from pathlib import Path
import re
import os
from simcore_tools import FileRenamer
from sklearn.decomposition import PCA
from sklearn.cluster import (
    KMeans, SpectralClustering, AffinityPropagation, AgglomerativeClustering)
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import matplotlib.ticker as ticker
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
import matplotlib.patches as mpatches
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def find_param(string, param):
    """Extract the value that follows ``param`` in a file name.

    The directory part (everything up to the last '/') and the extension
    (everything after the last '.') are stripped first. The value is the
    text between ``param`` and the next underscore, or the end of the name
    if no underscore follows.

    Returns '000' when ``param`` is 'reload' and is absent from the name,
    and 'NA' for any other absent parameter.
    """
    stem = string.rpartition('/')[-1].rpartition('.')[0]
    start = stem.find(param)
    if start < 0:
        return '000' if param == 'reload' else 'NA'
    value_start = start + len(param)
    # Offset of the first underscore counted from the start of the match.
    offset = stem[start:].find('_')
    if offset < 0:
        return stem[value_start:]
    return stem[value_start:start + offset]


def createFileList(extension, dirName='.'):
    """Return a sorted pandas Series of the files in ``dirName`` whose
    extension (text after the last '.') equals ``extension``.

    ``dirName`` may be a string or a path-like object and defaults to the
    current working directory. Each entry is ``dirName`` joined with the
    file name.

    Raises ValueError if no matching file exists, or if any matching file
    name still contains a 'vXXX' version tag (three digits), which means
    the files were not renamed before the analysis.
    """
    # Normalise up front so the error message below can be built by string
    # concatenation even when a pathlib.Path is passed in.
    dir_str = os.fspath(dirName)
    file_names = sorted(
        os.path.join(dir_str, file) for file in os.listdir(dir_str)
        if file.rpartition('.')[-1] == extension)
    if not file_names:
        raise ValueError("No '" + extension + "' files found in directory '"
                         + dir_str + "'.")
    # Check every file (os.listdir order is arbitrary) and only the file
    # name itself, so a 'vXXX' in the directory path is not a false alarm.
    if any(re.search('v[0-9]{3}', os.path.basename(name)) for name in file_names):
        raise ValueError("Filename found with 'vXXX' in name. "
                         "Did you forget to rename the filenames before running the analysis?")
    return pd.Series(file_names)

def check_directory_exists(dirname):
    """Make sure ``dirname`` is an existing directory.

    If it is not, the user is asked interactively whether to create it.
    Answering 'y' or 'Y' creates the directory with os.mkdir (re-raising
    any failure after printing a message); any other answer raises
    ValueError.
    """
    if Path(dirname).is_dir():
        return
    print("Save directory not found:", dirname)
    answer = input("Create it? (y/N) ")
    if answer not in ('y', 'Y'):
        raise ValueError("Save directory does not exist:", dirname)
    try:
        os.mkdir(dirname)
    except Exception as e:
        print("Creation of directory", dirname, "failed!")
        raise e

def initializeDataFrame(dirName='.',
                        params=['pf', 'sp', 'lp'],
                        analyses=['global_order']):
    """Build a dataframe that indexes the analysis files in ``dirName``.

    For every extension in ``analyses`` the matching files are listed with
    createFileList and stored in a column named after that extension. The
    value of each entry of ``params`` (plus 'reload', which is always
    appended) is parsed out of the file name with find_param. The
    per-analysis tables are outer-merged on those parameter columns.

    Returns the merged frame with the parameter columns first, followed by
    one file-name column per analysis.
    """
    keys = params + ['reload']
    merged = None
    for analysis in analyses:
        frame = pd.DataFrame(data=createFileList(analysis, dirName), columns=[analysis])
        for key in keys:
            frame[key] = frame[analysis].apply(find_param, args=(key,))
        # The first table seeds the result; later ones are merged onto it.
        merged = frame if merged is None else pd.merge(merged, frame, how='outer', on=keys)
    return merged[keys + analyses]

def check_dataframe(df):
    """Verify that every analysis file listed in ``df`` can be parsed.

    Reads each file in the 'global_order', 'polar_order_avg' and 'flock'
    columns with the same pandas.read_csv layout options that the analysis
    functions in this notebook use. Prints a confirmation when all files
    were read.

    Raises ValueError naming the first file that cannot be read; the
    underlying exception is attached as its cause.
    """
    # (column, read_csv options) in the order the columns are checked.
    readers = (
        ('global_order', dict(skiprows=1, index_col='time')),
        ('polar_order_avg', dict(skiprows=1)),
        ('flock', dict(header=3, low_memory=False)),
    )
    for column, options in readers:
        for fname in getattr(df, column):
            try:
                # sep=r'\s+' replaces the deprecated delim_whitespace=True.
                pd.read_csv(fname, sep=r'\s+', **options)
            except Exception as err:
                raise ValueError("Failed to read file {}".format(fname)) from err
    print("Everything checks out")

In [None]:
def get_global_order_data(df, saveDirName=".", make_plots=True, params=['pf', 'sp', 'lp'], 
                          late_fraction=0.1, rolling_window=20):
    """Summarise the global order parameters for each parameter set.

    ``df`` is grouped by ``params``. For every group the files in its
    'global_order' column are read with GetGlobalOrderDF in sorted order
    and concatenated. If ``make_plots`` is true, one two-panel figure per
    group (polar/nematic order and spiral order/handedness, smoothed with
    a rolling mean over ``rolling_window`` rows) is saved to
    ``saveDirName``.

    Returns a dataframe with one row per group holding the parameter
    values and the mean and standard deviation of the polar, nematic and
    spiral order over the last ``late_fraction`` of the rows.
    """
    analyze = 'global_order'
    check_directory_exists(saveDirName)
    row_list = []
    for values, group in df.groupby(params):
        # Depending on the pandas version a single grouping key may come
        # back as a bare scalar rather than a 1-tuple.
        if not isinstance(values, tuple):
            values = (values,)
        string_values = '_'.join('{}{}'.format(p, v) for p, v in zip(params, values))
        display_values = ', '.join('{}={}'.format(p, v) for p, v in zip(params, values))

        print("Gathering", analyze, "data for parameters", display_values)
        # DataFrame.append no longer exists in pandas >= 2.0; collect the
        # frames and concatenate once. A single file keeps its own index,
        # several files are renumbered, exactly as before.
        frames = [GetGlobalOrderDF(file) for file in group[analyze].sort_values()]
        goDF = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)

        if make_plots:
            plotDF = goDF.rolling(rolling_window).mean().dropna()
            fig, ax = plt.subplots(1, 2, figsize=(12, 6))
            plotDF.plot(y="nematic_order_mag", color='blue', linewidth=1, ax=ax[0])
            plotDF.plot(y="polar_order_mag", color='red', linewidth=1, ax=ax[0])
            ax[0].set_xlabel('Time')
            ax[0].set_ylabel('Orientational order')
            ax[0].legend(['Nematic order', 'Polar order'])
            ax[0].set_ylim(0, 1)
            plotDF.plot(y="spiral_order", color='blue', linewidth=1, ax=ax[1])
            plotDF.plot(y="signed_spiral_order", color='red', linewidth=1, ax=ax[1])
            ax[1].set_xlabel('Time')
            ax[1].legend(['Spiral order', 'Spiral handedness'])
            ax[1].set_ylabel('Spiral order')
            fig.tight_layout(rect=[0, 0.03, 1, 0.95])
            fig.suptitle("Global order parameters: " + display_values)
            print("Saving", analyze, "plots for parameters", display_values)
            fig.savefig(Path(saveDirName, string_values + "_global_order.png"))
            plt.close(fig)

        # Statistics over the trailing late_fraction of the rows.
        late_time = int((1 - late_fraction)*goDF.shape[0])
        late = goDF.iloc[late_time:]
        row = dict(zip(params, values))
        row.update({
            'global_polar': late['polar_order_mag'].mean(),
            'global_polar_std': late['polar_order_mag'].std(),
            'global_nematic': late['nematic_order_mag'].mean(),
            'global_nematic_std': late['nematic_order_mag'].std(),
            'global_spiral': late['spiral_order'].mean(),
            'global_spiral_std': late['spiral_order'].std(),
        })
        row_list.append(row)
    return pd.DataFrame(row_list)


def CalculateGlobalOrderMagnitudes(df):
    """Add 'polar_order_mag' and 'nematic_order_mag' columns to ``df``.

    'polar_order_mag' is the Euclidean norm of the polar order vector
    (x, y, z components). 'nematic_order_mag' is, row by row, the largest
    eigenvalue of the 2x2 nematic order tensor built from the xx, xy, yx
    and yy columns. ``df`` is modified in place; nothing is returned.
    """
    polar_sq = df.polar_order_x**2 + df.polar_order_y**2 + df.polar_order_z**2
    df['polar_order_mag'] = np.sqrt(polar_sq)

    def _row_nematic_order(row):
        # Largest eigenvalue of this row's 2x2 nematic tensor.
        return maxEig(row['nematic_order_xx'], row['nematic_order_xy'],
                      row['nematic_order_yx'], row['nematic_order_yy'])

    df['nematic_order_mag'] = df.apply(_row_nematic_order, axis=1)

    
def maxEig(xx,xy,yx,yy):
    """Return the largest eigenvalue of the 2x2 matrix
    [[xx, xy], [yx, yy]].
    """
    matrix = np.array([[xx, xy], [yx, yy]])
    eigenvalues, _eigenvectors = np.linalg.eig(matrix)
    return max(eigenvalues)

    
def GetGlobalOrderDF(fname):
    """Read a .global_order file and return it as a dataframe.

    The file is parsed as whitespace-separated text, skipping its first
    line and using the 'time' column as index; rows containing NaN are
    dropped. The polar order magnitude and the largest nematic-tensor
    eigenvalue are then added by CalculateGlobalOrderMagnitudes.

    ``fname`` must be a string (checked with an assertion).
    """
    assert isinstance(fname,str), "'fname' must be a string!"
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    df = pd.read_csv(fname, sep=r'\s+', skiprows=1,
                     index_col='time').dropna()
    CalculateGlobalOrderMagnitudes(df)
    return df


In [None]:
# The time step must be passed in because the output file header does not provide it.
def get_polar_order_avg_data(df, time_step, saveDirName=".", make_plots=True, 
                             params=['pf', 'sp', 'lp'], late_fraction=0.1, rolling_window=20):
    """Summarise the averaged local polar order for each parameter set.

    ``df`` is grouped by ``params``. For every group the files in its
    'polar_order_avg' column are read in sorted order (whitespace
    separated, first line skipped) and concatenated. A 'time' column is
    added as row number times ``time_step``. If ``make_plots`` is true,
    one figure per group showing the rolling means of 'avg_polar_order'
    and 'avg_contact_number' is saved to ``saveDirName``.

    Returns a dataframe with one row per group holding the parameter
    values and the mean and standard deviation of both quantities over the
    last ``late_fraction`` of the rows.
    """
    analyze = 'polar_order_avg'
    check_directory_exists(saveDirName)
    row_list = []
    for values, group in df.groupby(params):
        # Depending on the pandas version a single grouping key may come
        # back as a bare scalar rather than a 1-tuple.
        if not isinstance(values, tuple):
            values = (values,)
        string_values = '_'.join('{}{}'.format(p, v) for p, v in zip(params, values))
        display_values = ', '.join('{}={}'.format(p, v) for p, v in zip(params, values))
        print("Gathering", analyze, "data for parameters", display_values)

        # DataFrame.append no longer exists in pandas >= 2.0 and
        # delim_whitespace is deprecated; use pd.concat and sep=r'\s+'.
        frames = [pd.read_csv(file, skiprows=1, sep=r'\s+')
                  for file in group[analyze].sort_values()]
        goDF = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)
        goDF['time'] = goDF.index * time_step
        if make_plots:
            fig, ax = plt.subplots(1, 1, figsize=(6, 6))
            plotDF = goDF.rolling(rolling_window).mean().dropna()
            plotDF.plot(x="time", y="avg_polar_order", color='red',
                        linewidth=1, ax=ax, label=r'$\langle p_i \rangle$')
            plotDF.plot(x="time", y="avg_contact_number", color='blue',
                        linewidth=1, ax=ax, label=r'$\langle c_i \rangle$')
            ax.set_xlabel('Time')
            ax.set_ylabel('Order parameter')
            ax.set_title('Average local polar order: ' + display_values)
            ax.legend(loc='best')
            fig.tight_layout(rect=[0, 0.03, 1, 0.95])
            print("Saving", analyze, "plots for parameters", display_values)
            fig.savefig(Path(saveDirName, string_values + "_local_polar_order_avg.png"))
            plt.close(fig)

        # Statistics over the trailing late_fraction of the rows.
        late_start = int((1 - late_fraction)*goDF.shape[0])
        late = goDF.iloc[late_start:]
        row = dict(zip(params, values))
        row.update({
            'avg_polar_order': late['avg_polar_order'].mean(),
            'avg_polar_order_std': late['avg_polar_order'].std(),
            'avg_contact_number': late['avg_contact_number'].mean(),
            'avg_contact_number_std': late['avg_contact_number'].std(),
        })
        row_list.append(row)
    return pd.DataFrame(row_list)

In [None]:
def get_flock_df(fname):
    """Read a .flock file and return per-step flocking fractions and
    normalised state-switching rates.

    File layout as read here: the line at index 1 names the header fields
    (including 'nspec' and 'delta') and the next line holds their values;
    the line at index 3 names the data columns. Columns whose name starts
    with 'fil' hold the per-filament flock state, the others include
    'n_flocking', 'n_exterior' and 'n_interior'.

    Returned columns:
      f_<from>_<to>  number of filaments switching from state <from> to
                     state <to> (not/ext/int) between consecutive rows,
                     divided by the number of filaments in state <from>;
      n_not, n_ext, n_int, n_tot  fractions of all filaments;
      time           row index times 0.5 * nspec * delta.
    The first and last rows are dropped and NaNs are replaced by 0.
    """
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    df = pd.read_csv(fname, header=3, low_memory=False, sep=r'\s+')
    header = pd.read_csv(fname, header=1, nrows=1, sep=r'\s+')
    filcols = [col for col in df.columns if col[:3] == 'fil']
    flockcols = [col for col in df.columns if col[:3] != 'fil']

    flock_global = df[flockcols].dropna()
    flock_global = flock_global[['n_flocking', 'n_exterior', 'n_interior']]

    flockstates = df[filcols].dropna().values
    n_filaments = flockstates.shape[1]

    # Recode state 2 as 3 so that each of the six possible transitions
    # between states {0, 1, 3} yields a distinct difference (see
    # change_state below).
    flockstates[flockstates == 2] = 3
    diffs = pd.DataFrame(np.diff(flockstates, axis=0))

    freqs = ['f_not_ext', 'f_not_int', 'f_ext_int','f_ext_not', 'f_int_ext', 'f_int_not']
    flock_state = ['n_not', 'n_ext', 'n_int']
    # State difference that identifies each transition in freqs, in order.
    change_state = [1, 3, 2, -1, -2, -3]
    df = pd.DataFrame(columns=freqs + flock_state)
    df['n_not'] = n_filaments - flock_global['n_flocking']
    df['n_ext'] = flock_global['n_exterior']
    df['n_int'] = flock_global['n_interior']
    for freq, state in zip(freqs, change_state):
        # Characters 2-4 of the column name give the source state, whose
        # population normalises the switching count.
        if freq[2:5] == 'int':
            denom = df['n_int']
        elif freq[2:5] == 'ext':
            denom = df['n_ext']
        elif freq[2:5] == 'not':
            denom = df['n_not']
        else:
            raise ValueError("Unexpected frequency")
        df[freq] = diffs[diffs==state].count(axis=1) / denom
    df = df.iloc[1:-1, :].fillna(0)
    step = 0.5 * header['nspec'][0] * header['delta'][0]
    df['time'] = df.index * step
    for state in flock_state:
        df[state] = df[state] / n_filaments
    df['n_tot'] = 1 - df['n_not']
    return df

def plot_flock_state(df, display_string, save_string, rolling_window=20):
    """Plot and save the flocking fractions and switching rates in ``df``.

    ``df`` needs a 'time' column, the 'n_tot', 'n_ext' and 'n_int'
    fraction columns and the 'f_*' switching-rate columns. All columns are
    smoothed with a rolling mean over ``rolling_window`` rows. The left
    panel shows the flocking fractions, the right panel the switching
    rates. ``display_string`` goes into the figure title and the figure is
    written to ``save_string``.
    """
    plotDF = df.rolling(rolling_window).mean().dropna()
    # Label the smoothed rows with the first len(plotDF) time values. The
    # values are assigned positionally: assigning the Series itself would
    # align on the index, and since the rolling mean drops the leading
    # rows most labels would not match and would become NaN.
    plotDF['time'] = df['time'].iloc[:plotDF.shape[0]].values
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    ax[0].set_title('Fraction of flocking filaments')
    ax[0].plot(plotDF['time'], plotDF['n_tot'], label='total')
    ax[0].plot(plotDF['time'], plotDF['n_ext'], label='exterior')
    ax[0].plot(plotDF['time'], plotDF['n_int'], label='interior')
    ax[0].set_xlabel('Time')
    ax[0].set_ylabel('Filament fraction')
    ax[0].legend(loc='best')
    freq_cols = [col for col in df.columns if col[:2] == 'f_']
    plotDF[['time'] + freq_cols].plot(x='time', ax=ax[1], title='Normalized flock switching rates')
    ax[1].set_ylabel('Frequency')
    ax[1].set_xlabel('Time')
    fig.suptitle("Flock dynamics: "+display_string)
    fig.savefig(save_string)
    print("Saving plots for parameters", display_string)
    plt.close(fig)

def get_flock_data(df, saveDirName=".", make_plots=True, params=['pf', 'sp', 'lp'],
                   late_fraction=0.1, rolling_window=20):
    """Summarise the flocking statistics for each parameter set.

    ``df`` is grouped by ``params``. For every group the files in its
    'flock' column are processed with get_flock_df in sorted order and
    concatenated; the 'time' column is then rebuilt as row index times the
    first time value. If ``make_plots`` is true, one figure per group is
    saved to ``saveDirName`` via plot_flock_state.

    Returns a dataframe with one row per group holding the parameter
    values and, for every non-time column of the flock data, its mean
    (``<name>``) and standard deviation (``<name>_std``) over the last
    ``late_fraction`` of the rows.
    """
    analyze = 'flock'
    check_directory_exists(saveDirName)
    row_list = []
    for values, group in df.groupby(params):
        # Depending on the pandas version a single grouping key may come
        # back as a bare scalar rather than a 1-tuple.
        if not isinstance(values, tuple):
            values = (values,)
        string_values = '_'.join('{}{}'.format(p, v) for p, v in zip(params, values))
        display_values = ', '.join('{}={}'.format(p, v) for p, v in zip(params, values))
        print("Gathering", analyze, "data for parameters", display_values)

        # DataFrame.append no longer exists in pandas >= 2.0; collect the
        # frames and concatenate once. A single file keeps its own index,
        # several files are renumbered, exactly as before.
        frames = [get_flock_df(file) for file in group[analyze].sort_values()]
        flock_df = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)
        flock_df['time'] = flock_df.index * flock_df['time'].iloc[0]

        if make_plots:
            plot_flock_state(flock_df, display_values,
                             Path(saveDirName, string_values + "_flock.png"),
                             rolling_window)

        # Statistics over the trailing late_fraction of the rows.
        late_time = int((1 - late_fraction)*flock_df.shape[0])
        late = flock_df.iloc[late_time:]
        row = dict(zip(params, values))
        for name in flock_df.columns:
            if name == 'time':
                continue
            row[name] = late[name].mean()
            row[name + '_std'] = late[name].std()
        row_list.append(row)
    return pd.DataFrame(row_list)

In [None]:
# Parameter tags parsed out of the file names.
params = ['pf', 'sp', 'lp', 'dr']
# File extensions of the analysis outputs to collect.
analyses = ['global_order', 'polar_order_avg', 'flock']
# Directory holding the analysis files.
data_dir = Path('data') / 'order_params'

In [None]:
# YAML files in data_dir whose name contains "_v" (matched by "*_v*.yaml").
yamls = list(data_dir.glob("*_v*.yaml"))
yamls

In [None]:
# Regex for the file-name fragment to be replaced (fixed prefix plus a
# three-digit version tag).
to_replace = "pf0.05_dr3_v[0-9]{3}"
# Format template for the replacement fragment; it has four fields, one per
# entry of formatter_contents below.
replacement = "pf{}_sp{:03d}_lp{:04d}_dr{:02d}"
# NOTE(review): presumably the YAML parameter keys whose values fill the
# template fields - confirm against simcore_tools.FileRenamer.
formatter_contents = ['filament:packing_fraction', 'soft_potential_mag',
                      'filament:perlen_ratio', 'filament:driving_factor']
renamer = FileRenamer(to_replace, replacement, formatter_contents)

In [None]:
# Sub-directory that receives the processed YAML files.
p = Path(data_dir, 'old_params')
p.mkdir(exist_ok=True)
for file in yamls:
    # NOTE(review): FileRenamer is project code not shown here; presumably this
    # renames the run's output files based on the YAML contents - confirm.
    renamer.rename(file, confirm=True)
    # Move the YAML file itself into old_params/ (overwrites an existing file).
    file.replace(Path(p, file.name))

In [None]:
# Index all analysis files in data_dir by the parameters in their names.
df = initializeDataFrame(data_dir, params, analyses)
df.head()

In [None]:
# Fail early if any listed analysis file cannot be parsed.
check_dataframe(df)

In [None]:
# tmpdf = df[((df.lp.astype(int) < 40) & (df.reload == '000'))]
# tmp_go_results = get_global_order_data(tmpdf, 'order_params/plots', make_plots=False, params=params,
#                                   late_fraction=0.1, rolling_window=20)
# tmp_poa_results = get_polar_order_avg_data(tmpdf, time_step=0.05, saveDirName='order_params/plots', make_plots=False)
# tmp_flock_results = get_flock_data(tmpdf, 'order_params/plots', make_plots=False)

In [None]:
# labels_to_drop = []
# for ix in df[((df.reload=='000') & (df.lp.astype(int) < 20))].index:
#     labels_to_drop.append(ix+1)
#     labels_to_drop.append(ix+2)
#     labels_to_drop.append(ix+3)

In [None]:
#df = df.drop(df[(df.sp.astype(int) == 40) & ((df.lp.astype(int) == 10) | (df.lp.astype(int) == 20))].index)

In [None]:
#df = df.drop(df[(df.sp.astype(int) == 200) & ((df.lp.astype(int) == 10) | (df.lp.astype(int) == 20))].index)

In [None]:
#df = df.drop(df[(df.sp.astype(int) == 90) & (df.lp.astype(int) == 200)].index)

In [None]:
# df = df.drop(index=labels_to_drop)

In [None]:
# df

In [None]:
# Global order statistics per parameter set; plots go to <data_dir>/plots and
# the summary table is pickled next to the data.
go_results = get_global_order_data(df, Path(data_dir, 'plots'), make_plots=True, params=params,
                                  late_fraction=0.1, rolling_window=20)
pd.to_pickle(go_results, Path(data_dir, 'go_results.pkl'))

In [None]:
# Averaged local polar order statistics per parameter set, using a time step
# of 0.05 between rows; the summary table is pickled next to the data.
poa_results = get_polar_order_avg_data(df, time_step=0.05, saveDirName=Path(data_dir, 'plots'),
                                       make_plots=True, params=params)
pd.to_pickle(poa_results, Path(data_dir, 'poa_results.pkl'))


In [None]:
# Flocking statistics per parameter set; plots go to <data_dir>/plots.
flock_results = get_flock_data(df, Path(data_dir, 'plots'), make_plots=True, params=params)

In [None]:
# Persist the flocking summary table.
pd.to_pickle(flock_results, Path(data_dir, 'flock_results.pkl'))

In [None]:
# Combine the three summary tables into one row per parameter set. The merge
# keys come from `params` (the list the tables were grouped by) rather than a
# second hard-coded copy of it.
k1 = pd.merge(go_results, poa_results, how='inner', on=params)
k2 = pd.merge(k1, flock_results, how='inner', on=params)

In [None]:
# Split the merged table into the mean values and the standard deviations
# (columns ending in 'std'); both keep the parameter columns, taken from
# `params` instead of a second hard-coded copy of it.
std_cols = [col for col in k2.columns if col[-3:] == 'std']
results = k2[[col for col in k2.columns if col[-3:] != 'std']]
results_std = k2[list(params) + std_cols]
pd.to_pickle(results, Path(data_dir, 'results.pkl'))
pd.to_pickle(results_std, Path(data_dir, 'results_std.pkl'))

In [None]:
# NOTE: this replaces the `results` table built above with a previously saved
# one; the file pkls/results_no_lowlp is not written anywhere in this notebook.
# pd.read_pickle can execute arbitrary code - only load trusted files.
results = pd.read_pickle(Path(data_dir,'pkls/results_no_lowlp'))
# Training subset: sp < 200 and lp <= 100.
results_train = results[results.sp.astype(int) < 200]
results_train = results_train[results_train.lp.astype(int) <= 100]
#results_train = results

In [None]:
n_clusters = 5
# Standardise the feature columns (everything after the parameter columns).
# The scaler is fitted on the training subset only and then applied unchanged
# to the full table, so both matrices live in the same scaled space; calling
# fit_transform a second time would re-fit the scaler on the full table.
scaler = StandardScaler()
X_train = scaler.fit_transform(results_train.iloc[:, len(params):])
X = scaler.transform(results.iloc[:, len(params):])

In [None]:
#X = scaler.fit_transform(results.iloc[:, len(params):])
pca = PCA(n_components=5)
pca_fit = pca.fit(X_train)
X_train = np.matmul(X_train, pca.components_.transpose())
X = np.matmul(X, pca.components_.transpose())

In [None]:
def plot_cluster(cluster_fit, tsne_fit, X, results, save_to_dir='.', color_bg=False,
                 bg_fitter=KNeighborsClassifier(n_neighbors=3, metric='manhattan'),
                 bg_interpolation='gaussian', color_order = None, forced_labels=None):
    """Plot a clustering result in (sp, lp) space next to a t-SNE embedding.

    Parameters
    ----------
    cluster_fit : clustering estimator; ``fit_predict(X)`` supplies the
        labels unless ``forced_labels`` is given. Its lower-cased class
        name is used in the title and the output file name.
    tsne_fit : fitted object with an ``embedding_`` attribute that has one
        row per label.
    X : feature matrix handed to ``cluster_fit.fit_predict``.
    results : dataframe with 'sp' and 'lp' columns, one row per label.
    save_to_dir : directory for "clustering_fit_<algo>.png".
    color_bg : if true, ``bg_fitter`` is trained on the (sp, lp) points
        and their labels and its predictions on a unit-spaced mesh are
        drawn as a background image.
    bg_fitter : sklearn classifier used for the background. Note that the
        default instance is shared between calls and re-fitted each time.
    bg_interpolation : interpolation passed to ``imshow``.
    color_order : optional colour keys passed to get_dark_colors and
        get_light_colors.
    forced_labels : optional integer labels used instead of clustering.

    Raises ValueError if ``bg_fitter`` is not an sklearn object or fails
    to fit.
    """
    cluster_algo = type(cluster_fit).__name__.lower()
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    if forced_labels is not None:
        labels = forced_labels
    else:
        labels = cluster_fit.fit_predict(X)
    light_colors = get_light_colors(labels, color_order)
    dark_colors = get_dark_colors(labels, color_order)
    sps = results['sp'].astype(int)
    lps = results['lp'].astype(int)
    if color_bg:
        if str(getattr(bg_fitter, '__module__', '')).partition('.')[0] != 'sklearn':
            raise ValueError("bg_fitter variable must point to initialized sklearn fitting function!")
        x_min, x_max = sps.min() - 5, sps.max() + 5
        y_min, y_max = lps.min() - 5, lps.max() + 5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 1),
                             np.arange(y_min, y_max, 1))
        try:
            # Train on the actual (sp, lp) coordinate of every sample so the
            # points always pair up with their labels, even when the samples
            # are not an ordered, complete sp x lp grid.
            grid_fit = bg_fitter.fit(np.column_stack([sps, lps]), labels)
        except Exception as err:
            raise ValueError("bg_fitter failed to fit bg color grid!") from err
        grid_fit_predict = grid_fit.predict(np.c_[xx.ravel(), yy.ravel()])

        bg_colors = np.array([dark_colors[i] for i in grid_fit_predict]).reshape((xx.shape[0], xx.shape[1], 4))
        # Give the image the data coordinates of the mesh (each pixel is
        # centred on its integer mesh point, row 0 at the bottom); without
        # an extent it is drawn in pixel coordinates and ends up shifted
        # relative to the scatter points.
        ax[0].imshow(bg_colors, interpolation=bg_interpolation, aspect='auto', alpha=0.4, zorder=-100,
                     origin='lower',
                     extent=(x_min - 0.5, x_max - 0.5, y_min - 0.5, y_max - 0.5))
    ax[0].scatter(sps, lps, c=[dark_colors[i] for i in labels])
    ax[0].set_xlabel(r"$\epsilon$ ($k_BT$)")
    ax[0].set_ylabel("$L_p/L$")
    ax[0].set_title("Clustering classification: " + cluster_algo)

    ax[1].scatter(tsne_fit.embedding_[:,0], tsne_fit.embedding_[:,1], c=[light_colors[i] for i in labels])
    ax[1].set_xlabel("Embedded space, x")
    ax[1].set_ylabel("Embedded space, y")
    ax[1].set_title("TSNE Projection")
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.savefig(Path(save_to_dir, "clustering_fit_" + cluster_algo + ".png"), dpi=600)
    plt.show()
    plt.close(fig)

In [None]:
def get_index(sp, lp):
    """Return the index label of the first row of the notebook-level
    ``results`` table whose integer 'sp' and 'lp' values equal ``sp`` and
    ``lp``. Raises IndexError if there is no such row.
    """
    lp_match = results.lp.astype(int) == lp
    sp_match = results.sp.astype(int) == sp
    return results[lp_match & sp_match].index[0]

def get_dark_colors(labels, order=None):
    """Return the dark colour palette as a list of RGBA lists in [0, 1].

    ``order`` is an optional sequence of colour keys ('r', 'g', 'b', 'c',
    'o') selecting and ordering the palette; the default order is red,
    green, blue, teal, orange. ``labels`` is accepted but not used.
    """
    palette = {
        'r': (139, 0, 0),
        'g': (0, 100, 0),
        'b': (0, 0, 128),
        'c': (0, 128, 128),
        'o': (184, 134, 11),
    }
    keys = order if order is not None else ['r', 'g', 'b', 'c', 'o']
    # Append a fully opaque alpha channel and scale 0-255 values to 0-1.
    return [[channel / 255 for channel in (*palette[key], 255)] for key in keys]

def get_light_colors(labels, order=None):
    """Return the light colour palette as a list of RGBA lists in [0, 1].

    ``order`` is an optional sequence of colour keys ('r', 'g', 'b', 'c',
    'o') selecting and ordering the palette; the default order is red,
    green, blue, teal, orange. ``labels`` is accepted but not used.
    """
    palette = {
        'r': (205, 0, 0),
        'g': (0, 205, 0),
        'b': (0, 0, 205),
        'c': (0, 150, 150),
        'o': (218, 165, 32),
    }
    keys = order if order is not None else ['r', 'g', 'b', 'c', 'o']
    # Append a fully opaque alpha channel and scale 0-255 values to 0-1.
    return [[channel / 255 for channel in (*palette[key], 255)] for key in keys]

#    colors = cmap(np.linspace(0, 1, len(np.unique(labels))))
    #return [colors[i] for i in labels]


In [None]:
# Draw a random seed; it is also used as random_state by the KMeans cells
# below. Uncomment the fixed value to reproduce a particular run.
state = np.random.randint(100, 10000)
#state = 1158
# Two-dimensional t-SNE embedding of the projected training features.
tsne = TSNE(perplexity=14, random_state=state, metric='euclidean').fit(X_train)
plt.figure()
plt.scatter(tsne.embedding_[:,0], tsne.embedding_[:,1])
plt.show()

In [None]:
# Classifiers used to colour the background of the phase-diagram plots.
knn = KNeighborsClassifier(n_neighbors=1, weights='uniform', p=1)
# max_iter is an iteration count and must be an integer (1e8 is a float).
svc = SVC(kernel=30*RBF(25), gamma='auto', tol=1e-6,
          probability=True, max_iter=100000000)


In [None]:
# Cluster the projected training features. The estimator is created here so
# this cell runs on a fresh kernel instead of relying on a `kmeans` object
# defined in a later cell.
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=20, max_iter=500,
                random_state=state)
labels = kmeans.fit_predict(X_train)

In [None]:
# Manual override: force the sample 36th from the end into cluster 2.
# NOTE(review): the position is hard-coded and depends on the row order of
# the training table - confirm it still targets the intended sample.
labels[-36] = 2

In [None]:
# Manual override: force the sample 12th from the end into cluster 0.
labels[-11-1] = 0

In [None]:
# Manual override: force the sample 53rd from the end into cluster 2.
labels[-55+2] = 2

In [None]:
#state = np.random.randint(100, 10000)
# default color_order = ['r', 'g', 'b', 'c', 'o']
# Fit k-means on the full feature matrix, then plot the training subset.
# Because forced_labels is passed, plot_cluster uses `labels` (including the
# manual overrides above) and does not re-cluster.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=20, max_iter=500,
                random_state=state).fit(X)
plot_cluster(kmeans, tsne, X_train, results_train,
             Path(data_dir, 'plots'), True, bg_fitter=svc, bg_interpolation='gaussian',
             color_order = ['b','g','r','c','o'], forced_labels=labels)

In [None]:
# Spectral clustering with 4 clusters. plot_cluster calls fit_predict on
# X_train, so the plotted labels come from a fit on the training subset.
spectral = SpectralClustering(n_clusters=4, gamma=0.01).fit(X)
plot_cluster(spectral, tsne, X_train, results_train, Path(data_dir, 'plots'), True, 
             bg_fitter=svc, bg_interpolation='gaussian',)

In [None]:
# Agglomerative clustering with 4 clusters. plot_cluster calls fit_predict on
# X_train, so the plotted labels come from a fit on the training subset.
agglom = AgglomerativeClustering(n_clusters=4).fit(X)
plot_cluster(agglom, tsne, X_train, results_train, Path(data_dir, 'plots'), True,
             bg_fitter=svc, bg_interpolation='gaussian',)

In [None]:
# Gaussian mixture with 4 components. plot_cluster calls fit_predict on
# X_train, so the plotted labels come from a fit on the training subset.
gmix = GaussianMixture(n_components=4, max_iter=1000, n_init=10).fit(X)
plot_cluster(gmix, tsne, X_train, results_train, Path(data_dir, 'plots'), True,
            bg_fitter=svc, bg_interpolation='gaussian',)

In [None]:
# Inertia of the most recent k-means fit.
kmeans.inertia_

In [None]:
# Same inspection as the previous cell (duplicate).
kmeans.inertia_

In [None]:
# Scratch directory for the cluster-count scan below; exist_ok lets the cell
# be re-run without raising FileExistsError.
Path('./tmp_plots').mkdir(exist_ok=True)

In [None]:
# Scan the number of clusters and record the k-means inertia on the full
# feature matrix for each value.
inertias = []
# plot_cluster indexes the colour palette by cluster label, so it can only
# draw as many clusters as the palette has colours.
n_colors = len(get_dark_colors(None))
for nc in range(1, 10):
    km_tmp = KMeans(n_clusters=nc, init='k-means++', n_init=20, max_iter=500, random_state=state).fit(X)
    inertias.append(km_tmp.inertia_)
    if nc <= n_colors:
        # plot_cluster takes (cluster_fit, tsne_fit, X, results, save_to_dir).
        # The t-SNE embedding was computed from X_train, so the training
        # features and table are passed to keep one label per embedded
        # point. The inertia is recorded above, before plot_cluster re-fits
        # the estimator on X_train.
        # NOTE: every iteration writes the same file name in tmp_plots.
        plot_cluster(km_tmp, tsne, X_train, results_train, Path('./tmp_plots'))

In [None]:
# Elbow plot: the scan starts at one cluster, so the x values are 1..N rather
# than the default 0-based positions.
plt.plot(range(1, len(inertias) + 1), inertias)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia');

In [None]:
#Xdata = X.copy()

# Labeling the training data for supervised learning
#y=[0,0,0, 0,0,1, 1,1,2, 1,2,2, 3,4,4, 3,4,4, 2,2] 
y = kmeans.fit_predict(X)
# The classifier fit and the mesh prediction that used to follow here needed
# `xx`, `yy` and `colors`, none of which exist yet at this point; the same
# steps are carried out in the next cell once those are defined.

In [None]:
xx = []
sps = results.sp.astype(int).unique()
lps = results.lp.astype(int).unique()
# Build the dataset of training points
for s in sps:
    for l in lps:
        xx.append([s,l])
        
#X.append([30,100])
#X.append([60,100])

xx = np.array(xx)

# Fit the data using a gaussian kernel
# The variance and magnitude of the kernel was found by
# trial and error to generate sensible phase boundaries.
kernel = 50*RBF(20)
#clf = SVC(kernel=kernel, gamma='auto', tol=1e-6,
#          probability=True, max_iter=1e8).fit(xx, y)
clf = KNeighborsClassifier(n_neighbors=5, metric='manhattan').fit(xx, y)
# (A prediction/reshape step that stood here used `yy` and `colors` before
# they were defined; the prediction is done once at the end of this cell.)

# create a mesh for the colorplot
x_min, x_max = xx[:, 0].min() - 10, xx[:, 0].max() + 10
y_min, y_max = xx[:, 1].min() - 10, xx[:, 1].max() + 10

# NOTE(review): the mesh has ((x_max - x_min) / h) * ((y_max - y_min) / h)
# points; confirm that h = 0.01 is affordable for the sp/lp range in use.
h = .01  # step size in the mesh
# From here on `xx` and `yy` hold the mesh coordinates, not the training grid.
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
alpha = 0.4
red = [1,0,0,alpha]
lime = [0,1,0,alpha]
blue = [0,0,1,alpha]
cyan = [0,1,1,alpha]
magenta = [1,0,1,alpha]

red_patch = mpatches.Patch(color=red[:3],
                           label='Active isotropic')
lime_patch = mpatches.Patch(color=lime[:3],
                            label='Flocking')
blue_patch = mpatches.Patch(color=blue[:3],
                            label='Polar band')
cyan_patch = mpatches.Patch(color=cyan[:3],
                            label='Spooling')
magenta_patch = mpatches.Patch(color=magenta[:3],
                               label='Turbulent')
colors = np.array([red, lime, blue, cyan, magenta])

#plt.figure(figsize=(8, 8))

# Predict a class for every mesh point in [x_min, x_max]x[y_min, y_max]
# and map each class to its RGBA colour.

Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = np.array([colors[i] for i in Z])

# Put the result into a color plot
Z = Z.reshape((xx.shape[0], xx.shape[1], 4))


In [None]:
# Show the predicted class colours in pixel coordinates (no extent given).
fig = plt.figure()
#ax = fig.gca(projection='3d')

plt.imshow(Z, origin="lower")

# Plot the training points

In [None]:
# Same image, placed in data coordinates through the mesh bounds.
fig = plt.figure()
#ax = fig.gca(projection='3d')

plt.imshow(Z, origin="lower",extent=(x_min, x_max, y_min, y_max))

# Plot the training points
#plt.scatter(xx[:, 0], xx[:, 1])#, #s=80,
            #c=[list(colors[i][:3]) for i in y], 
            #edgecolors=(0, 0, 0))

In [None]:
# Distinct integer sp and lp values present in the results table.
sps = results.sp.astype(int).unique()
lps = results.lp.astype(int).unique()

In [None]:
# Largest sp value.
sps.max()

In [None]:
# Number of labels currently held in `labels`.
labels.shape

In [None]:
# Labels of the full feature matrix from the current k-means fit.
labels = kmeans.predict(X)

# The two plotting calls below are kept for reference only: `ax` and
# `bg_colors` are local to plot_cluster / plot_cluster_dummy and are not
# defined at notebook level in the cells shown, so running them here raised
# NameError.
#bg_colors = np.rot90(
#   np.array(predict_colors).reshape((results['sp'].nunique(), results['lp'].nunique(), 4))
#)[::-1]
#ax[0].imshow(bg_colors, interpolation='gaussian', aspect='auto', alpha=0.7, zorder=-100)
#ax[0].invert_yaxis()

In [None]:
# Number of values in an RGBA array holding one colour per (sp, lp) pair.
sps.shape[0] * lps.shape[0] * 4

In [None]:
# Inspect the distinct lp values.
lps

In [None]:
def plot_cluster_dummy(sps, lps, labels, save_to_dir='.', color_bg=False,
                 bg_fitter=KNeighborsClassifier(n_neighbors=3, metric='manhattan'),
                 bg_interpolation='gaussian', color_order = None, save_name='dummy'):
    """Plot hand-assigned phase labels in (sp, lp) space.

    Parameters
    ----------
    sps, lps : equal-length arrays with the sp and lp value of each point.
    labels : integer class per point, used as an index into the colour
        palette.
    save_to_dir, save_name : the figure is written to
        "<save_to_dir>/<save_name>_clustering_fit_kmeans.png".
    color_bg : if true, ``bg_fitter`` is trained on the points and its
        predictions on a unit-spaced mesh starting at (0, 0) are drawn as
        a background image.
    bg_fitter : sklearn classifier used for the background. Note that the
        default instance is shared between calls and re-fitted each time.
    bg_interpolation : interpolation passed to ``imshow``.
    color_order : optional colour keys passed to get_dark_colors.

    The right panel shows the embedding of the notebook-level ``tsne``
    object, uncoloured. Raises ValueError if ``bg_fitter`` is not an
    sklearn object or fails to fit.
    """
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    dark_colors = get_dark_colors(labels, color_order)
    if color_bg:
        if str(getattr(bg_fitter, '__module__', '')).partition('.')[0] != 'sklearn':
            raise ValueError("bg_fitter variable must point to initialized sklearn fitting function!")
        # The mesh starts at 0 so that image pixel indices coincide with
        # the sp/lp data coordinates of the scatter plot.
        x_min, x_max = 0, np.max(sps) + 5
        y_min, y_max = 0, np.max(lps) + 5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 1),
                             np.arange(y_min, y_max, 1))
        try:
            # Train on the actual (sp, lp) coordinate of every point so the
            # points always pair up with their labels, even when they are
            # not an ordered, complete sp x lp grid.
            grid_fit = bg_fitter.fit(np.column_stack([sps, lps]), labels)
        except Exception as err:
            raise ValueError("bg_fitter failed to fit bg color grid!") from err
        grid_fit_predict = grid_fit.predict(np.c_[xx.ravel(), yy.ravel()])

        bg_colors = np.array([dark_colors[i] for i in grid_fit_predict]).reshape((xx.shape[0], xx.shape[1], 4))
        ax[0].imshow(bg_colors, interpolation=bg_interpolation, aspect='auto', alpha=0.4, zorder=-100)
        # imshow puts row 0 at the top; flip so lp increases upwards.
        ax[0].invert_yaxis()
    ax[0].scatter(sps, lps, c=[dark_colors[i] for i in labels])
    ax[0].set_xlabel(r"$\epsilon$ ($k_BT$)")
    ax[0].set_ylabel("$L_p/L$")
    ax[0].set_title("Clustering classification: kmeans")
    ax[1].scatter(tsne.embedding_[:,0], tsne.embedding_[:,1])
    ax[1].set_xlabel("Embedded space, x")
    ax[1].set_ylabel("Embedded space, y")
    ax[1].set_title("TSNE Projection")

    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.savefig(Path(save_to_dir, save_name + "_clustering_fit_kmeans.png"), dpi=600)
    plt.show()
    plt.close(fig)

In [None]:
# Hand-labelled phase diagram for the data set saved as 'pf0.4_dr30'.
sps = np.array([15, 15, 15, 25, 25, 25, 50, 50, 50, 100, 100, 100])
lps = np.array([20, 50, 100, 20, 50, 100, 20, 50, 100, 20, 50, 100])
# Labels are written 1-based and shifted to 0-based palette indices.
labels=np.array([1, 3, 3, 2, 3, 3, 2, 3, 3, 4, 5, 5])-1
# max_iter is an iteration count and must be an integer (1e8 is a float).
svc = SVC(kernel=50*RBF(35), gamma='auto', tol=1e-6,
          probability=True, max_iter=100000000)
plot_cluster_dummy(sps, lps, labels, '.', True, bg_fitter=svc, bg_interpolation='gaussian',
             color_order = ['g','b','c','o','r'], save_name='pf0.4_dr30')

In [None]:
# Hand-labelled phase diagram for the data set saved as 'pf0.2_dr30'.
sps = np.array([15, 15, 15, 20, 20, 20, 25, 25, 25, 50, 50, 50, 80, 80, 80, 100, 100, 100])
lps = np.array([20, 50, 100, 20, 50, 100, 20, 50, 100, 20, 50, 100, 20, 50, 100, 20, 50, 100])
# Labels are written 1-based and shifted to 0-based palette indices.
labels=np.array([1, 1, 1, 1, 1, 2, 2, 2, 3, 2, 3, 3, 4, 5, 5, 4, 5, 5])-1
# max_iter is an iteration count and must be an integer (1e8 is a float).
svc = SVC(kernel=30*RBF(20), gamma='auto', tol=1e-6,
          probability=True, max_iter=100000000)
plot_cluster_dummy(sps, lps, labels, '.', True, bg_fitter=svc, bg_interpolation='gaussian',
             color_order = ['g','b','c','o','r'], save_name='pf0.2_dr30')

In [None]:
# Hand-labelled phase diagram saved as 'pf0.20_dr30'; reuses the `svc`
# classifier defined in the previous cell.
sps = np.array([15, 15, 15, 25, 25, 25, 50, 50, 50, 100, 100, 100])
lps = np.array([20, 50, 100, 20, 50, 100, 20, 50, 100, 20, 50, 100])
# NOTE(review): unlike the two cells above, these labels are not shifted by
# -1, so palette entry 0 is never used - confirm this is intended.
labels=np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 2])
plot_cluster_dummy(sps, lps, labels, '.', True, bg_fitter=svc, bg_interpolation='gaussian',
             color_order = ['r','g','b','c','o'], save_name='pf0.20_dr30')