In [77]:
import os, sys, re, io, pathlib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
import cudf
# import cuml
from cuml.manifold import TSNE
from cuml.cluster import KMeans
from sklearn.preprocessing import normalize

# In-memory text buffer; none of the visible cells reads from or writes to it.
buffer = io.StringIO()

# Put the directory two levels above the working directory on the import path
# (the notebooks are presumably run from inside lab_utils -- TODO confirm).
labutilspath = str(pathlib.Path.cwd().parents[1])
sys.path.append(labutilspath)

# Project-local import; presumably only resolvable after the path append above.
from autoscan import autoscan

# Helper object whose attributes (ftir_cols, ftir_lambdas, meta_cols, ...)
# are used by the cells below to select columns.
pp = autoscan.basics(material_info = True)

def ix_before_and_after(ds, index = ['tag', 'subtag'], columns = 'experiment', values = 'ix', mask = None, subset = None, aggfunc = lambda x: [*x],
                        dropna_tresh = 2, chain = True):
    """Collect the row labels of samples that were characterised in several experiments.

    The frame is pivoted so that every (``index``) group becomes a row and every
    distinct ``columns`` value becomes a column; each cell holds the aggregated
    ``values`` (by default the list of original row labels). Rows with fewer than
    ``dropna_tresh`` non-missing cells are discarded.

    Parameters
    ----------
    ds : pandas.DataFrame
        Input table. It is NOT modified; the helper 'ix' column (a copy of the
        row labels) is added to an internal copy only.
    index, columns, values, aggfunc
        Forwarded to ``DataFrame.pivot_table``.
    mask : boolean indexer, optional
        Row selection applied with ``.loc`` before pivoting.
    subset : list, optional
        Columns of the pivoted table to keep before dropping sparse rows.
    dropna_tresh : int
        Minimum number of non-missing cells a pivoted row needs to be kept.
    chain : bool
        If True return a flat list of labels, otherwise the pivoted table.

    Returns
    -------
    list or pandas.DataFrame
        Flat list of the labels found in the surviving rows (row by row, in
        column order) when ``chain`` is True, else the pivoted table itself.
    """
    # assign() returns a new frame, so the caller's object never gains an 'ix' column.
    ds = ds.assign(ix = ds.index.values)
    if mask is not None:
        ds = ds.loc[mask, :]
    dx = ds.pivot_table(index = index,
                        columns = columns,
                        values = values,
                        aggfunc = aggfunc)
    if subset is not None:
        dx = dx.loc[:, subset]
    # Re-bind instead of dropping in place: dx may be a slice of the pivot table.
    dx = dx.dropna(thresh = dropna_tresh)

    if not chain:
        return dx

    # Missing cells are float NaN; every other cell is an iterable of labels.
    return [label
            for row in dx.to_numpy()
            for cell in row
            if not isinstance(cell, float)
            for label in cell]

def set_spe_style(ax, title = '', xlabel = '', ylabel=''):
    """Make ``ax`` the current axes and give it a bold title, axis labels and tick labels.

    Returns the same ``ax`` so the call can be chained.
    """
    bold = {'fontweight': 'bold'}
    plt.sca(ax)
    plt.title(title, **bold)
    plt.xlabel(xlabel, **bold)
    plt.ylabel(ylabel, **bold)
    # Calling xticks/yticks with only text properties restyles the existing tick labels.
    for restyle_ticks in (plt.xticks, plt.yticks):
        restyle_ticks(**bold)
    return ax

def kmeans_calc(data, nclusters = 4, kmeans_iter = 10000, kmeans_inplace = True, 
                plot = False, plot_save = True, plot_figsize = (12, 12), 
                plot_x = 'x', plot_y = 'cluster', plot_hue = 'c', plot_title = '', plot_savename = 'kmeans.png', plot_figpath = './'):
    """Run k-means on the feature columns of ``data`` and optionally plot the labels.

    The features are all columns from position 5 onwards, excluding a 'cluster'
    column left over from an earlier call, so repeated in-place calls always
    cluster the same columns.

    Parameters
    ----------
    data : DataFrame
        Table whose first five columns are skipped as non-feature columns.
    nclusters, kmeans_iter
        Number of clusters and maximum iterations passed to ``KMeans``.
    kmeans_inplace : bool
        If True write the labels to ``data['cluster']`` and return None;
        otherwise leave ``data`` untouched and return the labels.
    plot : bool
        If True draw a strip plot of ``plot_y`` against ``plot_x`` coloured by
        ``plot_hue`` from a labelled copy of ``data``.
    plot_save, plot_figsize, plot_title, plot_savename, plot_figpath
        Figure options; the figure is saved to ``plot_figpath/plot_savename``
        at 300 dpi when ``plot_save`` is True.

    Returns
    -------
    None when ``kmeans_inplace`` is True, else the estimator's ``labels_``.
    """
    features = data.iloc[:, 5:]
    # A previous in-place run appends 'cluster'; it must not become a feature.
    if 'cluster' in features.columns:
        features = features.drop(columns = 'cluster')

    kmeans = KMeans(n_clusters = nclusters, max_iter = kmeans_iter, init = 'scalable-k-means++', n_init = 10)
    kmeans.fit(features)
    labels = kmeans.labels_

    out = None
    if kmeans_inplace:
        data.loc[:, 'cluster'] = labels
    else:
        out = labels

    if plot:
        # Plot from a labelled copy so ``data`` stays untouched when kmeans_inplace is False.
        df = data.copy()
        df.loc[:, 'cluster'] = labels
        fig, ax = plt.subplots(figsize = plot_figsize)
        sns.stripplot(x = plot_x, y = plot_y, hue = plot_hue, dodge = True, data = df, ax = ax)
        plt.yticks(range(nclusters), range(nclusters))
        plt.title(plot_title)
        if plot_save:
            plt.savefig(os.path.join(plot_figpath, plot_savename), dpi = 300, bbox_inches = 'tight')

    return out

def tsne_calc(data, datacols = None, perplexity = 30, n_iter = 10000, angle = 0.8, learning_rate = 10, random_state = 5, leftcols = None, concat = True):    
    """Compute a 2-D t-SNE embedding of ``data`` and optionally join it to the input.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table.
    datacols : list, optional
        Columns to embed; all columns are used when None.
    perplexity, n_iter, angle, learning_rate, random_state
        Forwarded to ``TSNE``. ``n_neighbors`` is set to ``int(3 * perplexity)``.
    leftcols : list, optional
        Columns of ``data`` to keep next to the embedding; all when None.
    concat : bool
        If True return ``data`` (or ``leftcols``) with the embedding columns
        'u' and 'v' appended; otherwise return only the embedding.

    Returns
    -------
    pandas.DataFrame
        With ``concat`` True the result keeps the index of ``data``; otherwise
        the embedding has a fresh 0..n-1 index.
    """
    neighbors = int(3 * perplexity)
    if datacols is not None:
        v = data.loc[:, datacols].values
    else:
        v = data.values

    # learning_rate is forwarded; it used to be hard-coded to 10 here.
    tsne = TSNE(n_components = 2, n_iter = n_iter, angle = angle, n_neighbors = neighbors, perplexity = perplexity,
                random_state = random_state, learning_rate = learning_rate)
    X_hat = pd.DataFrame(tsne.fit_transform(v), columns = ['u', 'v'])

    if not concat:
        return X_hat

    if leftcols is not None:
        left = data.loc[:, leftcols].copy()
    else:
        left = data
    # Embedding rows are in the same order as data rows; share the index so the
    # column-wise concat pairs them up even when data is not indexed 0..n-1.
    X_hat.index = left.index
    return pd.concat([left, X_hat], axis = 1)

def ftir_kmeans_plot():
    """Placeholder for a before/after t-SNE scatter helper; currently does nothing.

    The commented-out draft below relies on names that are not defined inside
    this function (df_ftir, left_cols, p, figspath), so it cannot simply be
    uncommented; they would have to become parameters first.
    """
    pass
#     x_hat = pd.concat([df_ftir.loc[:, left_cols], ], axis = 1)
#     fig, ax = plt.subplots(ncols = 2, figsize = (24, 13), sharey = True)
#     plt.subplots_adjust(wspace = 0.05)
#     sns.scatterplot(x = 'u', y = 'v', hue = 'cluster', data = x_hat.query("instance == 'before'"), ax = ax[0], palette = 'viridis', legend = 'full')
#     sns.scatterplot(x = 'u', y = 'v', hue = 'cluster', data = x_hat.query("instance == 'after' "), ax = ax[1], palette = 'viridis', legend = 'full')
#     ax[0].set_title('before')
#     ax[1].set_title('after')
#     fig.suptitle('tSNE before & after for all samples, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
#     plt.savefig(os.path.join(figspath, 'all-rocks_tsne.png'), dpi = 300, bbox_inches = 'tight')

# Matplotlib rc overrides applied to every figure in this notebook:
# 12 pt bold base font, titles and axis labels; 12 pt tick and legend text;
# legends drawn with a frame.
rc_dict = {
    "font.size":12,
    'font.weight':'bold',
    "axes.titlesize":12,
    "axes.titleweight":'bold',
    "axes.labelsize":12,
    'axes.labelweight':'bold',
    'xtick.labelsize':12,
    'ytick.labelsize':12,
    'legend.frameon':True,
    'legend.fontsize':12,
    'legend.title_fontsize':12,
}

# NOTE(review): sns.set() below is called without a context argument, so it
# presumably re-applies seaborn's default context on top of the "paper"
# context requested here -- confirm which context is intended.
sns.set_context("paper", 
                rc = rc_dict)

sns.set(rc = rc_dict)

# Apply the 'darkgrid' style preset last.
sns.set_style('darkgrid')

In [13]:
# Root folders: input data and generated figures.
datapath = '/sandbox/data/autoscan/'
vispath = '/sandbox/vis/autoscan/'

# HDF5 file holding the measurements, and the figure folder for this notebook.
datafname = 'autoscan_corrected.h5'
datafile = os.path.join(datapath, datafname)
figspath = os.path.join(vispath, 'rock_multiphysics_display')

# Saved results go next to the input data.
savepath = datapath

In [14]:
# Create the output folders. exist_ok makes the cell safe to re-run and removes
# the gap between a separate existence check and the creation call.
for p in [figspath, savepath]:
    os.makedirs(p, exist_ok = True)

In [15]:
# Load the description ('desc') and measurement ('data') tables from the HDF5
# file and join the measurements onto the descriptions by index.
df = pd.read_hdf(datafile, key = 'desc').join(pd.read_hdf(datafile, key = 'data'))

# Per rock family, replace 'before' permeabilities above the family limit with
# the mean of the 'before' values that are not above it.
for k, v in {'sandstone':1e3, "shale":1e2, 'carbonate':1e3}.items():
    ix = df.loc[(df.family == k) & (df.instance == 'before'), 'perm'] > v
    fill_val = df.loc[ix.index[~ix], 'perm'].mean()
    df.loc[ix.index[ix], 'perm'] = fill_val

# Keep everything except the metal and gemstone families, ordered by instance
# (descending) with a fresh 0..n-1 index.
df = (
    df.query("family != 'metal' & family != 'gemstones'")
      .copy()
      .sort_values(by ='instance', ascending = False, ignore_index = True)
)

In [16]:
# Max-normalise each FTIR spectrum (row-wise) for the rows that have a value in
# every FTIR column, and write the scaled values back into df.
df_temp = df.dropna(subset = pp.ftir_cols)
ix = df_temp.index
df.loc[ix, pp.ftir_cols] = normalize(df_temp.loc[:, pp.ftir_cols], norm = 'max')
del df_temp, ix

In [17]:
# df_ftir_bna = df.loc[ix, pp.meta_cols + pp.grid_cols + pp.ftir_cols]

In [27]:
# FTIR working table: four metadata columns followed by the spectrum, with the
# spectral columns relabelled by the entries of pp.ftir_lambdas. Incomplete rows
# are dropped and the original df row label is kept in a leading 'index' column.
newcolumns = ['family', 'tag', 'instance', 'l_max_peak'] + list(pp.ftir_lambdas)
df_ftir = df.loc[:, newcolumns[:4] + pp.ftir_cols].dropna()
df_ftir.columns = newcolumns
df_ftir = df_ftir.reset_index(drop = False)
df_ftir.head()

Unnamed: 0,index,family,tag,instance,l_max_peak,398.647,400.702,402.757,404.812,406.867,408.922,410.977,413.031,415.086,417.141,419.196,421.251,423.306,425.361,427.416,429.471,431.525,433.58,435.635,437.69,439.745,441.8,443.855,445.91,447.964,450.019,452.074,454.129,456.184,458.239,460.294,462.349,464.404,466.458,468.513,470.568,472.623,474.678,476.733,478.788,480.843,482.897,484.952,487.007,489.062,491.117,493.172,495.227,497.282,499.337,501.391,503.446,505.501,507.556,509.611,511.666,513.721,515.776,517.83,519.885,521.94,523.995,526.05,528.105,530.16,532.215,534.27,536.324,538.379,540.434,...,3844.686,3846.741,3848.796,3850.85,3852.905,3854.96,3857.015,3859.07,3861.125,3863.18,3865.235,3867.289,3869.344,3871.399,3873.454,3875.509,3877.564,3879.619,3881.674,3883.729,3885.783,3887.838,3889.893,3891.948,3894.003,3896.058,3898.113,3900.168,3902.222,3904.277,3906.332,3908.387,3910.442,3912.497,3914.552,3916.607,3918.662,3920.716,3922.771,3924.826,3926.881,3928.936,3930.991,3933.046,3935.101,3937.155,3939.21,3941.265,3943.32,3945.375,3947.43,3949.485,3951.54,3953.595,3955.649,3957.704,3959.759,3961.814,3963.869,3965.924,3967.979,3970.034,3972.088,3974.143,3976.198,3978.253,3980.308,3982.363,3984.418,3986.473,3988.528,3990.582,3992.637,3994.692,3996.747
0,0,shale,sh_001,before,406.867,0.947639,0.949593,0.97616,0.995882,1.0,0.999916,0.991398,0.983236,0.976724,0.970515,0.975131,0.968315,0.948678,0.928772,0.904866,0.89006,0.891846,0.892634,0.882096,0.877506,0.878122,0.870638,0.859796,0.850939,0.841426,0.829414,0.818378,0.814845,0.81246,0.802754,0.794283,0.79009,0.784638,0.777766,0.774743,0.77682,0.778355,0.780397,0.785232,0.791547,0.799511,0.806093,0.807431,0.805974,0.806375,0.806463,0.802292,0.796255,0.793267,0.792479,0.790627,0.787719,0.78569,0.784163,0.781558,0.779218,0.777902,0.775755,0.771979,0.77128,0.775495,0.778742,0.777779,0.775482,0.774703,0.77421,0.774488,0.775574,0.774712,0.774215,...,0.892876,0.894548,0.896281,0.894847,0.891714,0.889026,0.889818,0.892594,0.893267,0.892885,0.89204,0.891732,0.892445,0.893527,0.894702,0.89442,0.893963,0.89497,0.895278,0.894156,0.893382,0.894037,0.894702,0.894108,0.89398,0.894508,0.893118,0.890478,0.889426,0.890143,0.890438,0.89149,0.894143,0.896528,0.897074,0.896273,0.89629,0.896906,0.896242,0.894913,0.894561,0.894957,0.894442,0.893835,0.895001,0.896871,0.897628,0.896805,0.895221,0.894504,0.894772,0.89585,0.897408,0.897725,0.896906,0.896788,0.897434,0.897813,0.898002,0.898367,0.898328,0.897949,0.897346,0.895789,0.894196,0.893958,0.894064,0.894147,0.894895,0.895371,0.895014,0.894429,0.89453,0.895415,0.895784
1,12943,shale,sh_001,before,400.702,0.993814,1.0,0.925277,0.90991,0.93592,0.944133,0.913653,0.878019,0.860032,0.857239,0.857428,0.883235,0.878102,0.851242,0.837233,0.826899,0.806236,0.791397,0.793352,0.79656,0.788226,0.77119,0.757842,0.752011,0.7429,0.730004,0.722618,0.721875,0.720844,0.71798,0.714508,0.708998,0.703106,0.697313,0.691765,0.690418,0.695502,0.703899,0.709043,0.708832,0.708764,0.715557,0.723015,0.723267,0.722716,0.721558,0.716904,0.712477,0.708809,0.705677,0.703401,0.700427,0.695517,0.689905,0.687327,0.687199,0.687459,0.689459,0.692218,0.692211,0.690135,0.689135,0.689297,0.689878,0.690561,0.68921,0.686678,0.686036,0.685082,0.68364,...,0.774145,0.774255,0.777142,0.77932,0.778776,0.776032,0.773523,0.774119,0.775429,0.774859,0.772504,0.772043,0.775198,0.777946,0.778908,0.778738,0.776806,0.774783,0.774255,0.775285,0.776923,0.777538,0.77741,0.777176,0.776901,0.777452,0.778112,0.777244,0.775078,0.773149,0.771651,0.770873,0.773043,0.776285,0.77721,0.777335,0.778308,0.778271,0.777074,0.776278,0.776229,0.776667,0.777742,0.779029,0.780116,0.781048,0.781116,0.780622,0.77958,0.778455,0.778576,0.779267,0.779746,0.780018,0.779746,0.77878,0.778293,0.778508,0.778014,0.777565,0.777716,0.777369,0.777784,0.779214,0.780003,0.779999,0.779044,0.777667,0.776738,0.776014,0.774957,0.7743,0.77487,0.775821,0.776848
2,34079,shale,sh_001,before,402.757,0.92339,0.962308,1.0,0.989697,0.968856,0.940704,0.912592,0.886583,0.873156,0.873635,0.87343,0.85921,0.84712,0.827586,0.815856,0.816498,0.815283,0.811211,0.804397,0.794542,0.782132,0.769164,0.762323,0.75774,0.748829,0.73913,0.732319,0.728169,0.727369,0.727902,0.722194,0.712154,0.707173,0.708859,0.709586,0.706156,0.704187,0.708855,0.714173,0.714537,0.713179,0.71089,0.707192,0.707339,0.711261,0.713775,0.712669,0.709068,0.704187,0.698112,0.695219,0.697129,0.697036,0.694391,0.693393,0.693556,0.69395,0.694426,0.693567,0.690744,0.686606,0.6834,0.683292,0.684618,0.684552,0.682731,0.680093,0.678987,0.680399,0.682943,...,0.779223,0.778933,0.77845,0.776354,0.772888,0.769891,0.77331,0.779622,0.77968,0.776617,0.774339,0.775859,0.778291,0.77821,0.77867,0.779591,0.780233,0.780538,0.779181,0.778562,0.779305,0.779974,0.780589,0.779436,0.777935,0.777947,0.777614,0.776427,0.775735,0.775835,0.776524,0.777313,0.777034,0.776907,0.778075,0.779239,0.779842,0.780573,0.780736,0.779734,0.77867,0.778195,0.778059,0.77804,0.778798,0.779858,0.779629,0.778442,0.777015,0.77734,0.77968,0.780496,0.779359,0.778763,0.779606,0.780395,0.779715,0.778469,0.778516,0.779181,0.779482,0.77997,0.780314,0.779695,0.778628,0.778353,0.778264,0.777607,0.77698,0.776775,0.776605,0.776477,0.777804,0.779769,0.779962
3,34080,shale,sh_001,before,408.922,0.862991,0.886071,0.902102,0.922998,0.980134,1.0,0.967835,0.943698,0.924033,0.906519,0.88366,0.871329,0.865682,0.865807,0.877268,0.882515,0.863927,0.843377,0.835719,0.835908,0.836266,0.828896,0.81561,0.808656,0.809006,0.807818,0.806532,0.803952,0.79135,0.77642,0.769703,0.763595,0.756688,0.755316,0.758037,0.759217,0.757758,0.758474,0.760574,0.760932,0.761624,0.763182,0.764928,0.769368,0.772676,0.770792,0.768853,0.766305,0.760204,0.754073,0.748968,0.746368,0.74739,0.74953,0.750262,0.746226,0.738911,0.736142,0.741322,0.746494,0.743331,0.735513,0.731733,0.734691,0.738879,0.740732,0.739115,0.735745,0.736047,0.73941,...,0.8599,0.860049,0.860104,0.860726,0.860639,0.85811,0.855896,0.858398,0.859711,0.859298,0.858944,0.858689,0.857536,0.855589,0.855247,0.857379,0.859703,0.860974,0.861426,0.86084,0.859247,0.858571,0.858822,0.859011,0.860053,0.860883,0.860108,0.85944,0.858673,0.857552,0.858012,0.858964,0.8592,0.860081,0.860289,0.859577,0.860238,0.862138,0.863302,0.862315,0.860435,0.860148,0.86086,0.860706,0.861052,0.86344,0.865532,0.866366,0.866134,0.864301,0.863286,0.864089,0.864572,0.864344,0.863963,0.863699,0.862319,0.860089,0.860069,0.862055,0.863117,0.862618,0.86213,0.861194,0.859459,0.858201,0.857996,0.859711,0.861831,0.862881,0.863511,0.862842,0.861552,0.861709,0.8625
4,34081,shale,sh_001,before,413.031,0.875805,0.888646,0.914583,0.930502,0.943931,0.967054,0.99126,1.0,0.980966,0.952613,0.918078,0.895971,0.881607,0.862175,0.859739,0.868,0.850543,0.827389,0.820891,0.8112,0.796213,0.789936,0.795347,0.8043,0.805943,0.795695,0.778076,0.762154,0.752084,0.746012,0.742784,0.739711,0.735076,0.732417,0.733263,0.73387,0.734419,0.736591,0.739819,0.743689,0.74687,0.747048,0.746097,0.748818,0.753155,0.754095,0.75036,0.743515,0.738389,0.736665,0.73392,0.73203,0.73559,0.739243,0.73915,0.73859,0.738404,0.735625,0.732308,0.730782,0.728358,0.725868,0.725204,0.724141,0.72169,0.721265,0.724566,0.72702,0.727913,0.729235,...,0.851541,0.852051,0.855588,0.857076,0.854482,0.849051,0.845248,0.847749,0.850667,0.850872,0.849898,0.850219,0.851463,0.851417,0.849666,0.848958,0.85105,0.853064,0.851494,0.84706,0.846454,0.84977,0.851115,0.851707,0.853543,0.854761,0.853296,0.851231,0.851668,0.852592,0.851537,0.851158,0.852812,0.853616,0.852294,0.851285,0.852113,0.85272,0.851834,0.852376,0.85502,0.856048,0.854498,0.85437,0.855043,0.852662,0.849944,0.849318,0.849991,0.851773,0.853435,0.852511,0.851003,0.85185,0.852159,0.850791,0.851181,0.853141,0.854111,0.855108,0.85502,0.853021,0.853315,0.854753,0.854107,0.853122,0.852198,0.85226,0.854146,0.855205,0.854386,0.852913,0.851861,0.852499,0.85432


In [None]:
# Report the data set size. The first five columns (index, family, tag,
# instance, l_max_peak) are metadata and a 'cluster' column may have been added
# by a later cell, so neither is counted as a feature.
n_features = df_ftir.drop(columns = 'cluster', errors = 'ignore').shape[1] - 5
print("number of samples: %d, number of features: %d" %(df_ftir.shape[0], n_features))

In [None]:
nclusters = 10
# Cluster on the spectral columns only: skip the five leading metadata columns
# and drop a 'cluster' column left by a previous run of this cell, so that
# re-running it does not feed the old labels back in as a feature.
ftir_features = df_ftir.drop(columns = 'cluster', errors = 'ignore').iloc[:, 5:]
kmeans = KMeans(n_clusters = nclusters, max_iter = 10000, init = 'scalable-k-means++', n_init = 10)
kmeans.fit(ftir_features)
df_ftir.loc[:, 'cluster'] = kmeans.labels_

In [None]:
# Cluster membership per rock family, split by instance.
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(x = 'family', y = 'cluster', hue = 'instance', dodge = True, data = df_ftir, ax = ax)
# One tick per cluster id, as in the other strip plots of this notebook.
plt.yticks(range(nclusters), range(nclusters))
plt.title('Distribution of clusters per rock family before and after')
plt.savefig(os.path.join(figspath, 'all-rocks_kmeans_stripplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# fig, ax = plt.subplots(figsize = (24, 12))
# sns.kdeplot(x = 'l_max_peak', y = 'cluster', hue = 'family', shade = True, alpha = 0.5, data = df_ftir.query("instance == 'before'"), ax = ax, 
#             palette = 'deep')

In [None]:
# fig, ax = plt.subplots(figsize = (24, 12))
# sns.kdeplot(x = 'l_max_peak', y = 'cluster', hue = 'family', shade = True, alpha = 0.5, data = df_ftir.query("instance == 'after'"), ax = ax, 
#             palette = 'deep')

In [None]:
# fig, ax = plt.subplots(figsize = (12, 12))
# sns.violinplot(x = 'family', y = 'cluster', hue = 'instance', data = df_ftir, split = True, orient = 'v', inner = 'quartile', scale = 'area', ax = ax)
# plt.yticks(range(6), range(6))
# plt.title('Cluter distribution pero rock family before and after ')
# plt.savefig(os.path.join(figspath, 'all-rocks_kmeans_violinplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Non-spectral columns carried next to the t-SNE embedding: the five leading
# columns of df_ftir plus the k-means label.
left_cols = [*df_ftir.columns[:5], 'cluster']

In [None]:
# t-SNE of all samples; the neighbour count is three times the perplexity.
p = 50
perplexity = p
neighbors = int(3 * p)
tsne = TSNE(
    n_components = 2,
    perplexity = p,
    n_neighbors = neighbors,
    n_iter = 10000,
    angle = 0.8,
    learning_rate = 10,
    random_state = 5,
)
# Embed the spectral columns only (skip 5 leading columns and the trailing 'cluster').
X_hat = tsne.fit_transform(df_ftir.iloc[:, 5:-1].values)

In [None]:
# Join the metadata/cluster columns with the embedding; both sides carry a 0..n-1 index.
x_hat = pd.concat([df_ftir.loc[:, left_cols], pd.DataFrame(X_hat, columns = ['u', 'v'])], axis = 1)

fig, ax = plt.subplots(ncols = 2, figsize = (24, 13), sharey = True)
fig.subplots_adjust(wspace = 0.05)

# One panel per instance, coloured by k-means cluster.
for panel_ax, (panel_query, panel_title) in zip(ax, [("instance == 'before'", 'before'),
                                                     ("instance == 'after' ", 'after')]):
    sns.scatterplot(x = 'u', y = 'v', hue = 'cluster', data = x_hat.query(panel_query), ax = panel_ax, palette = 'viridis', legend = 'full')
    panel_ax.set_title(panel_title)

fig.suptitle('tSNE before & after for all samples, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
fig.savefig(os.path.join(figspath, 'all-rocks_tsne.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Rows of df with a value in every FTIR column.
mask = df.loc[:, pp.ftir_cols].notna().all(axis = 1)
# Row labels of the samples kept by ix_before_and_after for the listed experiments.
ix = ix_before_and_after(df.loc[:, pp.meta_cols], mask = mask, subset = ['before', 'heat_treatment', 'perf'])
# Pick those rows from df_ftir via its 'index' column, then restore that column.
df_ftir_bna = df_ftir.set_index('index').loc[ix, :].reset_index(drop = False)

In [None]:
# data = df_ftir_bna.loc[:, ['instance'] + list(pp.ftir_lambdas)]
# data_before = data.copy()
# data_before.loc[data.instance == 'after', 'instance':] = np.nan
# fig, ax = plt.subplots(figsize = (12, 12))
# sns.heatmap(data_before.iloc[:, 1:], cmap = 'viridis', vmin = 0, vmax = 1)
# plt.title('FTIR data')

In [None]:
# data_after = data.copy()
# data_after.loc[data.instance == 'before', 'instance':] = np.nan
# fig, ax = plt.subplots(figsize = (12, 12))
# sns.heatmap(data_after.iloc[:, 1:], cmap = 'viridis', vmin = 0, vmax = 1)
# plt.title('FTIR data')

In [None]:
nclusters = 10
kmeans_bna = KMeans(
    n_clusters = nclusters,
    init = 'scalable-k-means++',
    n_init = 10,
    max_iter = 10000,
)
# Fit on the spectral columns only; the trailing 'cluster' column is then overwritten.
kmeans_bna.fit(df_ftir_bna.iloc[:, 5:-1])
df_ftir_bna.loc[:, 'cluster'] = kmeans_bna.labels_

In [None]:
# Cluster membership per rock family for the before-and-after subset.
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(data = df_ftir_bna, x = 'family', y = 'cluster', hue = 'instance', dodge = True, ax = ax)
# One tick per cluster id.
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
ax.set_title('Distribution of clusters per rock family before and after')
fig.savefig(os.path.join(figspath, 'all-rocks-bna_kmeans_stripplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Split violins of the cluster labels per rock family (one half per instance).
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(x = 'family', y = 'cluster', hue = 'instance', data = df_ftir_bna, split = True, orient = 'v', inner = 'quartile', scale = 'area', ax = ax)
plt.yticks(range(nclusters), range(nclusters))
# Title spelling corrected (was "Cluter distribution pero rock family").
plt.title('Cluster distribution per rock family before and after')
plt.savefig(os.path.join(figspath, 'all-rocks-bna_kmeans_violinplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Exact t-SNE of the before-and-after subset; here the neighbour count is five
# times the perplexity.
p = 30
perplexity = p
neighbors = int(5 * p)
tsne = TSNE(
    n_components = 2,
    method = 'exact',
    perplexity = p,
    n_neighbors = neighbors,
    n_iter = 10000,
    angle = 0.8,
    learning_rate = 10,
    random_state = 5,
)

# Embed the spectral columns only (skip 5 leading columns and the trailing 'cluster').
X_hat = tsne.fit_transform(df_ftir_bna.iloc[:, 5:-1].values)

In [None]:
# Join the metadata/cluster columns with the embedding; both sides carry a 0..n-1 index.
x_hat = pd.concat([df_ftir_bna.loc[:, left_cols], pd.DataFrame(X_hat, columns = ['u', 'v'])], axis = 1)

fig, ax = plt.subplots(ncols = 2, figsize = (24, 12.1), sharey = True)
fig.subplots_adjust(wspace = 0.05)

# One panel per instance, coloured by k-means cluster.
for panel_ax, (panel_query, panel_title) in zip(ax, [("instance == 'before'", 'before'),
                                                     ("instance == 'after' ", 'after')]):
    sns.scatterplot(x = 'u', y = 'v', hue = 'cluster', data = x_hat.query(panel_query), ax = panel_ax, palette = 'viridis', legend = 'full')
    panel_ax.set_title(panel_title)

fig.suptitle('tSNE before & after for all samples, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
fig.savefig(os.path.join(figspath, 'all-rocks-bna_tsne.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Spectra of the single sample tagged 'sh_001', re-indexed 0..n-1.
df_temp = df_ftir.query("tag == 'sh_001'").reset_index(drop = True)

In [None]:
nclusters = 4
kmeans_sh = KMeans(
    n_clusters = nclusters,
    init = 'scalable-k-means++',
    n_init = 10,
    max_iter = 10000,
)
# Fit on the spectral columns only; the trailing 'cluster' column is then overwritten.
kmeans_sh.fit(df_temp.iloc[:, 5:-1])
df_temp.loc[:, 'cluster'] = kmeans_sh.labels_

In [None]:
# Cluster labels of the sh_001 spectra per instance, coloured by cluster.
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(x = 'instance', y = 'cluster', hue = 'cluster', dodge = False, data = df_temp, ax = ax)
plt.yticks(range(nclusters), range(nclusters))
# Title spelling corrected (was "Cluter").
plt.title('Cluster distribution before and after for shale sample sh_1')
plt.savefig(os.path.join(figspath, 'sh-1_kmeans_stripplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Split violin of the sh_001 cluster labels (one half per instance).
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(x = 'family', y = 'cluster', hue = 'instance', data = df_temp, split = True, orient = 'v', inner = 'quartile', scale = 'count', ax = ax)
# Only one family is present, so the x axis carries no information: blank it.
plt.xlabel('')
plt.xticks([0], [''])
# Follow nclusters instead of a hard-coded 0..3 so the ticks stay correct if
# the cluster count of this section is changed.
plt.yticks(range(nclusters), range(nclusters))
# Title spelling corrected (was "Cluter").
plt.title('Cluster distribution before and after for shale sample sh_1')
plt.savefig(os.path.join(figspath, 'sh-1_kmeans_violinplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Exact t-SNE of the sh_001 spectra; the neighbour count is three times the perplexity.
p = 50
perplexity = p
neighbors = int(3 * p)
print('neighbors = %d' % (neighbors))
tsne = TSNE(
    n_components = 2,
    method = 'exact',
    perplexity = p,
    n_neighbors = neighbors,
    n_iter = 10000,
    angle = 0.8,
    learning_rate = 10,
    random_state = 5,
)

# Embed the spectral columns only (skip 5 leading columns and the trailing 'cluster').
X_hat = tsne.fit_transform(df_temp.iloc[:, 5:-1].values)

In [None]:
# Join the metadata/cluster columns with the embedding; both sides carry a 0..n-1 index.
x_hat = pd.concat([df_temp.loc[:, left_cols], pd.DataFrame(X_hat, columns = ['u', 'v'])], axis = 1)

fig, ax = plt.subplots(ncols = 2, figsize = (24, 13), sharey = True)
fig.subplots_adjust(wspace = 0.05)

# One panel per instance, coloured by k-means cluster.
for panel_ax, (panel_query, panel_title) in zip(ax, [("instance == 'before'", 'before'),
                                                     ("instance == 'after' ", 'after')]):
    sns.scatterplot(x = 'u', y = 'v', hue = 'cluster', data = x_hat.query(panel_query), ax = panel_ax, palette = 'viridis', legend = 'full')
    panel_ax.set_title(panel_title)

fig.suptitle('tSNE before & after for shale sample sh_1, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
fig.savefig(os.path.join(figspath, 'sh-1_tsne.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Spectra of every sample in the shale family, re-indexed 0..n-1.
df_temp = df_ftir.query("family == 'shale'").reset_index(drop = True)

In [None]:
nclusters = 10
kmeans_sh = KMeans(
    n_clusters = nclusters,
    init = 'scalable-k-means++',
    n_init = 10,
    max_iter = 10000,
)
# Fit on the spectral columns only; the trailing 'cluster' column is then overwritten.
kmeans_sh.fit(df_temp.iloc[:, 5:-1])
df_temp.loc[:, 'cluster'] = kmeans_sh.labels_

In [None]:
# Cluster labels of the shale spectra per instance, coloured by cluster.
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(data = df_temp, x = 'instance', y = 'cluster', hue = 'cluster', dodge = False, ax = ax)
# One tick per cluster id.
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
ax.set_title('Cluster distribution before and after for shale samples')
fig.savefig(os.path.join(figspath, 'shale_kmeans_stripplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Split violin of the shale cluster labels (one half per instance).
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(data = df_temp, x = 'family', y = 'cluster', hue = 'instance', split = True, orient = 'v', inner = 'quartile', scale = 'count', ax = ax)
# Only one family is present, so the x axis is blanked.
ax.set_xlabel('')
ax.set_xticks([0])
ax.set_xticklabels([''])
# One tick per cluster id.
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
ax.set_title('Cluster distribution before and after for shale samples')
fig.savefig(os.path.join(figspath, 'shale_kmeans_violinplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# Exact t-SNE of the shale spectra; the neighbour count is three times the perplexity.
p = 50
perplexity = p
neighbors = int(3 * p)
tsne = TSNE(
    n_components = 2,
    method = 'exact',
    perplexity = p,
    n_neighbors = neighbors,
    n_iter = 10000,
    angle = 0.8,
    learning_rate = 10,
    random_state = 5,
)
# Embed the spectral columns only (skip 5 leading columns and the trailing 'cluster').
X_hat = tsne.fit_transform(df_temp.iloc[:, 5:-1].values)

In [None]:
# Join the metadata/cluster columns with the embedding; both sides carry a 0..n-1 index.
x_hat = pd.concat([df_temp.loc[:, left_cols], pd.DataFrame(X_hat, columns = ['u', 'v'])], axis = 1)

fig, ax = plt.subplots(ncols = 2, figsize = (24, 13), sharey = True)
fig.subplots_adjust(wspace = 0.05)

# One panel per instance, coloured by k-means cluster.
for panel_ax, (panel_query, panel_title) in zip(ax, [("instance == 'before'", 'before'),
                                                     ("instance == 'after' ", 'after')]):
    sns.scatterplot(x = 'u', y = 'v', hue = 'cluster', data = x_hat.query(panel_query), ax = panel_ax, palette = 'viridis', legend = 'full')
    panel_ax.set_title(panel_title)

fig.suptitle('tSNE before & after for shale samples, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
fig.savefig(os.path.join(figspath, 'shale_tsne.png'), dpi = 300, bbox_inches = 'tight')