# As expected, without scaling the data it is very hard to cluster, since the absolute size of a feature is not indicative of its effectiveness

# Try DBSCAN on PCAd and non-PCAd data

In [1]:
import os
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import decomposition, metrics
from sklearn.preprocessing import scale, robust_scale
from sklearn.cluster import DBSCAN
#from sklearn.manifold import TSNE
%matplotlib inline

In [2]:
# Load the labelled single-well table from disk.
# The first CSV column becomes the row index of the resulting DataFrame.
# `filename` is reused later as the prefix for the saved result figures.
filename = '/Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/merged_csvs/LABELLED_by_cell_type_screen_525_cell_plate_1_well_c03.csv'
my_data = pd.read_csv(
    filepath_or_buffer=filename,
    index_col=0,
)

In [3]:
# Separate the measured feature columns from the metadata columns.
my_data_headers = my_data.columns.tolist()
meta_headers = ["Width", "cell_label", "cell_plate", "lineage", "screen", "well", "Time"]
my_data_data_headers = [header for header in my_data_headers if header not in meta_headers]

# Robust-scale the feature columns only.
my_scaled_data = robust_scale(my_data[my_data_data_headers])

# Encode each cell_label as a numeric string for plotting.
# Labels that are not one of the three keys below are skipped (nothing is appended),
# so the resulting list can be shorter than my_data if other labels occur.
label_to_code = {
    "unlabelled": "0",
    "blast": "0.5",
    "healthy": "1",
}
word_as_num = [
    label_to_code[word]
    for word in my_data['cell_label']
    if word in label_to_code
]

# Check results without doing PCA or scaling

In [4]:
# Despite the variable name, this is the RAW feature table: no scaling and no PCA
# are applied here, so the clustering below runs on the unscaled measurements.
my_scaled_data_transformed_pd = my_data.loc[:, my_data_data_headers]

# Automate the above and play with the hyper-parameters

In [14]:
# Hyper-parameter grid for the DBSCAN sweep.

# Neighbourhood radii to try.
eps_param = [
    0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 1,
    1.4, 1.8, 3, 6, 12,
]

# Nearest-neighbour search strategies to try.
algorithm_ = [
    "auto",
    "ball_tree",
    "kd_tree",
    "brute",
]

# Distance metrics to try.
metric_ = [
    "euclidean", "cityblock", "l1", "l2", "manhattan",
    "chebyshev", "braycurtis", "canberra",
    "dice", "hamming", "jaccard", "kulsinski",
    "matching", "rogerstanimoto", "russellrao",
    "sokalmichener", "sokalsneath",
]
# Metrics not run with "auto": "cosine", "correlation", "mahalanobis",
# "minkowski", "seuclidean", "sqeuclidean", "yule"
# "braycurtis", "canberra" only run with auto and ball_tree

In [23]:
# Replace the eps grid with much larger radii for the unscaled data:
# 10,000 up to (but excluding) 10,000,000 in steps of 500,000, as floats.
eps_param = [float(radius) for radius in range(10000, 10000000, 500000)]
eps_param

[10000.0,
 510000.0,
 1010000.0,
 1510000.0,
 2010000.0,
 2510000.0,
 3010000.0,
 3510000.0,
 4010000.0,
 4510000.0,
 5010000.0,
 5510000.0,
 6010000.0,
 6510000.0,
 7010000.0,
 7510000.0,
 8010000.0,
 8510000.0,
 9010000.0,
 9510000.0]

In [24]:
# Metrics that the sweep below only attempts with algorithm "auto";
# every other algorithm is skipped for these.
metric_auto_only = [
    "braycurtis", "canberra",
    "dice", "hamming", "jaccard", "kulsinski",
    "matching", "rogerstanimoto", "russellrao",
    "sokalmichener", "sokalsneath", "yule",
]

In [25]:
# Grid-search DBSCAN over distance metric x neighbour-search algorithm x eps on the
# raw (unscaled, non-PCA) feature table. For every configuration: fit DBSCAN, count
# how many unlabelled / blast / healthy cells fall in each cluster, print the counts,
# and save a bar chart whose path is the input CSV path plus a descriptive suffix.

# Loop-invariant: the per-row cell labels as a plain NumPy array, so the cluster masks
# below select rows strictly by POSITION. Indexing the pandas Series with positional
# indices through [] is resolved by index LABEL, which only lines up when the CSV
# index (read with index_col=0) happens to be exactly 0..n-1.
cell_labels = np.asarray(my_data['cell_label'])

for met in metric_:
    for algo in algorithm_:
        # Metrics listed in metric_auto_only are only attempted with algorithm "auto".
        if met in metric_auto_only and algo != "auto":
            continue
        for eps_ in eps_param:
            # Shared description of this configuration for the progress messages.
            run_desc = ("EPS of " + str(eps_) + " ALGORITHM of " + algo
                        + " distance METRIC of " + met)

            # Run DBSCAN on the raw feature table (no scaling, no PCA).
            print("Starting DBSCAN with " + run_desc)
            dbsc = DBSCAN(eps=eps_, min_samples=10, algorithm=algo, metric=met).fit(my_scaled_data_transformed_pd)
            labels = dbsc.labels_
            # Boolean mask marking the core samples found by DBSCAN.
            core_samples = np.zeros_like(labels, dtype=bool)
            core_samples[dbsc.core_sample_indices_] = True
            unique_labels = np.unique(labels)

            # Done with DBSCAN, summarise it.
            # DB_PCA is a list of rows: [cluster ID, number of "unlabelled" cells,
            # number of "blast" cells, number of "healthy" cells] for that cluster.
            DB_PCA = []
            for cluster_id in unique_labels:
                # Positional boolean mask: the cell labels of the rows in this cluster.
                clst_index = cell_labels[labels == cluster_id]
                # Safety net kept from the original; with positional masks every
                # cluster ID taken from `labels` selects at least one row.
                if clst_index.size == 0:
                    print("Cluster index and original data don't line up right. STOP AND FIX")
                    break
                # Cast to plain ints so the printed summary does not depend on how
                # NumPy renders its scalar types.
                DB_PCA.append([int(cluster_id),
                               int(np.sum(clst_index == 'unlabelled')),
                               int(np.sum(clst_index == 'blast')),
                               int(np.sum(clst_index == 'healthy'))])
            print(DB_PCA)

            # Convert the list of rows to a DataFrame and reshape it to long form
            # (one row per cluster / cell-type pair) for the grouped bar plot.
            DB_PCA_df = pd.DataFrame(DB_PCA, columns=["Cluster Label", "Unlabelled", "Blast", "Healthy"])
            DB_PCA_df_melted = pd.melt(DB_PCA_df, value_vars=["Unlabelled", "Blast", "Healthy"], id_vars="Cluster Label")

            # Plot the results on an explicit figure so it can be closed deterministically;
            # this loop creates one figure per configuration.
            fig, ax = plt.subplots()
            # `log` is forwarded to the underlying bar call, where it is a boolean flag.
            sns.barplot(hue="variable", y="value", x="Cluster Label", data=DB_PCA_df_melted, log=True, ax=ax)
            ax.set_title("DBSCAN results with eps of " + str(eps_))
            # NOTE(review): a lower limit of 0 cannot be represented on a log axis, so
            # this call may have no effect - confirm whether a positive floor was intended.
            ax.set_ylim(0, ax.get_ylim()[1])
            fig.savefig(filename + "__NO-PCA_raw-unscaled-data_DBSCAN_eps-" + str(eps_) + "_algo-" + algo + "_dist-metric-" + met + "_results.png")
            plt.close(fig)
            print("Done with iteration with " + run_desc)


Starting DBSCAN with EPS of 10000.0 ALGORITHM of auto distance METRIC of euclidean
[[-1, 8399, 189, 493]]
Done with iteration with EPS of 10000.0 ALGORITHM of auto distance METRIC of euclidean
Starting DBSCAN with EPS of 510000.0 ALGORITHM of auto distance METRIC of euclidean
[[-1, 172, 1, 0], [0, 8117, 188, 489], [1, 104, 0, 0], [2, 6, 0, 4]]
Done with iteration with EPS of 510000.0 ALGORITHM of auto distance METRIC of euclidean
Starting DBSCAN with EPS of 1010000.0 ALGORITHM of auto distance METRIC of euclidean
[[-1, 59, 1, 0], [0, 8340, 188, 493]]
Done with iteration with EPS of 1010000.0 ALGORITHM of auto distance METRIC of euclidean
Starting DBSCAN with EPS of 1510000.0 ALGORITHM of auto distance METRIC of euclidean
[[-1, 35, 1, 0], [0, 8364, 188, 493]]
Done with iteration with EPS of 1510000.0 ALGORITHM of auto distance METRIC of euclidean
Starting DBSCAN with EPS of 2010000.0 ALGORITHM of auto distance METRIC of euclidean


KeyboardInterrupt: 