# For a set of csv files, read them in one at a time, perform t-SNE followed by DBSCAN, save figures and results 

In [2]:
import os
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import decomposition, metrics
from sklearn.preprocessing import scale, robust_scale
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

# Note: file names are hardcoded. Two wells are merged and the merged data is then subsampled. These wells only contain cells labelled "blast" (plus "unlabelled"); there are no "normal" (i.e. "healthy") cells.

In [7]:
#directory = '/Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/merged_csvs/'
directory = '/Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/screen_357_cell_plate_1_labeled_merged_csvs/'
df = []
do_boxplots = 0; # binary, 1 indicates that boxplots should be made
#for filename in glob.glob(directory + "LABELLED*.csv"): # reads through all files in this directory looking for *.csv and ignores sub-directories
for filename in glob.glob(directory + "LABELLED*n09*.csv"): # reads through all files in this directory looking for *.csv and ignores sub-directories
    print("Read in" + filename)
    # Set some variables equal to "None" so that I don't have surprises later
    my_data=None; my_data_headers=None; meta_headers=None; my_data_data_headers=None
    my_scaled_data = None; word_as_num = None; tsne_out_mink = None; labels = None
    # Read in two wells
    my_other_data=pd.read_csv(glob.glob(directory + "LABELLED*l07.csv")[0], index_col=0)
    my_data=pd.read_csv(filename, index_col=0)
    # Concatenate the two dataframes/wells together
    my_data = pd.concat([my_data, my_other_data], ignore_index = True)
    # Subsample the data
    samp_size = 12000
    print("SUBSAMPLING THIS MANY POINTS OF THE DATA: " + str(samp_size))
    rows = np.random.choice(my_data.index.values, replace=False, size = samp_size)
    my_data = my_data.ix[rows]
    filename = filename + "__" + str(samp_size) + "_SUBSAMPLE_iteration3__"
    #
    filename = filename + "__n09_cat_l07__"
    my_data_headers = list(my_data)
    meta_headers = ["Width","cell_label","cell_plate","lineage","screen","well","Time"]
    my_data_data_headers = [x for x in my_data_headers if not x in meta_headers]
    # Scale the data columns
    my_scaled_data = scale(my_data[my_data_data_headers])
    # Make a list where cell_label is converted to numbers for plotting
    word_as_num=[]
    for word in my_data['cell_label']:
        if word == "unlabelled":
            word_as_num.append("0")
        if word == "blast":
            word_as_num.append("0.5")
    lr = 3000 # Set the learning rate for t-SNE
    # Perform t-SNE
    print("Starting t-SNE calculation")
    tsne_out_mink = TSNE(metric='minkowski', learning_rate=lr, n_iter=lr, random_state=11).fit_transform(my_scaled_data)
    print("Done with t-SNE calculation")
    # Plot and save the output
    plt.figure(figsize=(20, 10))
    plt.gcf().clear()
    plt.subplot(121)
    plt.scatter(tsne_out_mink[:, 0], tsne_out_mink[:, 1], c= word_as_num, cmap=plt.cm.viridis)
    plt.savefig(filename + "__tsne.png")
    plt.close()
    # Do DBSCAN on the tSNEd data
    print("Starting DBSCAN")
    dbsc = DBSCAN(eps = .4,min_samples=10).fit(tsne_out_mink)
    labels = dbsc.labels_
    core_samples = np.zeros_like(labels, dtype = bool)
    core_samples[dbsc.core_sample_indices_] = True
    unique_labels = np.unique(labels)
    print("Plotting DBSCAN")
    # Plot the DBSCAN results
    plt.figure(figsize=(20, 10))
    plt.gcf().clear()
    plt.subplot(121)
    col = np.linspace(0,1,len(unique_labels))
    for i in xrange(len(unique_labels)):
        plt.plot(tsne_out_mink[np.where(labels==unique_labels[i])[0], 0], 
             tsne_out_mink[np.where(labels==unique_labels[i])[0], 1],
                #'o', mfc=plt.cm.viridis(col[i]), label=unique_labels[i])
                'o', mfc=plt.cm.nipy_spectral(col[i]), label=unique_labels[i])
                #'o', mfc=plt.cm.prism(col[i]), label=unique_labels[i])
                #'o', mfc=plt.cm.flag(col[i]), label=unique_labels[i])
    plt.legend(loc='best', frameon=False)
    plt.savefig(filename + "__DBSCAN_of_tsne.png")
    plt.close()
    # Summarize the combination of tSNE and DBSCAN results
    tsne_db = [] # This will be a list of lists where the 1st entry is the cluster ID, 2nd: is
    ## how many "unlabelled" are in that cluster, 3rd: number of blast, 4th: number of healthy.
    for i in xrange(len(unique_labels)):
        clst_index = my_data['cell_label'][np.where(labels==unique_labels[i])[0]]
        if clst_index.empty == True:
            print("Cluster index and original data don't line up right. STOP AND FIX")
            break
        tsne_db.append([unique_labels[i],sum(clst_index=='unlabelled'),
                         sum(clst_index=='blast')])
    # Prepare to plot the results as grouped barplots
    print("Plotting grouped barplots")
    tsne_db_df = pd.DataFrame(tsne_db,columns=["Cluster Label","Unlabelled","Blast"])
    tsne_db_df_melted = pd.melt(tsne_db_df,value_vars=["Unlabelled","Blast"],id_vars="Cluster Label")
    plt.gcf().clear()
    ax = sns.barplot(hue="variable",y="value",x="Cluster Label",data=tsne_db_df_melted,log='y')
    plt.savefig(filename + '__barplot.png')
    plt.close()
    if do_boxplots == 1:
        # For each cluster plot boxplots of each feature
        print("Starting boxplots plotting")
        for i in xrange(len(unique_labels)):
            clst_index = my_data.ix[np.where(labels==unique_labels[i])[0],my_data_data_headers]
            clst_index_scaled = my_scaled_data[np.where(labels==unique_labels[i])[0],]
            # Plot the unscaled data
            locations = range(1,(len(my_data_data_headers)+1))
            plt.figure()
            plt.gcf().clear()
            plt.boxplot(clst_index.as_matrix(),positions=locations)
            plt.title("Cluster number " + str(unique_labels[i]))
            plt.ylabel('A.U.')
            plt.xticks(locations, my_data_data_headers,rotation='vertical')
            #plt.yscale('log')
            percs = (clst_index.describe(percentiles=[0.1,0.5,0.9]))
            plt.ylim(percs.iloc[4].min(),percs.iloc[6].max())
            #plt.show()
            plt.savefig(filename + "__boxplots_of_cluster" +str(unique_labels[i])+".png")
            plt.close()
            locations = range(1,(len(my_data_data_headers)+1))
            # Plot the scaled data
            plt.figure()
            plt.gcf().clear()
            plt.boxplot(clst_index_scaled,positions=locations)
            plt.title("Cluster number " + str(unique_labels[i]))
            plt.ylabel('A.U.')
            plt.xticks(locations, my_data_data_headers,rotation='vertical')
            #plt.yscale('log')
            percs_scaled = pd.DataFrame(clst_index_scaled).describe(percentiles=[0.1,0.5,0.9])
            plt.ylim(percs_scaled.iloc[4].min(),percs_scaled.iloc[6].max())
            plt.savefig(filename + "__scaled_boxplots_of_cluster" +str(unique_labels[i])+".png")
            plt.close()
    print("Done with " + filename)

Read in/Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/screen_357_cell_plate_1_labeled_merged_csvs/LABELLED_by_cell_type_screen_357_cell_plate_1_labeled_merged_screen_357_cell_plate_1_labeled_merged_well_n09.csv
SUBSAMPLING THIS MANY POINTS OF THE DATA: 12000
Starting t-SNE calculation
Done with t-SNE calculation
Starting DBSCAN
Plotting DBSCAN
Plotting grouped barplots
Done with /Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/screen_357_cell_plate_1_labeled_merged_csvs/LABELLED_by_cell_type_screen_357_cell_plate_1_labeled_merged_screen_357_cell_plate_1_labeled_merged_well_n09.csv__12000_SUBSAMPLE_iteration3____n09_cat_l07__


# Now merge after robust scaling and do DBSCAN

In [None]:
directory = '/Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/merged_csvs/'
df = []
do_boxplots = 0; # binary, 1 indicates that boxplots should be made
#for filename in glob.glob(directory + "LABELLED*.csv"): # reads through all files in this directory looking for *.csv and ignores sub-directories
for filename in glob.glob(directory + "LABELLED*e21*.csv"): # reads through all files in this directory looking for *.csv and ignores sub-directories
    print("Read in" + filename)
    # Set some variables equal to "None" so that I don't have surprises later
    my_data=None; my_data_headers=None; meta_headers=None; my_data_data_headers=None
    my_scaled_data = None; word_as_num = None; tsne_out_mink = None; labels = None
    # Read in two wells
    my_other_data=pd.read_csv(glob.glob(directory + "LABELLED*c03.csv")[0], index_col=0)
    my_data=pd.read_csv(filename, index_col=0)
    my_data_headers = list(my_data)
    meta_headers = ["Width","cell_label","cell_plate","lineage","screen","well","Time"]
    my_data_data_headers = [x for x in my_data_headers if not x in meta_headers]
    # Scale the data columns
    my_scaled_data = robust_scale(my_data[my_data_data_headers])
    my_scaled_other_data = robust_scale(my_other_data[my_data_data_headers])
    # Concatenate the two np_arrays/wells together
    my_scaled_data = np.concatenate([my_scaled_data, my_scaled_other_data])
    filename = filename + "__robust-scaled-before-merging_e21_cat_c03__"
    # Concatenate the two dataframes together for proper cell labelling
    my_data = pd.concat([my_data, my_other_data], ignore_index = True)
    # Make a list where cell_label is converted to numbers for plotting
    word_as_num=[]
    for word in my_data['cell_label']:
        if word == "unlabelled":
            word_as_num.append("0")
        if word == "blast":
            word_as_num.append("0.5")
        if word == "healthy":
            word_as_num.append("1")
    lr = 3000 # Set the learning rate for t-SNE
    # Perform t-SNE
    print("Starting t-SNE calculation")
    tsne_out_mink = TSNE(metric='minkowski', learning_rate=lr, n_iter=lr, random_state=11).fit_transform(my_scaled_data)
    print("Done with t-SNE calculation")
    # Plot and save the output
    plt.figure(figsize=(20, 10))
    plt.gcf().clear()
    plt.subplot(121)
    plt.scatter(tsne_out_mink[:, 0], tsne_out_mink[:, 1], c= word_as_num, cmap=plt.cm.viridis)
    plt.savefig(filename + "__tsne.png")
    plt.close()
    # Do DBSCAN on the tSNEd data
    print("Starting DBSCAN")
    dbsc = DBSCAN(eps = .4,min_samples=10).fit(tsne_out_mink)
    labels = dbsc.labels_
    core_samples = np.zeros_like(labels, dtype = bool)
    core_samples[dbsc.core_sample_indices_] = True
    unique_labels = np.unique(labels)
    print("Plotting DBSCAN")
    # Plot the DBSCAN results
    plt.figure(figsize=(20, 10))
    plt.gcf().clear()
    plt.subplot(121)
    col = np.linspace(0,1,len(unique_labels))
    for i in xrange(len(unique_labels)):
        plt.plot(tsne_out_mink[np.where(labels==unique_labels[i])[0], 0], 
             tsne_out_mink[np.where(labels==unique_labels[i])[0], 1],
                #'o', mfc=plt.cm.viridis(col[i]), label=unique_labels[i])
                'o', mfc=plt.cm.nipy_spectral(col[i]), label=unique_labels[i])
                #'o', mfc=plt.cm.prism(col[i]), label=unique_labels[i])
                #'o', mfc=plt.cm.flag(col[i]), label=unique_labels[i])
    plt.legend(loc='best', frameon=False)
    plt.savefig(filename + "__DBSCAN_of_tsne.png")
    plt.close()
    # Summarize the combination of tSNE and DBSCAN results
    tsne_db = [] # This will be a list of lists where the 1st entry is the cluster ID, 2nd: is
    ## how many "unlabelled" are in that cluster, 3rd: number of blast, 4th: number of healthy.
    for i in xrange(len(unique_labels)):
        clst_index = my_data['cell_label'][np.where(labels==unique_labels[i])[0]]
        if clst_index.empty == True:
            print("Cluster index and original data don't line up right. STOP AND FIX")
            break
        tsne_db.append([unique_labels[i],sum(clst_index=='unlabelled'),
                         sum(clst_index=='blast'),sum(clst_index=='healthy')])
    # Prepare to plot the results as grouped barplots
    print("Plotting grouped barplots")
    tsne_db_df = pd.DataFrame(tsne_db,columns=["Cluster Label","Unlabelled","Blast","Healthy"])
    tsne_db_df_melted = pd.melt(tsne_db_df,value_vars=["Unlabelled","Blast","Healthy"],id_vars="Cluster Label")
    plt.gcf().clear()
    ax = sns.barplot(hue="variable",y="value",x="Cluster Label",data=tsne_db_df_melted,log='y')
    plt.savefig(filename + '__barplot.png')
    plt.close()
    if do_boxplots == 1:
        # For each cluster plot boxplots of each feature
        print("Starting boxplots plotting")
        for i in xrange(len(unique_labels)):
            clst_index = my_data.ix[np.where(labels==unique_labels[i])[0],my_data_data_headers]
            clst_index_scaled = my_scaled_data[np.where(labels==unique_labels[i])[0],]
            # Plot the unscaled data
            locations = range(1,(len(my_data_data_headers)+1))
            plt.figure()
            plt.gcf().clear()
            plt.boxplot(clst_index.as_matrix(),positions=locations)
            plt.title("Cluster number " + str(unique_labels[i]))
            plt.ylabel('A.U.')
            plt.xticks(locations, my_data_data_headers,rotation='vertical')
            #plt.yscale('log')
            percs = (clst_index.describe(percentiles=[0.1,0.5,0.9]))
            plt.ylim(percs.iloc[4].min(),percs.iloc[6].max())
            #plt.show()
            plt.savefig(filename + "__boxplots_of_cluster" +str(unique_labels[i])+".png")
            plt.close()
            locations = range(1,(len(my_data_data_headers)+1))
            # Plot the scaled data
            plt.figure()
            plt.gcf().clear()
            plt.boxplot(clst_index_scaled,positions=locations)
            plt.title("Cluster number " + str(unique_labels[i]))
            plt.ylabel('A.U.')
            plt.xticks(locations, my_data_data_headers,rotation='vertical')
            #plt.yscale('log')
            percs_scaled = pd.DataFrame(clst_index_scaled).describe(percentiles=[0.1,0.5,0.9])
            plt.ylim(percs_scaled.iloc[4].min(),percs_scaled.iloc[6].max())
            plt.savefig(filename + "__scaled_boxplots_of_cluster" +str(unique_labels[i])+".png")
            plt.close()
    print("Done with " + filename)

Read in/Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/merged_csvs/LABELLED_by_cell_type_screen_525_cell_plate_1_well_e21.csv
Starting t-SNE calculation
Done with t-SNE calculation
Starting DBSCAN
Plotting DBSCAN
Plotting grouped barplots
Starting boxplots plotting
Done with /Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/merged_csvs/LABELLED_by_cell_type_screen_525_cell_plate_1_well_e21.csv__robust-scaled-before-merging_e21_cat_c03__


# Now merge after scaling and do DBSCAN

In [3]:
directory = '/Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/merged_csvs/'
df = []
do_boxplots = 0; # binary, 1 indicates that boxplots should be made
#for filename in glob.glob(directory + "LABELLED*.csv"): # reads through all files in this directory looking for *.csv and ignores sub-directories
for filename in glob.glob(directory + "LABELLED*e21*.csv"): # reads through all files in this directory looking for *.csv and ignores sub-directories
    print("Read in" + filename)
    # Set some variables equal to "None" so that I don't have surprises later
    my_data=None; my_data_headers=None; meta_headers=None; my_data_data_headers=None
    my_scaled_data = None; word_as_num = None; tsne_out_mink = None; labels = None
    # Read in two wells
    my_other_data=pd.read_csv(glob.glob(directory + "LABELLED*c03.csv")[0], index_col=0)
    my_data=pd.read_csv(filename, index_col=0)
    my_data_headers = list(my_data)
    meta_headers = ["Width","cell_label","cell_plate","lineage","screen","well","Time"]
    my_data_data_headers = [x for x in my_data_headers if not x in meta_headers]
    # Scale the data columns
    my_scaled_data = scale(my_data[my_data_data_headers])
    my_scaled_other_data = scale(my_other_data[my_data_data_headers])
    # Concatenate the two np_arrays/wells together
    my_scaled_data = np.concatenate([my_scaled_data, my_scaled_other_data])
    filename = filename + "__scaled-before-merging_e21_cat_c03__"
    # Concatenate the two dataframes together for proper cell labelling
    my_data = pd.concat([my_data, my_other_data], ignore_index = True)
    # Make a list where cell_label is converted to numbers for plotting
    word_as_num=[]
    for word in my_data['cell_label']:
        if word == "unlabelled":
            word_as_num.append("0")
        if word == "blast":
            word_as_num.append("0.5")
        if word == "healthy":
            word_as_num.append("1")
    lr = 3000 # Set the learning rate for t-SNE
    # Perform t-SNE
    print("Starting t-SNE calculation")
    tsne_out_mink = TSNE(metric='minkowski', learning_rate=lr, n_iter=lr, random_state=11).fit_transform(my_scaled_data)
    print("Done with t-SNE calculation")
    # Plot and save the output
    plt.figure(figsize=(20, 10))
    plt.gcf().clear()
    plt.subplot(121)
    plt.scatter(tsne_out_mink[:, 0], tsne_out_mink[:, 1], c= word_as_num, cmap=plt.cm.viridis)
    plt.savefig(filename + "__tsne.png")
    plt.close()
    # Do DBSCAN on the tSNEd data
    print("Starting DBSCAN")
    dbsc = DBSCAN(eps = .4,min_samples=10).fit(tsne_out_mink)
    labels = dbsc.labels_
    core_samples = np.zeros_like(labels, dtype = bool)
    core_samples[dbsc.core_sample_indices_] = True
    unique_labels = np.unique(labels)
    print("Plotting DBSCAN")
    # Plot the DBSCAN results
    plt.figure(figsize=(20, 10))
    plt.gcf().clear()
    plt.subplot(121)
    col = np.linspace(0,1,len(unique_labels))
    for i in xrange(len(unique_labels)):
        plt.plot(tsne_out_mink[np.where(labels==unique_labels[i])[0], 0], 
             tsne_out_mink[np.where(labels==unique_labels[i])[0], 1],
                #'o', mfc=plt.cm.viridis(col[i]), label=unique_labels[i])
                'o', mfc=plt.cm.nipy_spectral(col[i]), label=unique_labels[i])
                #'o', mfc=plt.cm.prism(col[i]), label=unique_labels[i])
                #'o', mfc=plt.cm.flag(col[i]), label=unique_labels[i])
    plt.legend(loc='best', frameon=False)
    plt.savefig(filename + "__DBSCAN_of_tsne.png")
    plt.close()
    # Summarize the combination of tSNE and DBSCAN results
    tsne_db = [] # This will be a list of lists where the 1st entry is the cluster ID, 2nd: is
    ## how many "unlabelled" are in that cluster, 3rd: number of blast, 4th: number of healthy.
    for i in xrange(len(unique_labels)):
        clst_index = my_data['cell_label'][np.where(labels==unique_labels[i])[0]]
        if clst_index.empty == True:
            print("Cluster index and original data don't line up right. STOP AND FIX")
            break
        tsne_db.append([unique_labels[i],sum(clst_index=='unlabelled'),
                         sum(clst_index=='blast'),sum(clst_index=='healthy')])
    # Prepare to plot the results as grouped barplots
    print("Plotting grouped barplots")
    tsne_db_df = pd.DataFrame(tsne_db,columns=["Cluster Label","Unlabelled","Blast","Healthy"])
    tsne_db_df_melted = pd.melt(tsne_db_df,value_vars=["Unlabelled","Blast","Healthy"],id_vars="Cluster Label")
    plt.gcf().clear()
    ax = sns.barplot(hue="variable",y="value",x="Cluster Label",data=tsne_db_df_melted,log='y')
    plt.savefig(filename + '__barplot.png')
    plt.close()
    if do_boxplots == 1:
        # For each cluster plot boxplots of each feature
        print("Starting boxplots plotting")
        for i in xrange(len(unique_labels)):
            clst_index = my_data.ix[np.where(labels==unique_labels[i])[0],my_data_data_headers]
            clst_index_scaled = my_scaled_data[np.where(labels==unique_labels[i])[0],]
            # Plot the unscaled data
            locations = range(1,(len(my_data_data_headers)+1))
            plt.figure()
            plt.gcf().clear()
            plt.boxplot(clst_index.as_matrix(),positions=locations)
            plt.title("Cluster number " + str(unique_labels[i]))
            plt.ylabel('A.U.')
            plt.xticks(locations, my_data_data_headers,rotation='vertical')
            #plt.yscale('log')
            percs = (clst_index.describe(percentiles=[0.1,0.5,0.9]))
            plt.ylim(percs.iloc[4].min(),percs.iloc[6].max())
            #plt.show()
            plt.savefig(filename + "__boxplots_of_cluster" +str(unique_labels[i])+".png")
            plt.close()
            locations = range(1,(len(my_data_data_headers)+1))
            # Plot the scaled data
            plt.figure()
            plt.gcf().clear()
            plt.boxplot(clst_index_scaled,positions=locations)
            plt.title("Cluster number " + str(unique_labels[i]))
            plt.ylabel('A.U.')
            plt.xticks(locations, my_data_data_headers,rotation='vertical')
            #plt.yscale('log')
            percs_scaled = pd.DataFrame(clst_index_scaled).describe(percentiles=[0.1,0.5,0.9])
            plt.ylim(percs_scaled.iloc[4].min(),percs_scaled.iloc[6].max())
            plt.savefig(filename + "__scaled_boxplots_of_cluster" +str(unique_labels[i])+".png")
            plt.close()
    print("Done with " + filename)

Read in/Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/merged_csvs/LABELLED_by_cell_type_screen_525_cell_plate_1_well_e21.csv




Starting t-SNE calculation
Done with t-SNE calculation
Starting DBSCAN
Plotting DBSCAN
Plotting grouped barplots
Done with /Users/tswenson/Documents/Joels/Health_Data_Science/COMPANY_consulting_project/datasets/merged_csvs/LABELLED_by_cell_type_screen_525_cell_plate_1_well_e21.csv__scaled-before-merging_e21_cat_c03__
