Analysis on segmented smFISH dataset May 2017

TODO  

* Look into cutoffs. Do they make sense, do they need to be different? then run everything again
* Repeat all the plots with the clusters (cell size, count etc)
* For each cell find a way to see how good it fits to its cluster. (similarity to the mean cell?)
* For each cell find the similarity with the other clusters (find doublets)
* Is there an relationship between doublets and cell size?
* Size correction of cells
* If clusterd on the fold change data, the clusters are very clear, but a third of the cells cluster together because they have low gene expression
    - If we want to say something about gene expression per cluster, maybe we should do that on this clustering and discard the 'bad cells'
    - Look at how uniform expression is, are there expression level differences. etc.
* Look into functions Cell Profiler. that is in python 

In [1]:
import loompy
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
plt.rcParams.update({'figure.max_open_warning': 150})
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import pickle
import scipy.stats as stats
from mpl_toolkits.axes_grid1 import make_axes_locatable

from fastcluster import linkage
import polo # 
from polo import optimal_leaf_ordering #from polo import polo
#Paper: http://bioinformatics.oxfordjournals.org/content/17/suppl_1/S22.long

from scipy.spatial.distance import pdist

from scipy.cluster import hierarchy
from collections import Counter

#import cytograph
%matplotlib notebook

# Open, clean and transform file

In [2]:
file_name = '/home/lars/storage/Documents/Cortex_FISH/CountsV1-1.loom'
ds = loompy.connect(file_name)
ds

0,1,2,3,4,5,6,7,8,9,10,11,12
,CellID,1124,2325,2400,241,6248,5992,275,2573,330,1149,...
,CellArea,23939.0,13532.0,28819.0,40777.0,16303.0,9065.0,23155.0,44564.0,18963.0,19373.0,...
genes,,,,,,,,,,,,...
Hybridization1_Tbr1,,13.0,11.0,28.0,12.0,7.0,6.0,14.0,24.0,5.0,3.0,...
Hybridization1_Aldoc,,38.0,0.0,9.0,5.0,38.0,2.0,4.0,3.0,7.0,10.0,...
Hybridization1_Foxj1,,0.0,0.0,0.0,1.0,5.0,0.0,3.0,1.0,1.0,1.0,...
Hybridization6_Bmp4,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...
Hybridization6_Itpr2,,4.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,...
Hybridization6_Vip,,13.0,1.0,2.0,4.0,30.0,1.0,3.0,2.0,1.0,4.0,...
Hybridization4_Cnr1,,0.0,0.0,0.0,0.0,65.0,5.0,0.0,0.0,0.0,0.0,...


In [3]:
cell_size = dict(zip(ds.CellID, ds.CellArea))

In [4]:
#Cell area to square um
(np.sqrt(ds.CellArea)*0.065)**2
print('Low cutoff: ',(np.sqrt(2000)*0.065)**2, ' square um')
print('High cutoff: ',(np.sqrt(60000)*0.065)**2, ' square um')
print('A 10um diameter is an area of {}um2'.format(round(np.pi*5**2,1)))

Low cutoff:  8.45  square um
High cutoff:  253.5  square um
A 10um diameter is an area of 78.5um2


In [5]:
#Plot cell size
plt.figure()
    #In number  of pixels
#plt.hist(ds.CellArea, bins=100, range=[2000,80000])
    #In Square um
plt.hist((np.sqrt(ds.CellArea)*0.065)**2, bins=100, range=[8,254])

plt.xlabel('square um')
plt.title('Cell size histogram');

<IPython.core.display.Javascript object>

In [6]:
#Plot molecule count
plt.figure()
df_mol_count=pd.DataFrame(data=ds[:,:], columns=ds.col_attrs['CellID'], index=ds.row_attrs['genes']).astype(int)
plt.hist(df_mol_count.sum(axis=0), bins=100, range=[0,2000])
plt.xlabel('molecules')
plt.title('Molecule count')

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7f3076db5e10>

In [7]:
plt.figure()
mol_count = df_mol_count.sum(axis=0)
area = (np.sqrt(ds.CellArea)*0.065)**2

count_area = list(zip(mol_count, area))
#Select only cells with count<600 and area<400
count_area = [x for x in count_area if x[0]<600 and x[1]<400]

mol_count = [item[0] for item in count_area]
area = [item[1] for item in count_area]

fit = np.polyfit(mol_count, area, 1)
fit_fn = np.poly1d(fit)

plt.plot(mol_count, area, 'k+', mol_count, fit_fn(mol_count), '--k')

plt.xlabel('Molecule count')
plt.xlim(xmin=0, xmax=600)
plt.ylabel('Area um2')
plt.ylim(ymin=0, ymax=400)
plt.title('Count Area')

print('Formula: {}'.format(fit_fn))

<IPython.core.display.Javascript object>

Formula:  
0.4395 x + 20.98


In [8]:
plt.figure(figsize=(20,12))
gs = plt.GridSpec(5,8)

area = (np.sqrt(ds.CellArea)*0.065)**2

for i, gene in enumerate(gene_sort_fish):
    plt.subplot(gs[i])
    count_area = list(zip(df_fish.loc[gene], area))
    #Select only cells with count<600 and area<400
    count_area = [x for x in count_area if 10<x[0]<600 and x[1]<400]

    mol_count = [item[0] for item in count_area]
    area_select = [item[1] for item in count_area]

    fit = np.polyfit(mol_count, area_select, 1)
    fit_fn = np.poly1d(fit)

    plt.plot(mol_count, area_select, 'k+', mol_count, fit_fn(mol_count), '--k')
    
    if i in range(len(gene_sort_fish)-8,len(gene_sort_fish)):
        plt.xlabel('Molecule count')
    #plt.xlim(xmin=0, xmax=600)
    if i%8 ==0:
        plt.ylabel('Area um2')
    #if i in range(8):
    plt.title(gene.split('_')[1])
    #plt.ylim(ymin=0, ymax=400)
    
plt.tight_layout()

<IPython.core.display.Javascript object>

NameError: name 'gene_sort_fish' is not defined

In [None]:
def loompy2data(filename, cutoff_low, cutoff_high, molecules_low):
    """
    Loom file to Panda's dataframe. Perform size and count selection on cells
    Input:
    `filename`(str): Name and path tot the Loom file
    `cutoff_low`(flt): Minimal cell size (in number of pixels, square um = (np.sqrt(pixels)*0.065)**2
    `cutoff_high`(flt): Max cell size (in number of pixels, square um = (np.sqrt(pixels)*0.065)**2
    `molecules_low`(int): Minimal number of molecules
    Returns:
    Panda's dataframe with cells that passed the criteria. df has cell label as column name. 
    Some gene names are corrected
    Metadata is discarded. 
    
    """
    #Read loom file and import to Pandas df
    ds = loompy.connect(filename)
    df=pd.DataFrame(data=ds[:,:], columns=ds.col_attrs['CellID'], index=ds.row_attrs['genes']).astype(int)
    
    #Filter cell size
    for i, j in enumerate(ds.col_attrs['CellID']):
        if ds.col_attrs['CellArea'][i] < cutoff_low or ds.col_attrs['CellArea'][i] > cutoff_high:
            del df[j]
    
    #Filter total molecule count
    for i, j in enumerate(ds.col_attrs['CellID']):
        try: #not all 'CellID' are still present
            if df[j].sum() < molecules_low:
                del df[j]
        except Exception:
            pass
        
    #Filter bad hybridiztion4 (out of focus imaging) and repeat Tbr1
    bad_data = ['Hybridization4_Cnr1', 'Hybridization4_Plp1', 'Hybridization4_Vtn', 'Hybridization11_Tbr1']
    for i in bad_data:
        df.drop(i, inplace=True)
     
    #Change incorrect names Tmem6-->Tmem2 and Kcnip-->Kcnip2
    df= df.rename({'Hybridization11_Tmem6': 'Hybridization11_Tmem2', 'Hybridization12_Kcnip': 'Hybridization12_Kcnip2'})
    
    print('Selected cells with  {}>size<{} and a minimum molecule count of {} molecules'.format(cutoff_low, cutoff_high, molecules_low))
    print('Change incorrect names Tmem6-->Tmem2 and Kcnip-->2')
    return df

    
df_fish = loompy2data(file_name,2000, 60000, 20)
df_fish.shape

In [None]:
excluded_cells = []
for i in ds.CellID:
    if i not in df_fish.columns:
        excluded_cells.append(i)
pickle.dump(excluded_cells, open( "excluded_cells.p", "wb" ))

In [None]:
df_fish.head(5)

In [None]:
gene_sort = ['Gad2', 'Slc32a1', 'Crhbp', 'Kcnip2', 'Cnr1', 'Vip', 'Cpne5', 'Pthlh', 'Crh',             
             'Tbr1', 'Lamp5', 'Rorb', 'Syt6', 
             'Aldoc', 'Gfap', 'Serpinf1', 'Mfge8', 
             'Sox10', 'Plp1', 'Pdgfra', 'Bmp4', 'Itpr2', 'Tmem2',  'Ctps',  'Klk6', 'Anln',   
             'Mrc1', 'Hexb', 
             'Ttr', 
             'Foxj1', 
             'Vtn', 'Flt1', 'Apln', 'Acta2',  'Lum'] 

gene_sort_wrong = ['Gad2', 'Slc32a1', 'Crhbp', 'Kcnip', 'Cnr1', 'Vip', 'Cpne5', 'Pthlh', 'Crh', 
             'Tbr1', 'Lamp5', 'Rorb', 'Syt6', 
             'Aldoc', 'Gfap', 'Serpinf1', 'Mfge8', 
             'Sox10', 'Plp1', 'Pdgfra', 'Bmp4', 'Itpr2', 'Tmem6',  'Ctps',  'Klk6', 'Anln',   
             'Mrc1', 'Hexb', 
             'Ttr', 
             'Foxj1', 
             'Vtn', 'Flt1', 'Apln', 'Acta2',  'Lum'] 


gene_sort_fish = ['Hybridization2_Gad2', 'Hybridization12_Slc32a1', 'Hybridization10_Crhbp', 'Hybridization12_Kcnip2', 'Hybridization13_Cnr1', 'Hybridization6_Vip', 'Hybridization5_Cpne5', 'Hybridization8_Pthlh',  'Hybridization10_Crh', 
'Hybridization1_Tbr1', 'Hybridization9_Lamp5', 'Hybridization7_Rorb', 'Hybridization11_Syt6',
'Hybridization1_Aldoc', 'Hybridization2_Gfap', 'Hybridization8_Serpinf1', 'Hybridization3_Mfge8',
 'Hybridization7_Sox10', 'Hybridization13_Plp1', 'Hybridization8_Pdgfra', 'Hybridization6_Bmp4','Hybridization6_Itpr2','Hybridization11_Tmem2', 'Hybridization7_Ctps','Hybridization5_Klk6','Hybridization9_Anln',
 'Hybridization3_Mrc1','Hybridization3_Hexb', 
 'Hybridization13_Ttr',
 'Hybridization1_Foxj1',
 'Hybridization12_Vtn',
 'Hybridization2_Flt1',
 'Hybridization10_Apln',
 'Hybridization5_Acta2',
 'Hybridization9_Lum']


#Make conversion dictionary with gene names and corresponding HybridizationX_geneY names

gene_name_conversion = {}

for i in gene_sort:
    for j in df_fish.index:
        if j.endswith(i):
            gene_name_conversion[j] =i
            
gene_name_conversion_reverse = dict((v,k) for k,v in gene_name_conversion.items())
            
#There are two mistakes in the gene names, correct for them in the dict
#gene_name_conversion['Hybridization11_Tmem6'] = 'Tmem2'
#gene_name_conversion['Hybridization12_Kcnip'] = 'Kcnip2'

# Cell properties

In [None]:
#Open object properties
obj_prop = pickle.load(open("ObjProp_dict.pkl", 'rb'))

#Load cell ID and XY coordinates into a list
coord_list = []
for i in obj_prop:
    coord_list.append([i, obj_prop[i]['obj_centroid'][0], obj_prop[i]['obj_centroid'][1]])
    
#Place cell ID and X and Y coordinates into dataframe
coord_df = pd.DataFrame(coord_list, columns=['Cell_ID', 'X', 'Y']).set_index('Cell_ID').T
coord_df = coord_df.loc[:,df_fish.columns]

In [None]:
def plot_cell_pos(coordinate_df, cell_ids=None, cell_of_interest=None, color='gray', color_highlight1='r', color_highlight2='r', s=5, standalone = True, mode='Highlight'):
    """
    Plot the centroids of all cells in grey. A selection of cells can be made red.
    And one cell of interest can be shown larger with a white border around.
    Input:
    `coordinate_df`(pd df): Pandas df with cells in columns, and 'X' & 'Y' as rows.
    `cell_ids`(list): If selection of cells to plot, enter a list of cell ids. Default = None
    `color`(str): color of cells, default gray
    `s`(float): Size of dots. Default = 5
    `stanalone`(bool): If true it creates a figure. If false it can be used as subplot
    `mode`(str): if 'highlight' it is possible to highlight a group of cells and/or one cell
    `color_highlight1`: color of group of cells. default='r' 
    `color_highlight2`: color of single cells. default='r'
    
    """
    if standalone == True:
        plt.figure(figsize=(7,7))
        
    if mode.lower() == 'tsne':
        plt.scatter(coordinate_df.loc['X',:], coordinate_df.loc['Y',:], linewidths=0, c=color, s=s)
    
    if mode.lower() == 'highlight':
        plt.scatter(coordinate_df.loc['X',:], coordinate_df.loc['Y',:], linewidths=0, c=color, s=s)
        if cell_ids != None:
            plt.scatter(coordinate_df.loc[:,cell_ids].loc['X'], coordinate_df.loc[:,cell_ids].loc['Y'], color=color_highlight1, s=s*2)
        if cell_of_interest != None:
            plt.scatter(coordinate_df.loc[:,cell_of_interest][0], coordinate_df.loc[:,cell_of_interest][1], color=color_highlight2, s=s*8, lw=2, edgecolor='w')
    plt.gca().invert_xaxis()
    plt.gca().invert_yaxis()
    plt.gca().axis('equal')
    plt.xlim([coordinate_df.loc['X'].max(), coordinate_df.loc['X'].min()])
    plt.ylim([coordinate_df.loc['Y'].max(), coordinate_df.loc['Y'].min()])
    plt.gca().patch.set_facecolor((.9,.9,.9))
    
plot_cell_pos(coord_df, cell_ids=None, cell_of_interest='7442', s=10, standalone=True, mode='Highlight')

# Gene expression

## Normalization of data

In [None]:
#Normalization on unsorted dataset
#Transform data
X = df_fish.values.astype(np.float64)
    #Sqrt normalize
X_sqrt = np.sqrt(X)
    #Mean normalize
X_norm = X.sum(0).mean() * (X/X.sum(0))
    #Log transform
X_log = np.log2(X_norm+1)
    #Build dataframes of normalized and log transformed data
df_fish_sqrt = pd.DataFrame(data=X_sqrt, columns=df_fish.columns, index=df_fish.index)
df_fish_norm=pd.DataFrame(data=X_norm, columns=df_fish.columns, index=df_fish.index)
df_fish_log=pd.DataFrame(data=X_log, columns=df_fish.columns, index=df_fish.index)

#Calculate the fold change dataframe
df_fish_fc = df_fish.loc[gene_sort_fish,:]
mean_expression =  df_fish.loc[gene_sort_fish,:].T.mean()
df_fish_fc = df_fish_fc.divide(mean_expression, axis='rows')

X_fc = df_fish_fc.values

In [None]:
#Same but with sorted gene list df_fish

df_fish = df_fish.loc[gene_sort_fish,:]

#Transform data
X = df_fish.values.astype(np.float64)
    #Sqrt normalize
X_sqrt = np.sqrt(X)
    #Mean normalize
X_norm = X.sum(0).mean() * (X/X.sum(0))
    #Log transform
X_log = np.log2(X_norm+1)

    #Build dataframes of normalized and log transformed data
df_fish_sqrt = pd.DataFrame(data=X_sqrt, columns=df_fish.columns, index=df_fish.index)
df_fish_norm = pd.DataFrame(data=X_norm, columns=df_fish.columns, index=df_fish.index)
df_fish_log = pd.DataFrame(data=X_log, columns=df_fish.columns, index=df_fish.index)

#Devide by size
area_df_fish = np.array([(np.sqrt(obj_prop[c]['obj_area'])*0.065)**2 for c in df_fish.columns])
X_size = X/area_df_fish
    #Sqrt normalize
X_size_sqrt = np.sqrt(X_size)
    #Mean normalize
X_size_norm = X_size.sum(0).mean() * (X_size/X_size.sum(0))
    #Log transform
X_size_log = np.log2(X_size_norm+1)

    #Build dataframes of normalized and log transformed data
df_fish_size_sqrt = pd.DataFrame(data=X_size_sqrt, columns=df_fish.columns, index=df_fish.index)
df_fish_size_norm = pd.DataFrame(data=X_size_norm, columns=df_fish.columns, index=df_fish.index)
df_fish_size_log = pd.DataFrame(data=X_size_log, columns=df_fish.columns, index=df_fish.index)


#Calculate the fold change dataframe
df_fish_fc = df_fish.loc[gene_sort_fish,:]
mean_expression_for_fc =  df_fish.loc[gene_sort_fish,:].T.mean()
df_fish_fc = df_fish_fc.divide(mean_expression_for_fc, axis='rows')

X_fc = df_fish_fc.values

In [None]:
area_df_fish = np.array([(np.sqrt(obj_prop[c]['obj_area'])*0.065)**2 for c in df_fish.columns])
area_df_fish.shape

## Add featrues

Does not enhance the data so much

In [None]:
major_type_markers = ['Hybridization12_Slc32a1',
                     'Hybridization1_Tbr1',
                      'Hybridization7_Sox10',
                      'Hybridization1_Aldoc',
                      'Hybridization3_Hexb',
                      'Hybridization3_Mrc1',
                      'Hybridization1_Foxj1',
                      'Hybridization12_Vtn',
                      'Hybridization2_Flt1',
                      'Hybridization5_Acta2',
                      'Hybridization9_Lum']

In [None]:
#Using the mean corrected and Log normalized data
#Multiply the major markers with all the other genes to enhance the profile.

df_fish_addfeature = df_fish_log

for i in major_type_markers:
    for j in gene_sort_fish:
        if i != j: 
            name_new = i.split('_')[1] + '_' + j.split('_')[1]
            data = pd.Series(df_fish_log.loc[i] * df_fish_log.loc[j], name=name_new)
            df_fish_addfeature = df_fish_addfeature.append(data)


## Add location

In [None]:
df_fish_location = df_fish_log

df_fish_location = df_fish_location.append(coord_df.loc['X']/10000)
df_fish_location = df_fish_location.append(coord_df.loc['Y']/10000)

## Visualizing gene expression

In [None]:
def plot_cell_expression(cell, dataset='Count', save=False):
    """
    Plot the expression levels of all genes (except round 4 genes).
    Possible to select counts, normalized or log-normalized data
    Input:
    `cell`(str): Cell name, like: '100'
    `data_format`(str): Choose the dataset.
        'None' will give the count data
        'Norm' will give normalized data
        'Log' will give log-normalized data
    
    """
    try:
        df_fish[cell]
        
        fig, ax = plt.subplots(figsize=[5,2])
        if dataset == None or dataset.lower() == 'count':
            expression_data = df_fish.loc[gene_sort_fish,:][cell]
            mean = True, df_fish.loc[gene_sort_fish,:].T.mean()
            y_label = ''
        elif dataset.lower() == 'norm':
            expression_data = df_fish_norm.loc[gene_sort_fish,:][cell]
            mean = True, df_fish_norm.loc[gene_sort_fish,:].T.mean()
            y_label = 'Normalized'
        elif dataset.lower() == 'log':
            expression_data = df_fish_log.loc[gene_sort_fish,:][cell]
            mean = True, df_fish_log.loc[gene_sort_fish,:].T.mean()
            y_label = 'Log-Normalized'
        elif dataset.lower() == 'fc': 
            expression_data = df_fish.loc[gene_sort_fish,:][cell] / df_fish.loc[gene_sort_fish,:].T.mean()
            mean = False, 0.5
            y_label = 'Fold Change'
        else:
            return ('Error incorrect dataset: {}, Choose "Count", "log" or "fc"'.format(dataset))
        
        #Plot expression data
        left_pos = np.arange(len(expression_data))
        gene_colors = ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red','blue', 'blue', 'blue', 'blue','orange', 'orange', 'orange', 'orange',
        'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green',
        'lightblue', 'lightblue', 'darkblue', 'darkred','darkred','red','lightgreen','red','lightblue']
        ax.bar(left_pos, list(expression_data), color=gene_colors, alpha=0.5)
        
        '''
        #Plot mean expression level (count/norm/log, not FoldChange)
        if mean[0] == True:
            for i, m in enumerate(mean[1]):
                ax.axhline(m, xmin=i/len(mean[1]), xmax=(i+1)/len(mean[1]), color='grey')
        else:
            ax.axhline(1, color='grey')
'''

        labels = [i.split('_')[1] for i in list(expression_data.index)]
        ax.set_xticks(left_pos)
        ax.set_xticklabels(labels, rotation='vertical', fontsize=8)
        ax.set_xlabel
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.set_ylabel('Expression level {}'.format(y_label))
        #ax.set_title('Data: {}'.format(dataset))
        
        plt.tight_layout()
        if save == True:
            plt.savefig('/home/lars/storage/Documents/Cortex_FISH/cell_expression/{}.png'.format(cell))
    
        
    except KeyError as e:
        print('KeyError: Cell {} not in df, maybe cell is removed in quality clean up'.format(e))

    


plot_cell_expression(df_fish.columns[6], 'count', save=Fasle)

In [None]:
def cell_info(cell, dataset='Count', save=False, post_clustering=False):
    """
    Plot expression level, location, size, 
    
    """
    try:
        df_fish[cell]
    except KeyError as e:
        print('KeyError: Cell {} not in df_fish, maybe cell is removed in quality clean up'.format(e))
        return

    plt.figure(figsize=(14,5))
    ax1 = plt.subplot2grid((2,10), (0,0), rowspan=2, colspan=5)
    if dataset.lower() == 'count':
        expression_data = df_fish.loc[gene_sort_fish,:][cell]
        mean = True, df_fish.loc[gene_sort_fish,:].T.mean()
        dataset_label = ''
    elif dataset.lower() == 'norm':
        expression_data = df_fish_norm.loc[gene_sort_fish,:][cell]
        mean = True, df_fish_norm.loc[gene_sort_fish,:].T.mean()
        dataset_label = 'Normalized'
    elif dataset.lower() == 'log':
        expression_data = df_fish_log.loc[gene_sort_fish,:][cell]
        mean = True, df_fish_log.loc[gene_sort_fish,:].T.mean()
        dataset_label = '(Log Normalized)'
    elif dataset.lower() == 'fc': 
        expression_data = df_fish.loc[gene_sort_fish,:][cell] / df_fish.loc[gene_sort_fish,:].T.mean()
        mean = False, 0.5
        dataset_label = 'Fold Change'
    elif dataset.lower() == 'size':
        expression_data = df_fish_size_sqrt.loc[gene_sort_fish,:][cell]
        mean = True, df_fish_size_sqrt.loc[gene_sort_fish,:].T.mean()
        dataset_label = '(size normalized, sqrt)'
    else:
        return ('Error incorrect dataset: {}, Choose "Count", "Norm", "log" or "fc"'.format(dataset))
    
    #Plot expression data
    left_pos = np.arange(len(expression_data))/ len(expression_data)
    gene_colors = ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red','blue', 'blue', 'blue', 'blue','orange', 'orange', 'orange', 'orange',
    'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green',
    'lightblue', 'lightblue', 'darkblue', 'darkred','darkred','red','lightgreen','red','lightblue']
    ax1.bar(left_pos+0.01, list(expression_data), color=gene_colors, alpha=0.5, width=0.025)
    
    #Plot mean expression level (count/norm/log, not FoldChange)
    if mean[0] == True:
        for i, m in enumerate(mean[1]):
            ax1.axhline(m, xmin=i/len(mean[1])-0.005, xmax=(i+1)/len(mean[1])-0.005, color='grey')
    else:
        ax1.axhline(1, color='grey')
    
    ax1.set_xlim([0,1])
    
    labels = [i.split('_')[1] for i in list(expression_data.index)]
    ax1.set_xticks(left_pos+0.01)
    ax1.set_xticklabels(labels, rotation='vertical')
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.yaxis.set_ticks_position('left')
    ax1.xaxis.set_ticks_position('bottom')
    ax1.set_ylabel('Expression level ' + dataset_label)
    ax1.set_title('Expression Level, Cell: {}'.format(cell))

    ax2 = plt.subplot2grid((2,10), (0,5), rowspan=2, colspan=2)
    plt.sca(ax2)
    plot_cell_pos(coord_df, cell_ids=None, cell_of_interest=cell, s=5, standalone=False)
    for tl in ax2.get_xticklabels() + ax2.get_yticklabels():
                tl.set_visible(False)
    ax2.set_title('Cell Location')

    ax3 = plt.subplot2grid((2,10), (0,7), rowspan=1, colspan=2)
    ax3.hist((np.sqrt(ds.CellArea)*0.065)**2, bins=100, range=[8,254], color='grey')
    plt.xticks(rotation=-45)
    cell_size = float((np.sqrt(ds.CellArea[np.where(ds.CellID=='1124')])*0.065)**2)
    plt.axvline(cell_size, color='r', linewidth=3)
    ax3.set_title('Cell Size  = {} um2'.format(np.round(cell_size)))
    ax3.set_xlabel('Square um')

    ax4 = plt.subplot2grid((2,10), (1,7), rowspan=1, colspan=2)
    ax4.hist(df_fish.sum(axis=0), bins=100, range=[20,2000], color='grey');
    plt.axvline(df_fish.loc[:,cell].sum(), color='r', linewidth=3)
    ax4.set_title('Total molecule count = {}'.format(df_fish.loc[:,cell].sum()))
    ax4.set_xlabel('Molecule count')
   
    if post_clustering == True:
        ax5 = plt.subplot2grid((2,10), (0,9), rowspan=2, colspan=1)
        ax5.scatter(tSNE_points[:,0], tSNE_points[:,1], lw=0, s=1, alpha = 1, c ='grey')
        ax5.scatter(tSNE_points[df_fish.columns.get_loc(cell),0], tSNE_points[df_fish.columns.get_loc(cell),1], lw=0, s=4, alpha = 1, c ='red')
        ax5.set_aspect('equal')
        ax5.set_axis_off()
        ax5.set_title('Cluster: {}\n{}'.format(cell_labels[cell], cluster_labels[cell_labels[cell]]))
        
    plt.tight_layout()
    if save == True:
        plt.savefig('Cell_{}.png'.format(cell))
cell_n = 33
cell_info(df_fish.columns[cell_n], 'size', post_clustering=False)



In [None]:
def plot_top_expression(gene, top_n, df):
    gene = gene_name_conversion_reverse[gene]
    top_expression = df.iloc[:, np.argsort(df.loc[gene])[::-1]].iloc[:,:top_n]

    expression_mean = top_expression.mean(axis=1)
    fig, ax = plt.subplots()
    left_pos = np.arange(len(expression_mean))
    gene_colors = ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red','blue', 'blue', 'blue', 'blue','orange', 'orange', 'orange', 'orange',
    'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green',
    'lightblue', 'lightblue', 'darkblue', 'darkred','darkred','red','lightgreen','red','lightblue']
    labels = [i.split('_')[1] for i in list(expression_mean.index)]
    ax.bar(left_pos, list(expression_mean), color=gene_colors, alpha=0.5)
    ax.hlines(1, 0, len(expression_mean))
    ax.set_xticks(left_pos+0.4)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_title('Mean expression level of top {} {} cells'.format(top_n, gene.split('_')[1]))
    ax.set_ylabel('Count or FC')

import time
for i in gene_sort:
    plot_top_expression(i, 100, df_fish_fc)


### Spatial expression

In [None]:
plt.figure(figsize=(20,12))
gs = plt.GridSpec(5,8)

for i, gene in enumerate(gene_sort_fish):
    plt.subplot(gs[i])
    filt = df_fish.index == gene
    levels = X_log[filt,:].flat[:]
    #levels = X_fc[filt,:].flat[:]
    levels = np.clip(levels, np.percentile(levels, 10), np.percentile(levels,90))
    selection_df = coord_df.loc[:,df_fish.columns] #the coord_df is not filtered
    plt.scatter(selection_df.loc['X'], selection_df.loc['Y'],color=plt.cm.coolwarm(levels/max(levels)), lw=0, alpha=0.4,s=5)
    #for label in plt.gca().axes.get_xticklabels():
    #    label.set_visible(False)
    #for label in plt.gca().axes.get_yticklabels():
    #    label.set_visible(False)
    plt.gca().set_axis_off()
    
    plt.gca().invert_xaxis()
    plt.gca().invert_yaxis()
    plt.gca().axis('equal')
    plt.xlim([coord_df.loc['X'].max(), coord_df.loc['X'].min()])
    plt.ylim([coord_df.loc['Y'].max(), coord_df.loc['Y'].min()])
    plt.title(gene.split('_')[1], color='black', fontsize=16)
    #plt.tight_layout()

#plt.savefig('spatial_marker_expression.png', dpi=300)

In [None]:
channels_wplaceholder = {'cy3_genes': ['Hybridization1_Foxj1',
  'Hybridization2_Gfap',
  'Hybridization3_Mfge8',
  'Hybridization5_Cpne5',
  'Hybridization6_Itpr2',
  'Hybridization7_Ctps',
  'Hybridization8_Pdgfra',
  'Hybridization9_Lamp5',
  'Hybridization10_Crh',
  'Hybridization11_Syt6',
  'Hybridization12_Kcnip2',
  'Hybridization13_Cnr1'],
 'cy5_genes': ['Hybridization1_Tbr1',
  'Hybridization2_Gad2',
  'Hybridization3_Mrc1',
  'Hybridization5_Acta2',
  'Hybridization6_Vip',
  'Hybridization7_Sox10',
  'Hybridization8_Serpinf1',
  'Hybridization9_Anln',
  'Hybridization10_Apln',
    'Hybridization1_Tbr1',
  'Hybridization12_Vtn',
  'Hybridization13_Ttr'],
 'txred_genes': ['Hybridization1_Aldoc',
  'Hybridization2_Flt1',
  'Hybridization3_Hexb',
  'Hybridization5_Klk6',
  'Hybridization6_Bmp4',
  'Hybridization7_Rorb',
  'Hybridization8_Pthlh',
  'Hybridization9_Lum',
  'Hybridization10_Crhbp',
  'Hybridization11_Tmem2',
  'Hybridization12_Slc32a1',
  'Hybridization13_Plp1']}

plt.figure(figsize=(6,35))
gs = plt.GridSpec(12,3)

for i in range(len(channels_wplaceholder['cy3_genes'])):
    for j in range(3):
        pos = i*3 + j
        if j == 0:
            gene = channels_wplaceholder['cy3_genes'][i]
        if j == 1:
            gene = channels_wplaceholder['txred_genes'][i]
        if j == 2:
            gene = channels_wplaceholder['cy5_genes'][i]
    
        plt.subplot(gs[pos])
        filt = df_fish.index == gene
        levels = X_log[filt,:].flat[:]
        levels = np.clip(levels, np.percentile(levels, 10), np.percentile(levels,90))
        selection_df = coord_df.loc[:,df_fish.columns] #the coord_df is not filtered
        plt.scatter(selection_df.loc['X'], selection_df.loc['Y'],color=plt.cm.Reds(levels/max(levels)), lw=0, alpha=0.4,s=5)
        for label in plt.gca().axes.get_xticklabels():
            label.set_visible(False)
        for label in plt.gca().axes.get_yticklabels():
            label.set_visible(False)

        plt.gca().invert_xaxis()
        plt.gca().invert_yaxis()
        plt.gca().axis('equal')
        plt.xlim([coord_df.loc['X'].max(), coord_df.loc['X'].min()])
        plt.ylim([coord_df.loc['Y'].max(), coord_df.loc['Y'].min()])
        if j == 0:
            plt.ylabel('Round {}'.format(i+1))
        channel_names = ['Cy3', 'TxRed', 'Cy5']
        plt.title(gene.split('_')[1]+ ' {}'.format(channel_names[j]))
plt.tight_layout()
#plt.savefig('Gene_expression_map_channel_round.png', dpi=300)       


In [None]:
gene = gene_sort_fish[gene_sort.index('Plp1')]

plt.figure(figsize=(5,6))

filt = df_fish.index == gene
levels = X_fc[filt,:].flat[:]
levels = np.clip(levels, np.percentile(levels, 10), np.percentile(levels,90))
selection_df = coord_df.loc[:,df_fish.columns]
scat = plt.scatter(selection_df.loc['X'], selection_df.loc['Y'],color=plt.cm.coolwarm(levels/max(levels)), lw=0, alpha=1,s=10)
for label in plt.gca().axes.get_xticklabels():
    label.set_visible(False)
for label in plt.gca().axes.get_yticklabels():
    label.set_visible(False)

plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
plt.gca().axis('equal')
plt.xlim([coord_df.loc['X'].max(), coord_df.loc['X'].min()])
plt.ylim([coord_df.loc['Y'].max(), coord_df.loc['Y'].min()])
plt.title(gene.split('_')[1])
plt.axis('off')
plt.tight_layout()

#plt.savefig('Cortex_expression_{}.png'.format(gene), dpi=600)

### Gene correlation

In [None]:

def correlation_df(df):
    fig, ax = plt.subplots(figsize=(8,8))
    data = df.T.corr(method='pearson').values
    data = data/np.percentile(data, 98, 1)[:,None]
    #axHM.pcolor(z,cmap='viridis', vmax=1)
    
    ax.pcolor(data,cmap='viridis', vmax=1 )

    y_pos = np.arange(len(df))
    x_pos = np.arange(len(df))
    #ax1.set_xlim(0, len(df_count_average.columns))
    ax.xaxis.tick_top()
    ax.set_xticks(x_pos+0.4)
    ax.set_xticklabels(gene_sort, rotation=-90)
    ax.set_yticks(y_pos+0.4)
    ax.set_yticklabels(gene_sort)
    ax.invert_yaxis()
    ax.set_aspect('equal')
    ax.set_title('')

correlation_df(df_fish)

## Gene contamination

In [None]:
import itertools
import math

In [None]:
major_type_markers = ['Hybridization12_Slc32a1',
                     'Hybridization1_Tbr1',
                      'Hybridization7_Sox10',
                      'Hybridization1_Aldoc',
                      'Hybridization3_Hexb',
                      'Hybridization3_Mrc1',
                      'Hybridization1_Foxj1',
                      'Hybridization12_Vtn',
                      'Hybridization2_Flt1',
                      'Hybridization5_Acta2',
                      'Hybridization9_Lum']

combinations = (list(itertools.combinations(major_type_markers, 2)))
n_combinations = int(math.factorial(len(major_type_markers)) / (math.factorial(len(major_type_markers)-2)*2))

plt.figure(figsize=(20,16))
gs = plt.GridSpec(int(np.sqrt(n_combinations)), int(np.sqrt(n_combinations)+1))
#gs = plt.GridSpec(7, 8)

for i, c in enumerate(combinations):
    ax = plt.subplot(gs[i])
    ax.scatter(df_fish.loc[c[0]],  df_fish.loc[c[1]], s=0.5, alpha=0.2)
    ax.set_aspect('auto')
    #ax.set_yticklabels([])
    #ax.set_xticklabels([])
    ax.set_ylabel(c[0].split('_')[1])
    ax.set_xlabel(c[1].split('_')[1])
    
plt.tight_layout()
plt.savefig('Major_marker_comparison.png', dpi=600)

# Single cell data & Comparison

In [None]:
# Import the Single Cell Cortex data into a data frame (Zeisel 2015)
df_sc_cort = pd.read_csv('/home/lars/storage/Documents/Single_Cell/Single_Cell_Cortex_Data/expression_mRNA_17-Aug-2014.txt', sep="\t", index_col=0, low_memory=False)
#used Low_memory=False, because the df is made up of mixed types in the rows. (normaly pandas expects different parameters in the columns)

# Remove all the bad cells (Cells with the label '(none)'
    #This removes 189 bad cells
df_sc_cort = df_sc_cort.loc[:, [i for i in df_sc_cort.columns if df_sc_cort.iloc[8,df_sc_cort.columns.get_loc(i)] != '(none)']].copy()

# Put all cell labels in a dictionary
sc_cor_cell_label_1 = df_sc_cort.iloc[7,1:].to_dict()
sc_cor_cell_label_2 = df_sc_cort.iloc[8,1:].to_dict()

# Get rid of the meta data
df_sc_cort = df_sc_cort.iloc[10:,1:]

#Make a copy with all the genes
df_sc_cort_all_genes = df_sc_cort

# 
df_sc_cort = df_sc_cort.loc[gene_sort,:]
df_sc_cort = df_sc_cort.astype(np.int64)
df_sc_cort

### Cortex and Oligo dataset

In [None]:
#Load the data set with the cortex and oligo paper cells
df_sc_cort_olig = pd.read_pickle('/home/lars/storage/Documents/Single_Cell/Single_Cell_All_2016/df_cortex_olig.pkl')
df_sc_cort_olig = df_sc_cort_olig.loc[gene_sort,:]

#Load Cell label dictionary
sc_cort_olig_label = pickle.load(open('/home/lars/storage/Documents/Single_Cell/Single_Cell_All_2016/label_cortex_olig.pkl', "rb" ) )
df_sc_cort_olig.head(5)

In [None]:
sc_cort_olig_label

## average molecule compare

In [None]:
difference_sc_fish = np.zeros((len(gene_sort_fish),3))
for i, g in enumerate(gene_sort_fish):
    difference_sc_fish[i,0] = np.mean(df_fish.loc[g])
    difference_sc_fish[i,1] = np.mean(df_sc_cort.loc[g.split('_')[1]])
    difference_sc_fish[i,2] = (np.mean(df_fish.loc[g]) / np.mean(df_sc_cort.loc[g.split('_')[1]]))

In [None]:
plt.figure(figsize=(9, 5))
ax1 = plt.subplot2grid((2,3), (0,0), rowspan=1, colspan=2)
x_pos = list(range(difference_sc_fish.shape[0]))
ax1.bar(x_pos, difference_sc_fish[:,0], color='b', alpha=0.5, linewidth=0, width=0.7)
ax1.bar([x+0.15 for x in x_pos], difference_sc_fish[:,1],  alpha=1, linewidth=0, width=0.7, color=[255/255,128/255,128/255])
ax1.set_xticks(np.arange(difference_sc_fish.shape[0])+0.0)
ax1.set_xticklabels(gene_sort, rotation='vertical');
ax1.set_yscale("log")
ax1.set_xlim(-1, len(gene_sort))
ax1.set_title('Mean count B:smFISH, R:sc')

ax2 = plt.subplot2grid((2,3), (1,0), rowspan=1, colspan=2)
ax2.bar(x_pos, difference_sc_fish[:,2], color='grey', alpha=0.5)
ax2.set_xticks(np.arange(difference_sc_fish.shape[0])+0.0)
ax2.set_xticklabels(gene_sort, rotation='vertical');
ax2.set_title('mean fish count / mean sc count')
ax2.hlines(np.mean(difference_sc_fish[:,2]), 0, 35)
ax2.set_xlim(-1, len(gene_sort))

ax3 = plt.subplot2grid((2,3), (0,2), rowspan=2, colspan=1)
ax3.bar(1,np.mean(difference_sc_fish[:,2]), yerr = np.std(difference_sc_fish[:,2])/np.sqrt(difference_sc_fish.shape[0]), color='grey', alpha=0.5, ecolor='black')
ax3.set_title('Mean count difference, SEM')
plt.tight_layout()

print('The mean count is on average {} times higher for the smFISH compated to the single cell, with a SEM of {}'.format(np.mean(difference_sc_fish[:,2]), np.std(difference_sc_fish[:,2])/np.sqrt(difference_sc_fish.shape[0])))
print('Assumes unbiased cell sampling')

In [None]:
fig, ax = plt.subplots(figsize=(4,2))

x_pos = np.array(range(difference_sc_fish.shape[0]))
ax.bar(x_pos+0.01, difference_sc_fish[:,0], alpha=1, linewidth=0, width=0.65, color=[128/255,128/255, 255/255])
ax.bar(x_pos+0.15, difference_sc_fish[:,1],  alpha=1, linewidth=0, width=0.65, color=[255/255,128/255,128/255])
ax.set_yscale("log")
ax.set_xticks(np.arange(len(gene_sort))+0.0)
ax.set_xticklabels(gene_sort, rotation='vertical', fontsize=8)
ax.set_xlim(-0.5, len(gene_sort))

ax.yaxis.set_major_formatter(FuncFormatter(lambda y,pos: ('{{:.{:1d}f}}'.format(int(np.maximum(-np.log10(y),0)))).format(y)))

ax.set_axisbelow(True)
ax.yaxis.grid(color='gray', linestyle='solid', alpha=0.3)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('none')


ax.set_title('Mean count B:smFISH, R:sc')
plt.tight_layout()

plt.savefig('Mean_count_comaprison_sc_fish.png', dpi=300)

In [None]:
plt.figure()

#plt.scatter(difference_sc_fish[:,0], difference_sc_fish[:,1])
plt.ylabel('Mean count scRNA-seq')
plt.xlabel('Mean count smFISH')

data = difference_sc_fish

#Exclude Plp1
data = np.delete(data, 18, axis=0 )

fit = np.polyfit(data[:,0], data[:,1], 1)
fit_fn = np.poly1d(fit)

plt.plot(data[:,0], data[:,1], '*', data[:,0], fit_fn(data[:,0]))
#plt.yscale('log')
#plt.xscale('log')

print(fit_fn)


In [None]:
#Split plot
difference_sc_fish = pd.DataFrame(data = np.zeros((len(gene_sort_fish),5)), index=gene_sort_fish, columns=['mean_fish', 'f_std', 'mean_sc','s_std', 'f/s'])
#for i, g in enumerate(gene_sort_fish):
#    difference_sc_fish[i,0] = np.mean(df_fish.loc[g])
#    difference_sc_fish[i,1] = np.mean(df_sc_cort.loc[g.split('_')[1]])
#    difference_sc_fish[i,2] = (np.mean(df_fish.loc[g]) / np.mean(df_sc_cort.loc[g.split('_')[1]]))

In [None]:
for g in gene_sort_fish:
    difference_sc_fish.loc[g] = [np.mean(df_fish.loc[g]), np.std(df_fish.loc[g]), np.mean(df_sc_cort.loc[g.split('_')[1]]), np.std(df_sc_cort.loc[g.split('_')[1]]), (np.mean(df_fish.loc[g]) / np.mean(df_sc_cort.loc[g.split('_')[1]]))]

In [None]:
fish_h =[]
sc_h = []
for n, i in enumerate(difference_sc_fish.index):
    if difference_sc_fish.iloc[n,4] > 1:
        fish_h.append(i)
    else:
        sc_h.append(i)

fig= plt.figure (figsize=(8,3))

ax1 = plt.subplot2grid((1,5), (0,0), rowspan=1, colspan=4)
x_pos = np.array(range(difference_sc_fish.loc[fish_h].shape[0]))
ax1.bar(x_pos+0.01, difference_sc_fish.loc[fish_h].iloc[:,0], alpha=1, linewidth=0, width=0.65, color=[255/255,201/255,34/255])
ax1.bar(x_pos+0.15, difference_sc_fish.loc[fish_h].iloc[:,2],  alpha=1, linewidth=0, width=0.65, color=[255/255,128/255,128/255])
#ax1.set_yscale("log")
ax1.set_xticks(np.arange(len(gene_sort))+0.4)
genes = []
for i in difference_sc_fish.loc[fish_h].index:
    genes.append(i.split('_')[1])
#[x.split['_'][1] for x in difference_sc_fish.index]
ax1.set_xticklabels(genes, rotation='vertical', fontsize=12)
ax1.set_xlim(-0.5, len(genes))

#ax1.yaxis.set_major_formatter(FuncFormatter(lambda y,pos: ('{{:.{:1d}f}}'.format(int(np.maximum(-np.log10(y),0)))).format(y)))

ax1.set_axisbelow(True)
ax1.yaxis.grid(color='gray', linestyle='solid', alpha=0.3)

ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.yaxis.set_ticks_position('left')
ax1.xaxis.set_ticks_position('none')
ax1.set_ylabel('Molecules')
ax1.set_title('Mean count')


ax2 = plt.subplot2grid((1,5), (0,4), rowspan=1, colspan=1)
x_pos = np.array(range(difference_sc_fish.loc[sc_h].shape[0]))
ax2.bar(x_pos+0.01, difference_sc_fish.loc[sc_h].iloc[:,2],  alpha=1, linewidth=0, width=0.65, color=[255/255,128/255,128/255])
ax2.bar(x_pos+0.15, difference_sc_fish.loc[sc_h].iloc[:,0], alpha=1, linewidth=0, width=0.65, color=[255/255,201/255,34/255])

#ax1.set_yscale("log")
ax2.set_xticks(np.arange(len(gene_sort))+0.4)
genes = []
for i in difference_sc_fish.loc[sc_h].index:
    genes.append(i.split('_')[1])
#[x.split['_'][1] for x in difference_sc_fish.index]
ax2.set_xticklabels(genes, rotation='vertical', fontsize=12)
ax2.set_xlim(-0.5, len(genes))

#ax1.yaxis.set_major_formatter(FuncFormatter(lambda y,pos: ('{{:.{:1d}f}}'.format(int(np.maximum(-np.log10(y),0)))).format(y)))

ax2.set_axisbelow(True)
ax2.yaxis.grid(color='gray', linestyle='solid', alpha=0.3)

ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)
ax2.yaxis.set_ticks_position('left')
ax2.xaxis.set_ticks_position('none')
ax2.set_title('Mean count')

blue_patch = matplotlib.patches.Patch(color=[255/255,201/255,34/255], label='smFISH')
red_patch = matplotlib.patches.Patch(color=[255/255,128/255,128/255], label='Single Cell')
plt.sca(ax1)
plt.legend(handles=[blue_patch, red_patch], loc=0)

plt.tight_layout()

plt.savefig('Comparison_smFISh_sc.svg')

### Dot saturation

In [None]:
#df count --> count per mean square um

In [None]:
#Mean cell size
mean_cell_area_px = np.mean(np.array(list(cell_size.values())))
mean_cell_area_um2 = (np.sqrt(mean_cell_area_px)*0.065)**2

#make a dataframe with the count per average cell area. 
df_mean_size = df_fish
df_mean_size = df_mean_size.divide([cell_size[i] for i in df_fish.columns])
df_mean_size = df_mean_size * mean_cell_area_px

In [None]:
fig, ax = plt.subplots(figsize = (10, 5))

handles=[]
for n, i in enumerate(df_mean_size.index):
    ax.plot(np.sort(df_mean_size.loc[i,:]), c=gene_color[n])
    h = matplotlib.patches.Patch(color=gene_color[n], label=i.split('_')[1])
    handles.append(h)
ax.set_xlim([5500, df_mean_size.shape[1]])
ax.set_ylabel('count / mean_cell_area ({} um2)'.format(round(mean_cell_area_um2, 2)))
ax.set_xlabel('cells')
ax.set_title('Max count per average cell area')

plt.legend(handles=handles, loc=5, fontsize=6, bbox_to_anchor=(1.1, .5))

plt.savefig('Max_count_5500.png')   

In [None]:
fig, ax = plt.subplots(figsize = (10, 5))


ax.plot(np.sort(df_fish.loc['Hybridization9_Lamp5']))

In [None]:
df_fish.max(axis=1)

In [None]:
df_fish.mean(axis=1)

In [None]:
np.median(df_fish.mean(axis=0))

In [None]:
df_sc_cort.mean(axis=1)

In [None]:
np.median(df_sc_cort.mean(axis=0))

In [None]:
np.sum(df_sc_cort.sum(axis=1))/df_sc_cort.shape[1]

## Marker selection plot

In [None]:
#make list and array of cell type labels
sc_CO_labels = []
for i in df_sc_cort_olig.columns:
    sc_CO_labels.append(sc_cort_olig_label[i])
sc_CO_labels_a = np.array(sc_CO_labels)

#Make df with count averages per cluster
df_count_average_sc = pd.DataFrame(index=df_sc_cort_olig.index, columns=np.unique(sc_CO_labels_a))

for l in np.unique(sc_CO_labels_a):
    #filt = np.sort(sc_CO_labels) == l
    filt = sc_CO_labels_a == l
    mean = np.array(df_sc_cort_olig.loc[:,filt].T.mean())
    #std = np.array(np.std(df_sort.loc[:,filt], axis=1))
    if np.isnan(np.sum(mean)) == False: #In case some clusters do not have cells
        df_count_average_sc[l] = mean


In [None]:
type_sort_sc_cort = ['Int1', 'Int2', 'Int3', 'Int4', 'Int5','Int6', 'Int7', 'Int8', 'Int9','Int10', 
       'Int11', 'Int12', 'Int13', 'Int14', 'Int15', 'Int16','CA1PyrInt',
       'S1PyrL23', 'S1PyrL4', 'S1PyrL5', 'S1PyrL5a', 'S1PyrL6',
       'S1PyrL6b', 'SubPyr','S1PyrDL','ClauPyr', 'CA1Pyr1','CA1PyrInt', 'CA2Pyr2',
                    'Mgl1', 'Mgl2',
                    'Vsmc',
                    'Peric',
                    'Vend1', 'Vend2',
                    'Epend',
                    'Astro1', 'Astro2',
                    'Choroid',
                     'PPR',
                    'OPC','COP','NFOL1','NFOL2','MFOL1','MFOL2',
                     'MOL1','MOL2', 'MOL3', 'MOL4', 'MOL5', 'MOL6',
                    ]

type_sort_sc_cort_trimmed = ['Int1', 'Int2', 'Int3', 'Int4', 'Int5','Int6', 'Int7', 'Int8', 'Int9','Int10', 
       'Int11', 'Int12', 'Int13', 'Int14', 'Int15', 'Int16',
       'S1PyrL23', 'S1PyrL4', 'S1PyrL5', 'S1PyrL5a', 'S1PyrL6',
       'Astro1', 'Astro2',
       'OPC','COP','NFOL1','NFOL2','MFOL1','MFOL2',
        'MOL1','MOL2', 'MOL3', 'MOL4', 'MOL5', 'MOL6',                      
        'Mgl1', 'Mgl2',                     
        'Choroid',   
        'Epend',                              
        'Peric',
        'Vend1', 'Vend2',
        'Vsmc',
         'PPR']

df_count_average_sc = df_count_average_sc.loc[gene_sort, type_sort_sc_cort_trimmed]

In [None]:
fig, ax = plt.subplots(figsize=(6,5))
z = df_count_average_sc.values
z = z/np.percentile(z, 99, 1)[:,None]
im  = ax.pcolor(z,cmap='viridis', vmax=1)

y_pos = np.arange(len(df_count_average_sc.index))
x_pos = np.arange(len(df_count_average_sc.columns))
ax.set_xticks(x_pos+0.5)
ax.set_xticklabels(df_count_average_sc.columns, rotation='vertical', fontsize=6)
ax.set_yticks(y_pos+0.5)
ax.set_yticklabels(df_count_average_sc.index, fontsize=6)
ax.invert_yaxis()

ax.set_aspect('equal')


cax = fig.add_axes([.91, 0.13, 0.03, 0.2])
colorbar = fig.colorbar(im, cax=cax, ticks=[0,1])
colorbar.set_ticklabels(['0', 'max'])


#plt.savefig('Marker_selection.svg')

#divider = make_axes_locatable(ax)
#axLabel = divider.append_axes("top", .5, pad=0, sharex=ax)
#axLabel.pcolor()

#optimal_sort_labels = np.array(optimal_sort_labels)
#axLabel.pcolor(optimal_sort_labels[None,:]/max(optimal_sort_labels), cmap='prism')
#axLabel.set_xlim(xmax=len(df.columns))
#axLabel.axis('off')

## Gini coefficient

In [None]:
#Function from:
#https://github.com/oliviaguest/gini/blob/master/gini.py
def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d:
    array = array.flatten()
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array += 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1,array.shape[0]+1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array)))

#One sample has all --> Gini = 1
#All samples have the same --> Gini = 0
#Uniform random --> Gini ~0.33

In [None]:
gini_sc = []
for g in gene_sort:
    gini_sc.append(gini(df_sc_cort.loc[g].values.astype(np.float64)))
    
gini_fish = []
for g in gene_sort_fish:
    gini_fish.append(gini(df_fish.loc[g].values.astype(np.float64)))
    
print('Average Gini coefficient for Single Cell: {}, SEM {}'.format(np.mean(gini_sc), np.std(gini_sc)/np.sqrt(len(gini_sc))))
print('Average Gini coefficient for smFISH: {}, SEM {}'.format(np.mean(gini_fish),  np.std(gini_fish)/np.sqrt(len(gini_fish))))

In [None]:
fig, ax = plt.subplots()
x_pos = range(len(gini_sc))
ax.bar(x_pos, gini_sc, color='r', alpha=0.5)
ax.bar(x_pos, gini_fish, color='b', alpha=0.5)
ax.set_xticks(np.arange(0, len(gini_sc), 1)+0.4)
ax.set_xticklabels(gene_sort, rotation='vertical')
ax.set_ylabel('Gini Coefficient')
ax.set_title('Red=SC, Blue=FISH')

plt.savefig('Gini_FISH_vs_SC.png')

Gini could be a good measure to see how uniformly a gene is expressed within a cell type

## Zero and count distribution

In [None]:
#To compare eficiency, take the X positive cells from the single cell, and the X highest expressing genes from the FISH
#Look at the ratio difference between the means. and calculate the mean ratio.

#Gene expression count distribution, for smFISH data compared to single cell data
#Percentage of cells with zero and higher than 10 molecules are given for smFISH data.

plt.figure(figsize=(20,12))
gs = plt.GridSpec(5,8)

#Contruct lists for later plotting
zeros = []
tens = []

#Match the number of cells of the single cell df and smFISH df
df_fish_resample = df_fish
df_sc_resample = df_sc_cort

len_fish = len(df_fish_resample.columns)
len_sc = len(df_sc_resample.columns)
if len_fish > len_sc:
    print('smFISH dataframe has more cells.\nThis dataframe will be randomly resampled to match the number of cels of the single cell dataframe')
    df_fish_resample = df_fish_resample.sample(n=len_sc, axis=1)
    x_max = len_sc
elif len_sc > len_fish:
    print('single cell dataframe has more cells.\nThis dataframe will be randomly resampled to match the number of cels of the smFISH dataframe')
    df_sc_resample = df_sc.sample(n=len_fish, axis=1)
    x_max = len_fish
    
#Plot comparison plot
for i in range(len(df_fish_resample.index)):
    ax = plt.subplot(gs[i])
    
    #Plot smFISH data
    ax.plot(np.sort(df_fish_resample.iloc[i])[::-1], c='blue', lw=2)
    
    #Convert Hybridization gene name to single cell gene name
    gene_name = gene_name_conversion[df_fish_resample.index[i]]
    #Plot single cell data
    ax.plot(np.sort(df_sc_resample.loc[gene_name])[::-1], c='red', lw=2)
    
    #ax.set_yscale("log")
    ax.tick_params(axis='y', labelsize=10)
    #plt.xscale("log")
    ax.tick_params(axis='x', labelsize=5)
    ax.set_xlim(0, x_max)
    ax.set_ylim(0, 100)
    total = len(df_fish_resample.columns)
    zerosp = ((total - np.count_nonzero(df_fish_resample.iloc[i])) / total) *100
    zeros.append(zerosp)
    tensp = (len(np.where(df_fish_resample.iloc[i] > 10)[0]) / total) *100
    tens.append(tensp)
    annotation = "smFISH\nCount=0:   {}%\nCount>10: {}%".format(round(zerosp), round(tensp))
    ax.text(.9,.9,annotation, horizontalalignment='right',verticalalignment='top',transform = ax.transAxes, color='blue')
    #ax.annotate(annotation,xy=(len(df_smFISH.iloc[i]), 150), size=8, color ='blue', zorder=10000) #-5000 for non-log x-scale
    ax.set_title(df_fish_resample.index[i], size=8)
    
ax = plt.subplot(gs[39])
ax.violinplot((zeros, tens))
ax.tick_params(axis='x', labelsize=10)
#ax.set_xticks([1,2], ('=0', '>10'))
ax.set_xticklabels(['','=0','', '>10'])
ax.tick_params(axis='y', labelsize=10)
ax.set_title('Distribution of zero and ten counts', size=8)

#plt.savefig('smFISH_count_distribution_log', dpi=300)

## Fit curve on count sort

In [None]:
from scipy.optimize import curve_fit
from scipy.misc import derivative

In [None]:
#FISH DATA

def func_fit(x, a, b, c):
    #return a * np.exp(-b * x) + c
    #return a* x**2 + b*x +c
    return (a / (x + b)) + c #<--- Best fit
    #return a * -np.log(x)+b

gene = gene_sort_fish[9]


plt.figure()
ydata = np.sort(df_fish.loc[gene])[::-1]
xdata = np.arange(len(np.sort(df_fish.loc[gene])[::-1]))

plt.plot(xdata, ydata, 'b-', label='data')

popt, pcov = curve_fit(func_fit, xdata, ydata)
plt.plot(xdata, func_fit(xdata, *popt), 'r-', label='fit')

plt.title(gene)

print(popt)
print('Formula: ({} / (x + {})) + {}'.format(*popt) )

In [None]:
#SINGLE CELL DATA

def func_fit(x, a, b, c):
    #return a * np.exp(-b * x) + c
    #return a* x**2 + b*x +c
    return (a / (x + b)) + c #<--- Best fit
    #return a * -np.log(x)+b

gene = gene_sort[5]


plt.figure()
ydata = np.sort(df_sc.loc[gene])[::-1]
xdata = np.arange(len(np.sort(df_sc.loc[gene])[::-1]))

plt.plot(xdata, ydata, 'b-', label='data')

popt, pcov = curve_fit(func_fit, xdata, ydata)
plt.plot(xdata, func_fit(xdata, *popt), 'r-', label='fit')

plt.title(gene)

print(popt)
print('Formula: ({} / (x + {})) + {}'.format(*popt) )

In [None]:
def f(x):
    return (5.23190408e+04 / (x + 4.45520934e+02))-7.44022002e+00

derivative(f, 1000)


In [None]:

def f(x):
    return (5.23190408e+04 / (x + 4.45520934e+02))-7.44022002e+00

plt.figure()
x = np.linspace(0, 1000, 1000)
plt.plot(x, f(x), 'r-', label='f(x)')

In [None]:
len(slice(10, 20, 1), slice(10, 20, 1))

In [None]:
import scipy.optimize

def f(x):
    
    return (52319 / (x + 445))-7


def fa(x):
    return derivative(f, x)
#(slice(10, 20, 1), slice(10, 20, 1))
rranges = (slice(-4, 4, 0.25), slice(-4, 4, 0.25))
q = scipy.optimize.brute(f, rranges )

In [None]:
def g(x):
    return x**2

def f(x):
    return (52319 / (x + 445))-7

def OA(x):
    return np.sqrt(x**2 + (f(x))**2)

def OAd(x):
    return derivative(OA, x)

def fa(x):
    return derivative(f, x)

scipy.optimize.minimize(OAd, [300,60], bounds=((1,1000),(1,100)), method='TNC')


In [None]:
from scipy.optimize import fmin_cobyla

P = (0, 0)

def f(x):
    return (5.23190408e+04 / (x + 4.45520934e+02))-7.44022002e+00

def objective(X):
    x,y = X
    return np.sqrt((x - P[0])**2 + (y - P[1])**2)

def c1(X):
    x,y = X
    return derivative(f, X)

X = fmin_cobyla(objective, x0=[60,300], cons=[c1])

#print( 'The minimum distance is {0:1.3f}'.format(objective(X)))

# Verify the vector to this point is normal to the tangent of the curve
# position vector from curve to point
v1 = np.array(P) - np.array(X)
# position vector
v2 = np.array([1, 2.0 * X[0]])
print ('dot(v1, v2) = ',np.dot(v1, v2))
print(X)

In [None]:
plt.figure()
x = np.linspace(0, 1000, 1000)
plt.plot(x, f(x), 'r-', label='f(x)')
plt.plot(P[0], P[1], 'bo', label='point')
plt.plot(X[0], X[1], 'ro', label='dot')
plt.plot([P[0], X[0]], [P[1], X[1]], 'b-', label='shortest distance')
plt.plot([X[0], X[0] + 1], [X[1], X[1] + 2.0 * X[0]], 'g-', label='tangent')
#plt.axis('equal')
plt.xlabel('x')
plt.ylabel('y')
plt.legend(loc='best')


In [None]:
xx = np.linspace(0, 1000, 1000)
plt.figure()
plt.plot(xx, func_fit(xx, *popt))
plt.xlim([-100, 6])

## Fit distribution

In [None]:
# Fit normal of negative binominal distributions

## Count distribution & comparison smFISH SC

In [None]:
data1 = df_fish.sample(n=len(df_sc_cort.columns), axis=1).iloc[13]
data2 = df_sc_cort.iloc[13]

f, ax = plt.subplots(figsize=(5, 5))
v1 = ax.violinplot(data1, showmeans=False, showextrema=False, showmedians=False)
#v1 = ax.violinplot(data1, points=50, positions=np.arange(0, len(data1)), widths=0.85,
#               showmeans=False, showextrema=False, showmedians=False)

for b in v1['bodies']:
    m = np.mean(b.get_paths()[0].vertices[:, 0])
    b.get_paths()[0].vertices[:, 0] = np.clip(b.get_paths()[0].vertices[:, 0], -np.inf, m)
    b.set_color('r')

v2=ax.violinplot(data2, showmeans=False, showextrema=False, showmedians=False)
  
#v2 = ax.violinplot(data2, points=50, positions=np.arange(0, len(data2)), widths=0.85,
#               showmeans=False, showextrema=False, showmedians=False)
for b in v2['bodies']:
    m = np.mean(b.get_paths()[0].vertices[:, 0])
    b.get_paths()[0].vertices[:, 0] = np.clip(b.get_paths()[0].vertices[:, 0], m, np.inf)
    b.set_color('b')

ax.set_ylim(bottom=-5)

In [None]:
plt.figure(figsize=(20,12))
gs = plt.GridSpec(5,8)


def dual_violin(i, gene, smFISH_data, sc_data):
    ax = plt.subplot(gs[i])
    
    try:
        smFISH_data = smFISH_data.loc[gene]
        sc_data = sc_data.loc[gene_name_conversion[gene]]

        v1 = ax.violinplot(smFISH_data, showmeans=False, showextrema=False, showmedians=False)
        for b in v1['bodies']:
            m = np.mean(b.get_paths()[0].vertices[:, 0])
            b.get_paths()[0].vertices[:, 0] = np.clip(b.get_paths()[0].vertices[:, 0], -np.inf, m)
            b.set_color('b')

        v2 = ax.violinplot(sc_data, showmeans=False, showextrema=False, showmedians=False)
        for b in v2['bodies']:
            m = np.mean(b.get_paths()[0].vertices[:, 0])
            b.get_paths()[0].vertices[:, 0] = np.clip(b.get_paths()[0].vertices[:, 0], m, np.inf)
            b.set_color('r')
    except Exception as e:
        print('Not all hybridizations present in dataset. Error: ', e)

    ax.set_yscale("log")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.set_ylim(bottom=0)
    ax.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')
    ax.set_title(gene, size=12)
    ax.set_xlabel('    smFISH | single cell', size=12)
    if i%8 == 0:
        ax.set_ylabel('count', size=12)
    ax.tick_params(axis='y', labelsize=10)
    plt.tight_layout()

    
for i, gene in enumerate(gene_sort_fish):
    dual_violin(i, gene, df_fish, df_sc_cort)
    
#plt.savefig('smFISH_comapre_SingleCell_log.png', dpi=150)

## Inter experiment effect

In [None]:
def resample_get_mean(df, sample):
    df = df
    resample_mean = df.sample(n=sample, axis=1).T.mean()
    iterations=1000
    for i in range(iterations):
        resample_mean = resample_mean + df.sample(n=sample, axis=1).T.mean()

    return resample_mean/(iterations+1)

resample_get_mean(df_fish, 2000)

### Channel effect

In [None]:
#List of all gene names of smFISH experiment including the hybridization number
genes_fish = list(gene_name_conversion.keys())

#Genes sorted per channel
channels = {
'cy3_genes' : ['Hybridization1_Foxj1','Hybridization2_Gfap','Hybridization3_Mfge8','Hybridization5_Cpne5', 
            'Hybridization6_Itpr2','Hybridization7_Ctps','Hybridization8_Pdgfra','Hybridization9_Lamp5','Hybridization10_Crh',
            'Hybridization11_Syt6','Hybridization12_Kcnip2','Hybridization13_Cnr1',],
'txred_genes' : ['Hybridization1_Aldoc','Hybridization2_Flt1','Hybridization3_Hexb','Hybridization5_Klk6',
              'Hybridization6_Bmp4','Hybridization7_Rorb','Hybridization8_Pthlh','Hybridization9_Lum','Hybridization10_Crhbp',
              'Hybridization11_Tmem2','Hybridization12_Slc32a1','Hybridization13_Plp1', ],
'cy5_genes' : ['Hybridization1_Tbr1','Hybridization2_Gad2','Hybridization3_Mrc1','Hybridization5_Acta2',
            'Hybridization6_Vip','Hybridization7_Sox10','Hybridization8_Serpinf1','Hybridization9_Anln','Hybridization10_Apln',
            'Hybridization12_Vtn','Hybridization13_Ttr']
}

#Removed (round4 and repeat Tbr1):
#Cy3: 'Hybridization4_Cnr1',
#TxRed: 'Hybridization4_Plp1', 
#Cy5:'Hybridization4_Vtn', 
#Cy5: 'Hybridization11_Tbr1'

In [None]:
stats.mstats.normaltest(list(hyb_zerosp.values()))

In [None]:
#Number of cells with zero count

#Create dicrionary with percentage of cells with a zero count for each gene
hyb_zerosp = {}
total = df_fish.shape[1]
for n, i in enumerate(df_fish.index):
    hyb_zerosp[i] = ((total - np.count_nonzero(df_fish.iloc[n])) / total) *100

fig, ax = plt.subplots()

ax.boxplot([[hyb_zerosp[i] for i in channels['cy3_genes']],
    [hyb_zerosp[i] for i in channels['txred_genes']],
    [hyb_zerosp[i] for i in channels['cy5_genes']]])
ax.set_xticklabels(['cy3', 'txred', 'cy5'])
ax.set_title('% of cells with count=0')

print('CY3 vs TxRed: ' ,stats.mannwhitneyu([hyb_zerosp[i] for i in channels['cy3_genes']], [hyb_zerosp[i] for i in channels['txred_genes']]))
print('CY3 vs Cy5:   ' ,stats.mannwhitneyu([hyb_zerosp[i] for i in channels['cy3_genes']], [hyb_zerosp[i] for i in channels['cy5_genes']]))
print('TxRed vs Cy5: ' ,stats.mannwhitneyu([hyb_zerosp[i] for i in channels['txred_genes']], [hyb_zerosp[i] for i in channels['cy5_genes']]))
print('CY3 vs TxRed: ' ,stats.ttest_ind([hyb_zerosp[i] for i in channels['cy3_genes']], [hyb_zerosp[i] for i in channels['txred_genes']]))
print('CY3 vs Cy5:   ' ,stats.ttest_ind([hyb_zerosp[i] for i in channels['cy3_genes']], [hyb_zerosp[i] for i in channels['cy5_genes']]))
print('TxRed vs Cy5: ' ,stats.ttest_ind([hyb_zerosp[i] for i in channels['txred_genes']], [hyb_zerosp[i] for i in channels['cy5_genes']]))

In [None]:
#Number of cells with POSITIVE count

#Create dicrionary with percentage of cells with a zero count for each gene
hyb_positivep = {}
total = df_fish.shape[1]
for n, i in enumerate(df_fish.index):
    hyb_positivep[i] = ((np.count_nonzero(df_fish.iloc[n])) / total) *100

fig, ax = plt.subplots()

ax.boxplot([[hyb_positivep[i] for i in channels['cy3_genes']],
    [hyb_positivep[i] for i in channels['txred_genes']],
    [hyb_positivep[i] for i in channels['cy5_genes']]])
ax.set_xticklabels(['cy3', 'txred', 'cy5'])
ax.set_title('% of cells with POSITIVE count')

print('CY3 vs TxRed: ' ,stats.mannwhitneyu([hyb_positivep[i] for i in channels['cy3_genes']], [hyb_positivep[i] for i in channels['txred_genes']]))
print('CY3 vs Cy5:   ' ,stats.mannwhitneyu([hyb_positivep[i] for i in channels['cy3_genes']], [hyb_positivep[i] for i in channels['cy5_genes']]))
print('TxRed vs Cy5: ' ,stats.mannwhitneyu([hyb_positivep[i] for i in channels['txred_genes']], [hyb_positivep[i] for i in channels['cy5_genes']]))

### Round efficiency effect

In [None]:
#List of all gene names of smFISH experiment including the hybridization number
genes_fish = list(gene_name_conversion.keys())


In [None]:
fig, ax = plt.subplots()

number = 26

ax.hist(df_fish.loc[genes_fish[number]], bins=df_fish.loc[genes_fish[number]].max())
ax.set_title(genes_fish[number])

# tSNE

In [None]:
pca = PCA()
pc = pca.fit_transform(X_log.T)
#pc_fc = pca.fit_transform(X_fc.T) #
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))

In [None]:
model = TSNE(n_iter=5000, perplexity=50, early_exaggeration=15, )
#Transpose the data to work on the cells
tSNE_points = model.fit_transform(pc[:,:33])
scatter_tSNE(tSNE_points)
#pickle.dump(tSNE_points, open( "temp_tSNE/tSNE_cortexFISH_33pc_perplex50_EarlyEx20.p", "wb" ))

In [None]:
model = TSNE(n_iter=5000, perplexity=50, early_exaggeration=20)
#Transpose the data to work on the cells
tSNE_points15 = model.fit_transform(pc_fc[:,:15])
scatter_tSNE(tSNE_points)

In [None]:
#Load pre-calcualted point set
#tSNE_points = pickle.load(open("tSNE_cortexFISH_35pc_perplex50_v2.p", 'rb'))#Non-clean dataset
tSNE_points = pickle.load(open("temp_tSNE/tSNE_cortexFISH_33pc_perplex50_EarlyEx15.p", 'rb'))

In [None]:
def scatter_tSNE(tSNE_data):
    
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(tSNE_data[:,0], tSNE_data[:,1], lw=0, s=40, alpha = 0.1, c = 'blue')
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('on')
    ax.axis('tight')
    
    return f, ax, sc


scatter_tSNE(tSNE_points)

In [None]:
f = plt.figure(figsize=(2, 2))
ax = plt.subplot(aspect='equal')
sc = ax.scatter(tSNE_points[:,0], tSNE_points[:,1], lw=0, s=4, alpha = 0.2, c = [55/250,171/250,200/250])
plt.xlim(-25, 25)
plt.ylim(-25, 25)
ax.axis('off')
ax.axis('tight')
#plt.savefig('mini_tSNE.png', dpi=300)

In [None]:
def  scatter_tSNE_cells_of_intrest(tSNE_data, cells_of_interest):
    plt.figure(figsize=(8,8))
    plt.subplot(aspect='equal')
    plt.scatter(tSNE_data[:,0], tSNE_data[:,1], lw=0, s=40, alpha = 0.1, c = 'grey')
    
    tSNE_coord_df = pd.DataFrame(data= np.zeros([1,df_fish.shape[1]]), index=['tSNE1'], columns=df_fish.columns)
    tSNE_coord_df.loc['tSNE1'] = tSNE_data[:,0]
    tSNE_coord_df.loc['tSNE2'] = tSNE_data[:,1]
    tSNE_coord_df = tSNE_coord_df.loc[:,cells_of_interest]
    plt.scatter(tSNE_coord_df.loc['tSNE1'], tSNE_coord_df.loc['tSNE2'],lw=0, s=40, alpha = 0.3, c = 'red')
    
    plt.xlim(-12, 12)
    plt.ylim(-12, 12)
    ax.axis('on')
    ax.axis('tight')

    
def return_top_expression(gene, top_n, df):
    gene = gene_name_conversion_reverse[gene]
    top_expression = df.iloc[:, np.argsort(df.loc[gene])[::-1]].iloc[:,:top_n]
    #mean_expression = df.iloc[:, np.argsort(df.sum())[::-1]].iloc[:,:top_n]
    return(top_expression.columns)
    #return(mean_expression.columns)

#Plot the top n expressing cells 
scatter_tSNE_cells_of_intrest(tSNE_points, return_top_expression('Foxj1', 100, df_fish) )

## tSNE Comparison

### Single Cell all genes

In [None]:
#Transform data
sc_allg = df_sc_cort_all_genes.values.astype(np.float64)
    #normalize
sc_allg_norm = sc_allg.sum(0).mean() * (sc_allg/sc_allg.sum(0))
    #Log transform
sc_allg_log = np.log2(sc_allg_norm+1)
sc_allg_log.shape

In [None]:
pca_scall = PCA()
pc_scall = pca_scall.fit_transform(sc_allg_log.T)
#pc_fc = pca.fit_transform(X_fc.T) #
plt.figure()
plt.plot(np.cumsum(pca_scall.explained_variance_ratio_))

In [None]:
model = TSNE(n_iter=5000, perplexity=50, early_exaggeration=15)
#Transpose the data to work on the cells
tSNE_points_scall = model.fit_transform(pc_scall[:,:33])
scatter_tSNE(tSNE_points_scall)
plt.savefig('tSNE_SC_All-genes_33pc.png', dpi=600)


In [None]:
pickle.dump(tSNE_points_scall, open( "temp_tSNE/tSNE_scall-genes_33pc_perplex50_EarlyEx20.p", "wb" ))

In [None]:
plt.figure(figsize=(10,10))
points = np.zeros((len(df_sc_cort_all_genes.columns),2))
for i, n in enumerate(df_sc_cort_all_genes.columns):
    points[i,:] = tSNE_points_scall[df_sc_cort_all_genes.columns.get_loc(n),:]
print(len(points))

#get list of labels
sc_cort_labels = [sc_cor_cell_label_2[i] for i in df_sc_cort_all_genes.columns]
#Convert labels to integers in dictionary
sc_cort_cell_label_2_int = {}
for n,  i in enumerate(np.unique(sc_cort_labels)):
    sc_cort_cell_label_2_int[i] = n
#Convert list of labels to the corresponding integer label
sc_cort_labels = np.array([sc_cort_cell_label_2_int[i] for i in sc_cort_labels])



plt.scatter(points[:,0], points[:,1],c=plt.cm.prism(sc_cort_labels/max(sc_cort_labels)), lw=0, alpha=1, s=20)
plt.title('SC all genes')
plt.savefig('tSNE_SC_all-genes_labels.png', dpi=600)

### Single Cell 35 genes

In [None]:
#Transform data
df_sc_cort_clean = df_sc_cort.loc[:, (df_sc_cort!=0).any(axis=0)] #some cells have all zeros
sc_35g = df_sc_cort_clean.values.astype(np.float64)
    #normalize
sc_35g_norm = sc_35g.sum(0).mean() * (sc_35g/sc_35g.sum(0))
    #Log transform
sc_35g_log = np.log2(sc_35g_norm+1)
sc_35g_log.shape

In [None]:
pca_sc35 = PCA()
pc_sc35 = pca_sc35.fit_transform(sc_35g_log.T)
#pc_fc = pca.fit_transform(X_fc.T) #
plt.figure()
plt.plot(np.cumsum(pca_sc35.explained_variance_ratio_))

In [None]:
model = TSNE(n_iter=5000, perplexity=50, early_exaggeration=15)
#Transpose the data to work on the cells
tSNE_points_sc35 = model.fit_transform(pc_sc35[:,:33])
scatter_tSNE(tSNE_points_sc35)
plt.savefig('tSNE_SC_35-genes_33pc.png', dpi=600)

In [None]:
pickle.dump(tSNE_points_sc35, open( "temp_tSNE/tSNE_sc-35-genes_33pc_perplex50_EarlyEx20.p", "wb" ))

In [None]:
plt.figure(figsize=(10,10))
points = np.zeros((len(df_sc_cort_clean.columns),2))
for i, n in enumerate(df_sc_cort_clean.columns):
    points[i,:] = tSNE_points_sc35[df_sc_cort_clean.columns.get_loc(n),:]
print(len(points))

#get list of labels
sc_cort_labels = [sc_cor_cell_label_2[i] for i in df_sc_cort_clean.columns]
#Convert labels to integers in dictionary
sc_cort_cell_label_2_int = {}
for n,  i in enumerate(np.unique(sc_cort_labels)):
    sc_cort_cell_label_2_int[i] = n
#Convert list of labels to the corresponding integer label
sc_cort_labels = np.array([sc_cort_cell_label_2_int[i] for i in sc_cort_labels])    

plt.scatter(points[:,0], points[:,1],c=plt.cm.prism(sc_cort_labels/max(sc_cort_labels)), lw=0, alpha=1, s=20)
plt.title('tSNE SC 35 genes')
plt.savefig('tSNE_SC_35-genes_labels.png', dpi=600)

## tSNE on size normalized data

In [None]:
pca_size = PCA()
pc_size = pca_size.fit_transform(X_size_log.T)
#pc_fc = pca.fit_transform(X_fc.T) #
plt.figure()
plt.plot(np.cumsum(pca_size.explained_variance_ratio_))

In [None]:
model = TSNE(n_iter=1000, perplexity=50, early_exaggeration=20)
#Transpose the data to work on the cells
tSNE_points_size = model.fit_transform(pc_size[:,:30])
scatter_tSNE(tSNE_points_size)

In [None]:
model = TSNE(n_iter=1000, perplexity=40, early_exaggeration=20)
#Transpose the data to work on the cells
tSNE_points_size2 = model.fit_transform(pc_size[:,:30])
scatter_tSNE(tSNE_points_size2)

In [None]:
model = TSNE(n_iter=1000, perplexity=60, early_exaggeration=20)
#Transpose the data to work on the cells
tSNE_points_size2 = model.fit_transform(pc_size[:,:30])
scatter_tSNE(tSNE_points_size2)

In [None]:
model = TSNE(n_iter=1000, perplexity=50, early_exaggeration=30)
#Transpose the data to work on the cells
tSNE_points_size3 = model.fit_transform(pc_size[:,:30])
scatter_tSNE(tSNE_points_size3)

In [None]:
model = TSNE(n_iter=1000, perplexity=50, early_exaggeration=40)
#Transpose the data to work on the cells
tSNE_points_size4 = model.fit_transform(pc_size[:,:30])
scatter_tSNE(tSNE_points_size4)

In [None]:
model = TSNE(n_iter=1000, perplexity=50, early_exaggeration=50)
#Transpose the data to work on the cells
tSNE_points_size4 = model.fit_transform(pc_size[:,:30])
scatter_tSNE(tSNE_points_size4)

## tSNE Cytograph

In [None]:
import cytograph

In [None]:
import sys

In [None]:
from cytograph import TSNE as cytoTSNE

In [None]:
sys.path.insert(0, '/home/lars/programs/cytograph/bhtsne')

In [None]:
cytograph.TSNE()

In [None]:
model = bhtsne()
tSNE_points_cyto = model.layout(pc[:,:33])
scatter_tSNE(tSNE_points_cyto)

In [None]:
model = cytoTSNE(max_iter=5000, perplexity=50, theta=.75)
tSNE_points_cyto = model.layout(pc[:,:33])
scatter_tSNE(tSNE_points_cyto)

In [None]:
model = cytoTSNE(max_iter=5000, perplexity=30, theta=.75)
tSNE_points_cyto = model.layout(pc[:,:33])
scatter_tSNE(tSNE_points_cyto)

In [None]:
model = cytoTSNE(max_iter=5000, perplexity=70, theta=.75)
tSNE_points_cyto = model.layout(pc[:,:33])
scatter_tSNE(tSNE_points_cyto)

In [None]:
model = TSNE(n_iter=5000, perplexity=50, early_exaggeration=20, )
#Transpose the data to work on the cells
tSNE_points = model.fit_transform(pc[:,:33])
scatter_tSNE(tSNE_points)

### tSNE on added feature dataset

In [None]:
pca = PCA()
pc2 = pca.fit_transform(df_fish_addfeature.T)
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))

In [None]:
model = TSNE(n_iter=5000, perplexity=50, early_exaggeration=20, )
#Transpose the data to work on the cells
tSNE_points_feat = model.fit_transform(pc2[:,:33])
scatter_tSNE(tSNE_points_feat)
#pickle.dump(tSNE_points, open( "temp_tSNE/tSNE_cortexFISH_33pc_perplex50_EarlyEx20.p", "wb" ))

### tSNE on dataset with locations

In [None]:
pca = PCA()
pc3 = pca.fit_transform(df_fish_location.T)
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))

In [None]:
model = TSNE(n_iter=5000, perplexity=50, early_exaggeration=20, )
#Transpose the data to work on the cells
tSNE_points_location = model.fit_transform(pc3[:,:33])
scatter_tSNE(tSNE_points_location)

### Expression on tSNE

In [None]:
#Expression plot on tSNE points
plt.figure(figsize=(20,12))
gs = plt.GridSpec(5,8)

dataset = X_log

for i, gene in enumerate(gene_sort_fish):
    plt.subplot(gs[i])
    filt = df_fish.index == gene
    levels = dataset[filt,:].flat[:]
    levels = np.clip(levels, np.percentile(levels, 10), np.percentile(levels,90))
    plt.scatter(tSNE_points[:,0], tSNE_points[:,1],color=plt.cm.Reds(levels/max(levels)), lw=0, alpha=.5,s=2)
    for label in plt.gca().axes.get_xticklabels():
        label.set_visible(False)
    for label in plt.gca().axes.get_yticklabels():
        label.set_visible(False)

    plt.title(gene.split('_')[1])
    plt.tight_layout()
    

#plt.savefig('Gene_expression_log.png', dpi=600)

In [None]:
plt.figure(figsize=(5,6))

dataset = X_log
gene = 'Hybridization13_Plp1'

filt = df_fish.index == gene
levels = dataset[filt,:].flat[:]
levels = np.clip(levels, np.percentile(levels, 10), np.percentile(levels,90))
plt.scatter(tSNE_points[:,0], tSNE_points[:,1],color=plt.cm.coolwarm(levels/max(levels)), lw=0, alpha=1,s=10)
plt.axis('off')

title = gene.split('_')[1]
plt.title(title)
plt.tight_layout();
plt.gca().set_aspect('equal')

plt.savefig('tSNE_{}.png'.format(title), dpi=600)

In [None]:
filt

# Clustering

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth, get_bin_seeds
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

## Cluster number estimation

### Elbow method (Kmeans)

In [None]:
Ks = range(1, 50)
km = [KMeans(n_clusters=i) for i in Ks]
score = [km[i].fit(X_log.T).score(X_log.T) for i in range(len(km))]

plt.figure()
plt.plot(Ks, score)

Suggests ~8 clusters

### Silhouette plot (kmeans/AggClust)

In [None]:
#Mostly copied from: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

range_n_clusters = np.arange(2,50,1)
#range_n_clusters = [40]

data = X_log.T

plt.figure(figsize=(18,(10*len(range_n_clusters))))
    
silhouette_avg_dict = {}
    
for row, n_clusters in enumerate(range_n_clusters):
    ax1 = plt.subplot2grid((len(range_n_clusters), 2), (row,0), rowspan=1, colspan=1)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
    
    #CLUSTERING Kmeans or AgglomerativeClustering
    #cluster_algorithm = KMeans(n_clusters=n_clusters)
    cluster_algorithm = AgglomerativeClustering(n_clusters=n_clusters)
    cluster_labels = cluster_algorithm.fit_predict(data)

    silhouette_avg = silhouette_score(data, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    silhouette_avg_dict[n_clusters] = silhouette_avg
    
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(data, cluster_labels)
    
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
  
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        
        color = cm.spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the {} clusters.".format(n_clusters))
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax1.set_ylim(top=y_upper)

    # 2nd Plot showing the actual clusters formed
    ax2 = plt.subplot2grid((len(range_n_clusters), 2), (row,1), rowspan=1, colspan=1)
    colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(tSNE_points[:, 0], tSNE_points[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors)

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("tSNE 1")
    ax2.set_ylabel("tSNE 2")

    plt.suptitle(("Silhouette analysis for AgglomerativeClustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

#plt.savefig('Silhouette_analysis_AggClust_25-50clusters.png')
    

For n_clusters = 2 The average silhouette_score is : 0.154394374405  
For n_clusters = 3 The average silhouette_score is : 0.113730662345  
For n_clusters = 4 The average silhouette_score is : 0.0884204249791  
For n_clusters = 5 The average silhouette_score is : 0.0899432195023  
For n_clusters = 6 The average silhouette_score is : 0.0634700436779  
For n_clusters = 7 The average silhouette_score is : 0.0699286339542  
For n_clusters = 8 The average silhouette_score is : 0.0728726583517  
For n_clusters = 9 The average silhouette_score is : 0.0686104075915  
For n_clusters = 10 The average silhouette_score is : 0.0682938210793  
For n_clusters = 11 The average silhouette_score is : 0.0572151478435  
For n_clusters = 12 The average silhouette_score is : 0.0387362936253  
For n_clusters = 13 The average silhouette_score is : 0.0400175222328  
For n_clusters = 14 The average silhouette_score is : 0.0402744060642  
For n_clusters = 15 The average silhouette_score is : 0.0397037932305  
For n_clusters = 16 The average silhouette_score is : 0.0407893606547  
For n_clusters = 17 The average silhouette_score is : 0.0403650767148  
For n_clusters = 18 The average silhouette_score is : 0.0364709446191  
For n_clusters = 19 The average silhouette_score is : 0.0375083932611  
For n_clusters = 20 The average silhouette_score is : 0.0382517156834  
For n_clusters = 21 The average silhouette_score is : 0.0400260783349  
For n_clusters = 22 The average silhouette_score is : 0.0366484411559  
For n_clusters = 23 The average silhouette_score is : 0.0373432210509  
For n_clusters = 24 The average silhouette_score is : 0.0385092916842  
For n_clusters = 25 The average silhouette_score is : 0.0317282617718  
For n_clusters = 26 The average silhouette_score is : 0.0306447764943  
For n_clusters = 27 The average silhouette_score is : 0.0308523232573  
For n_clusters = 28 The average silhouette_score is : 0.0309823765791  
For n_clusters = 29 The average silhouette_score is : 0.0316622427224  
For n_clusters = 30 The average silhouette_score is : 0.0324194059259  
For n_clusters = 31 The average silhouette_score is : 0.020692519216  
For n_clusters = 32 The average silhouette_score is : 0.0220986579542  
For n_clusters = 33 The average silhouette_score is : 0.0232650275977  
For n_clusters = 34 The average silhouette_score is : 0.0242911741337  
For n_clusters = 35 The average silhouette_score is : 0.0249045083832  
For n_clusters = 36 The average silhouette_score is : 0.0252260679185  
For n_clusters = 37 The average silhouette_score is : 0.0259400728089  
For n_clusters = 38 The average silhouette_score is : 0.0272363844287  
For n_clusters = 39 The average silhouette_score is : 0.0277905739646  
For n_clusters = 40 The average silhouette_score is : 0.0285407236705  
For n_clusters = 41 The average silhouette_score is : 0.0279043373405  
For n_clusters = 42 The average silhouette_score is : 0.0282603043511  
For n_clusters = 43 The average silhouette_score is : 0.0263960411557  
For n_clusters = 44 The average silhouette_score is : 0.025646670401  
For n_clusters = 45 The average silhouette_score is : 0.0262094235034  
For n_clusters = 46 The average silhouette_score is : 0.0265275599925  
For n_clusters = 47 The average silhouette_score is : 0.027168671357  
For n_clusters = 48 The average silhouette_score is : 0.0275528168165  
For n_clusters = 49 The average silhouette_score is : 0.027425381776  

In [None]:
plt.figure()
plt.plot(range(2, len(silhouette_avg_dict)+2), list(silhouette_avg_dict.values()))
plt.ylabel('Average silhouette width')
plt.xlabel('Clusters')

### Gap statistics

In [None]:
#From here: https://anaconda.org/milesgranger/gap-statistic/notebook

def optimalK(data, nrefs=3, maxClusters=15):
    """
    Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
    Params:
        data: ndarry of shape (n_samples, n_features)
        nrefs: number of sample reference datasets to create
        maxClusters: Maximum number of clusters to test for
    Returns: (gaps, optimalK)
    """
    gaps = np.zeros((len(range(1, maxClusters)),))
    resultsdf = pd.DataFrame({'clusterCount':[], 'gap':[]})
    for gap_index, k in enumerate(range(1, maxClusters)):

        # Holder for reference dispersion results
        refDisps = np.zeros(nrefs)

        # For n references, generate random sample and perform kmeans getting resulting dispersion of each loop
        for i in range(nrefs):
            
            # Create new random reference set by shuffeling the data
            #randomReference = data #make a copy
            #np.random.shuffle(randomReference) #But is does not work
            randomReference = np.random.random_sample(size=data.shape) #Original function from Miles Granger
            
            # Fit to it
            km = KMeans(k)
            km.fit(randomReference)
            #km.fit(randomReference)
            
            refDisp = km.inertia_
            refDisps[i] = refDisp

        # Fit cluster to original data and create dispersion
        km = KMeans(k)
        km.fit(data)
        
        origDisp = km.inertia_

        # Calculate gap statistic
        gap = np.log(np.mean(refDisps)) - np.log(origDisp)

        # Assign this loop's gap statistic to gaps
        gaps[gap_index] = gap
        
        resultsdf = resultsdf.append({'clusterCount':k, 'gap':gap}, ignore_index=True)

    return (gaps.argmax() + 1, resultsdf)  # Plus 1 because index of 0 means 1 cluster is optimal, index 2 = 3 clusters are optimal

k, gapdf = optimalK(X_log.T, nrefs=5, maxClusters=70)    

In [None]:
plt.figure()
plt.plot(gapdf.clusterCount, gapdf.gap, linewidth=3)
plt.scatter(gapdf[gapdf.clusterCount == k].clusterCount, gapdf[gapdf.clusterCount == k].gap, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
#plt.savefig('Gap_statistics_0-70clusters.png')

## Other clustering methods

In [None]:
#DBscan
db = DBSCAN(eps=0.9, min_samples=30, metric='jaccard', algorithm='brute').fit(X_log.T) #correlation
#Jaccard eps=0.999, min_samples=6

print(db.labels_.shape, len(np.unique(db.labels_)), db.labels_)
Counter(db.labels_)

In [None]:
#Mean Shift
bandwidth = estimate_bandwidth(X_log.T, quantile=0.1, n_samples=50)
seeds = get_bin_seeds(X_log.T, bin_size=bandwidth)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, seeds=seeds)
ms.fit(X_log.T)
print(ms.labels_.shape, len(np.unique(ms.labels_)),np.unique(ms.labels_), ms.labels_)
Counter(ms.labels_)

In [None]:
# Affinity Propagation
af = AffinityPropagation(damping=0.99, max_iter=200, convergence_iter=15, 
                         copy=True, preference=.9, affinity='euclidean').fit(X_log.T)
print(af.labels_.shape, len(np.unique(af.labels_)),np.unique(af.labels_), af.labels_)

In [None]:
# KMeans
kmeans30 = KMeans(n_clusters=30, n_init=20, max_iter=600, n_jobs=6)
kmeans30.fit(X_log.T)

## Hierarchical clustering all cells

In [None]:
ac4 = AgglomerativeClustering(n_clusters=4).fit(X_log.T)
ac5 = AgglomerativeClustering(n_clusters=5).fit(X_log.T)
ac6 = AgglomerativeClustering(n_clusters=6).fit(X_log.T)
ac7 = AgglomerativeClustering(n_clusters=7).fit(X_log.T)
ac8 = AgglomerativeClustering(n_clusters=8).fit(X_log.T)
ac9 = AgglomerativeClustering(n_clusters=9).fit(X_log.T)
ac10 = AgglomerativeClustering(n_clusters=10).fit(X_log.T)
ac11 = AgglomerativeClustering(n_clusters=11).fit(X_log.T)
ac12 = AgglomerativeClustering(n_clusters=12).fit(X_log.T)

In [None]:
#Hierarchical
#ac20 = AgglomerativeClustering(n_clusters=20).fit(X_log.T)
#ac21 = AgglomerativeClustering(n_clusters=21).fit(X_log.T)
#ac22 = AgglomerativeClustering(n_clusters=22).fit(X_log.T)
#ac23 = AgglomerativeClustering(n_clusters=23).fit(X_log.T)
#ac24 = AgglomerativeClustering(n_clusters=24).fit(X_log.T)
#ac25 = AgglomerativeClustering(n_clusters=25).fit(X_log.T)
#ac26 = AgglomerativeClustering(n_clusters=26).fit(X_log.T)
#ac27 = AgglomerativeClustering(n_clusters=27).fit(X_log.T)
#ac28 = AgglomerativeClustering(n_clusters=28).fit(X_log.T)
#ac29 = AgglomerativeClustering(n_clusters=29).fit(X_log.T)
ac30 = AgglomerativeClustering(n_clusters=30).fit(X_log.T)
ac31 = AgglomerativeClustering(n_clusters=31).fit(X_log.T)
ac32 = AgglomerativeClustering(n_clusters=32).fit(X_log.T)
ac33 = AgglomerativeClustering(n_clusters=33).fit(X_log.T)
ac34 = AgglomerativeClustering(n_clusters=34).fit(X_log.T)
ac35 = AgglomerativeClustering(n_clusters=35).fit(X_log.T)
ac36 = AgglomerativeClustering(n_clusters=36).fit(X_log.T)
ac37 = AgglomerativeClustering(n_clusters=37).fit(X_log.T)
ac38 = AgglomerativeClustering(n_clusters=38).fit(X_log.T)
ac39 = AgglomerativeClustering(n_clusters=39).fit(X_log.T)
ac40 = AgglomerativeClustering(n_clusters=40).fit(X_log.T)
ac41 = AgglomerativeClustering(n_clusters=41).fit(X_log.T)
ac42 = AgglomerativeClustering(n_clusters=42).fit(X_log.T)
ac43 = AgglomerativeClustering(n_clusters=43).fit(X_log.T)
ac44 = AgglomerativeClustering(n_clusters=44).fit(X_log.T)
ac45 = AgglomerativeClustering(n_clusters=45).fit(X_log.T)

### Clustering all types

In [None]:
ac30 = AgglomerativeClustering(n_clusters=30, affinity='euclidean', linkage='ward').fit(X_log.T)

In [None]:
ac30_pc33 = AgglomerativeClustering(n_clusters=30).fit(pc[:,:33])
ac30_pc20 = AgglomerativeClustering(n_clusters=30).fit(pc[:,:20])

In [None]:
#Clustering on fold change data. Work but a third of the cells end up in a cluster of low expressing cells. 
ac30_fc = AgglomerativeClustering(n_clusters=30).fit(np.array(df_fish_fc.values).T)

In [None]:
ac47 = AgglomerativeClustering(n_clusters=47).fit(np.array(df_fish.values).T)

In [None]:
#Clustering on the raw count data. Similar results as on the Fold Change data
ac30_X = AgglomerativeClustering(n_clusters=30).fit(X.T)
ac30_Norm = AgglomerativeClustering(n_clusters=30).fit(X_norm.T)

In [None]:
ac30_sqrt = AgglomerativeClustering(n_clusters=30).fit(X_sqrt.T)
ac6_sqrt = AgglomerativeClustering(n_clusters=6).fit(X_sqrt.T)

In [None]:
#Save model as pickle file
pickle.dump(ac30, open('Clustering_model_ac30.pkl', 'wb'))
model_pkl.close()

In [None]:
#Load model
ac30 = pickle.load(open('Clustering_model_ac30.pkl', 'rb'))

In [None]:
ac30

### Clustering Major - Minor types

In [None]:
major_type_markers = ['Hybridization12_Slc32a1',
                     'Hybridization1_Tbr1',
                      'Hybridization7_Sox10',
                      'Hybridization1_Aldoc',
                      'Hybridization3_Hexb',
                      'Hybridization3_Mrc1',
                      'Hybridization1_Foxj1',
                      'Hybridization12_Vtn',
                      'Hybridization2_Flt1',
                      'Hybridization5_Acta2',
                      'Hybridization9_Lum']

df_fish_log_major = df_fish_log.loc[major_type_markers,:]
ac11_major = AgglomerativeClustering(n_clusters=11).fit(df_fish_log_major.T)

In [None]:
ac11_all = AgglomerativeClustering(n_clusters=11).fit(df_fish_log.T)

Major cell type clustering does not yield the 11 intended clusters. For instance, the oligodendrocytes are split into two clusters

### Clustering on added featrues dataset

In [None]:
ac_addfeatures = AgglomerativeClustering(n_clusters=35).fit(np.array(df_fish_addfeature.values).T)

### Clustering on dataset with XY location

In [None]:
ac_location = AgglomerativeClustering(n_clusters=35).fit(np.array(df_fish_location.values).T)

### Clustering on size corrected data

In [None]:
ac_size = AgglomerativeClustering(n_clusters=50).fit(np.array(df_fish_size_sqrt.values).T)

## Two step clustering log data

In [None]:
#Major cell types
ac6 = AgglomerativeClustering(n_clusters=6).fit(X_log.T)
#0: L6, Hippocampus, Bad neurons
#1: L4
#2: Int
#3: Oligo
#4: Astocytes, other glia
#5: L2/3, L5

major_label_cells = gen_labels(df_fish, ac6)[1]

In [9]:
#Second step clustering, major 0
major0_label_cells = gen_labels(df_fish, ac6)[1]
df_fish_log_major0 = df_fish_log.loc[:,major0_label_cells[0]]

ac_major0_5 = AgglomerativeClustering(n_clusters=5).fit(df_fish_log_major0.T)
#0_0: bleached cells
#0_1: L6
#0_2: L5 (Cnr1, Cpne5)
#0_3: Bad cells
#0_4: Hippocampal ?Neurons

NameError: name 'gen_labels' is not defined

In [None]:
#Second step clustering, major 1
major1_label_cells = gen_labels(df_fish, ac6)[1]
df_fish_log_major1 = df_fish_log.loc[:,major1_label_cells[1]]

#NO FURTHER SPLIT NESSECARY
#1: L4

#Hack to generate an object with a .labels_ attribute, for downstream handling
class ac_major1_1():
    pass

ac_major1_1.labels_ = np.array([0] * df_fish_log_major1.shape[1])

In [None]:
#Second step clustering, major 2
df_fish_log_major2 = df_fish_log.loc[:,major_label_cells[2]]

ac_major2_10 = AgglomerativeClustering(n_clusters=10).fit(df_fish_log_major2.T)

#10 Clusters looks ok but difficult to decide
#2_0: int - Crh, Cnr1
#2_1: int - Pthlh, Kcnip2
#2_2: int - Kcnip2 (Caudoputamen)
#2_3: int - Cpne5
#2_4: int - Crhbp, Cpne5
#2_5: int - Pthlh, Cnr1, Vip, Crh
#2_6: int - Crh, Vip
#2_7: int - Cnr1
#2_8: int - Crhbp
#2_9: Oligodendrocytes


In [None]:
#Second step clustering, major 3
df_fish_log_major3 = df_fish_log.loc[:,major_label_cells[3]]

ac_major3_5 = AgglomerativeClustering(n_clusters=5).fit(df_fish_log_major3.T)
#3_0: Olig - Mature, Anln
#3_1: Olig - Intermediate, Cpts
#3_2: Olig - COP, Bmp4
#3_3: Olig - (unclear)
#3_4: Olig - Newly formed, Itpr2, Tmem2

In [None]:
#Second step clustering, major 4
df_fish_log_major4 = df_fish_log.loc[:,major_label_cells[4]]

ac_major4_12 = AgglomerativeClustering(n_clusters=12).fit(df_fish_log_major4.T)

#4_0: Endothelial (bad profile)
#4_1: Astro 2
#4_2: Astro 1
#4_3: Choroid plexus? 
#4_4: Endothelial
#4_5: Ependymal
#4_6: Endo1 (Artifact, all on right edge)
#4_7: Astro 1 (Pia)
#4_8: Ependymal (between CC and Hipp)
#4_9: OPC
#4_10: Astro1 Mfge8+ subset
#4_11: VLMC



In [None]:
#Second step clustering, major 5
df_fish_log_major5 = df_fish_log.loc[:,major_label_cells[5]]

ac_major5_2 = AgglomerativeClustering(n_clusters=2).fit(df_fish_log_major5.T)

#5_0: L2/3
#5_1: L5

In [None]:
def combine_sub_clusters(list_models, list_dfs):
    #Cell labels: Dictionary of each cell with label as key
    cell_labels = {}
    
    n_clusters = 0 
    for i, j in zip(list_dfs, list_models):
        j = j.labels_ + n_clusters
        n_clusters += len(np.unique(j))
        cell_labels.update(dict(zip(i.columns, j)))
    
    #Label cells: dictionary with all labels and a list of cells
    label_cells = {}
    for l in np.unique(list(cell_labels.values())):
        label_cells[l] = []
    for i in cell_labels:
        label_cells[cell_labels[i]].append(i)

    #combine df
    df_combined = pd.concat(list_dfs, axis=1)
    
    cellID = df_combined.columns
    
    labels =[]
    for i in df_combined.columns:
        labels.append(cell_labels[i])
    labels_a = np.array(labels)
        
    return cell_labels, label_cells, df_combined, cellID, labels, labels_a

#cell_labels, label_cells, combine_df, cellID, labels, labels_a = combine_sub_clusters([ac_major0_5, ac_major1_1, ac_major2_10, ac_major3_5, ac_major4_12, ac_major5_2], 
#                     [df_fish_log_major0, df_fish_log_major1, df_fish_log_major2, df_fish_log_major3, df_fish_log_major4, df_fish_log_major5])

cluster_labels = [
'Bleached-cells', 'Pyramidal-L6', 'Pyramidal-L5-1', 'Bad-cells', 'Hippocampal',
'Pyramidal-L4',
'Int_Crh-Cnr1', 'Int_Pthlh-Kcnip2', 'Int-Kcnip2', 'Int-Cpne5', 'Int-Crhbp-Cpne5', 'Int-Pthlh-Cnr1-Vip-Crh', 'Int-Crh-Vip', 'Int-Cnr1', 'Int-Crhbp', 'Olig-IntCluster',
'Olig-Mature', 'Olig-Intermediate', 'Olig-COP', 'Olig-unclear', 'Olig-NF',
'Endothelial-bad', 'Astro2-1', 'Astro2-2', 'Coroid-plexus', 'Endothelial', 'Ependymal', 'Endothelial-artefact', 'Astro1-pia', 'Ependymal-ventricle', 'OPC', 'Astro1-Mfge8+', 'VLMC',
'Pyramidal-L2/3', 'Pyramidal-L5-2']
#Missing: Microglia, Pericytes(vtn)


#Sort the clusters manually
cluster_sort_labels = ['Int-Crhbp-Cpne5','Int-Crhbp','Int_Pthlh-Kcnip2','Int_Crh-Cnr1','Int-Pthlh-Cnr1-Vip-Crh',
                       'Int-Cnr1', 'Int-Crh-Vip','Int-Kcnip2','Int-Cpne5',
                'Pyramidal-L2/3','Pyramidal-L4', 'Pyramidal-L5-1', 'Pyramidal-L5-2', 'Pyramidal-L6', 'Hippocampal',
                'Astro1-pia', 'Astro1-Mfge8+', 'Astro2-1', 'Astro2-2',
                'OPC', 'Olig-COP', 'Olig-NF', 'Olig-Intermediate', 'Olig-Mature', 'Olig-unclear', 'Olig-IntCluster',
                'Coroid-plexus',
                'Ependymal', 'Ependymal-ventricle',
                'Endothelial', 'Endothelial-bad', 'Endothelial-artefact',
                'VLMC',
                'Bleached-cells',
                'Bad-cells',
               ]

#Get the cluster numbers in order
cluster_sort = [cluster_labels.index(x) for x in cluster_sort_labels]

#Make dictionary linking cluster number with the name
cluster_labels = dict(zip(range(0, len(cluster_labels)), cluster_labels))


In [None]:
# Looking into the bad cells
minor0_5_label_cells = gen_labels(df_fish_log_major0, ac_major0_5)[1]
df_bad_cells = df_fish_log.loc[:,minor0_5_label_cells[3]]

n_clusters=10
cl_bad_cells =  AgglomerativeClustering(n_clusters).fit(df_bad_cells.T)
cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_bad_cells, cl_bad_cells)
heat_map(df_bad_cells, labels_a, sort=range(n_clusters))
for i in range(n_clusters):
    print(cluster_binarity(df_bad_cells, labels_a, i, 2.8, 85)[1])



## Two step clustering size corected, sqrt data

In [None]:
#Major cell types
size7 = AgglomerativeClustering(n_clusters=7).fit(X_size_sqrt.T)
#0: L6
#1: Glia
#2: L4
#3: Int
#4: Olig
#5: L2/3
#6: Neurons

major0_label_size = gen_labels(df_fish_size_sqrt, size7)[1]

In [None]:
#Second step clustering, major size 0
df_fish_size_major0 = df_fish_size_sqrt.loc[:,major0_label_size[0]]

size0_7 = AgglomerativeClustering(n_clusters=7).fit(df_fish_size_major0.T)
#0_0: L6
#0_1: BQ-Lum
#0_2: Claustrum+
#0_3: L6-Lamp5+Kcnip+
#0_4: Astro-Fimbria
#0_5: L6-Lamp5low
#0_6: OPC

In [None]:
#Second step clustering, major size 1
df_fish_size_major1 = df_fish_size_sqrt.loc[:,major0_label_size[1]]

size1_11 = AgglomerativeClustering(n_clusters=11).fit(df_fish_size_major1.T)

#1_0: Astro1
#1_1: VLMC
#1_2: Pericytes
#1_3: Astro2_1
#1_4: Astro-ventricle
#1_5: Ependymal-dorals-ventricle
#1_6: Ependymal-ventral-Ventricle
#1_7: Astro-Gfap+Mfge8+
#1_8: Astro2_2
#1_9: Edothelial
#1_10: Choroid-plexus

In [None]:
#Second step clustering, major size 2
df_fish_size_major2 = df_fish_size_sqrt.loc[:,major0_label_size[2]]

size2_2 = AgglomerativeClustering(n_clusters=2).fit(df_fish_size_major2.T)
#2_0: L4
#2_1: L4-Lamp5+Kcnip+

In [None]:
#Second step clustering, major size 3
df_fish_size_major3 = df_fish_size_sqrt.loc[:,major0_label_size[3]]

size3_14 = AgglomerativeClustering(n_clusters=14).fit(df_fish_size_major3.T)
#3_0: Int-Lamp5
#3_1: Int-Cnr1
#3_2: Int-Kcnip2-Pthlh
#3_3: Int-Crh
#3_4: Int-Crhbp-Deep
#3_5: Int-Vip-Pthlh
#3_6: BQ_3
#3_7: Int-Lat-CaudoPutamen
#3_8: Int-VIP-Cnr1-Pthlh-Crh
#3_9: Int-Crhbp-Cpne5-bleached?
#3_10: Int-Cpne5
#3_11: Int-Crhbp-Pthlh
#3_12: Int-Crhbp-Supperficial
#3_13: Int-Med-CaudoPutamen

In [None]:
#Second step clustering, major size 4
df_fish_size_major4 = df_fish_size_sqrt.loc[:,major0_label_size[4]]

size4_5 = AgglomerativeClustering(n_clusters=5).fit(df_fish_size_major4.T)
#4_0: Olig-MF
#4_1: Olig-Mature
#4_2: Olig-Mature-BQ
#4_3: Olig-NF
#4_4: Olig-COP

In [None]:
#Second step clustering, major size 5
df_fish_size_major5 = df_fish_size_sqrt.loc[:,major0_label_size[5]]

size5_2 = AgglomerativeClustering(n_clusters=2).fit(df_fish_size_major5.T)
#5_0: L2/3
#5_0: L5

In [None]:
#Second step clustering, major size 6
df_fish_size_major6 = df_fish_size_sqrt.loc[:,major0_label_size[6]]

size6_6 = AgglomerativeClustering(n_clusters=6).fit(df_fish_size_major6.T)


In [None]:

df_used = df_fish_size_major4
model_used = size4_5

cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_used, model_used)
heat_map(df_used, labels_a, sort=list(range(len(np.unique(labels_a)))));

In [None]:
tSNE_and_pos(df_used, labels_a, save=False)
for i in np.sort(np.unique(labels_a)):
    print(i,Counter(labels_a)[i], [i.split("_")[1] for i in cluster_binarity(df_used, labels_a, i, .25, 70)[1]])

In [None]:
df_sort = sort_df(df_used.loc[:,], labels_a)
df_sort_norm = sort_dataset_df(X_norm, df_used, labels_a)
df_sort_log = sort_dataset_df(X_log, df_used, labels_a)
df_sort_fc = sort_dataset_df(X_fc, df_used, labels_a)
df_sort_count = sort_dataset_df(X, df_used, labels_a)
cluster_expression(df_sort, 4, 'fc', sort=list(np.sort(np.unique(labels_a))))

In [None]:
X

In [None]:
cluster_compare(df_sort, 0, 1, dataset='fc')

In [None]:
cell_labels, label_cells, combine_df, cellID, labels, labels_a = combine_sub_clusters([size0_7, size1_11, size2_2, size3_14, size4_5, size5_2, size6_6 ], 
                    [df_fish_size_major0,df_fish_size_major1,df_fish_size_major2,df_fish_size_major3,df_fish_size_major4,df_fish_size_major5,df_fish_size_major6,])

cluster_labels = ['L6', 'BQ_0', 'Claustrum+', 'L6-Lamp5+Kcnip+', 'Astro-Fimbria', 'L6-Lamp5low', 'OPC',
                   'Astro1', 'VLMC', 'Pericytes', 'Astro2_1', 'Astro-ventricle', 'Ependymal-dorals-ventricle', 'Ependymal-ventral-Ventricle', 'Astro-Gfap+Mfge8+', 'Astro2_2','Edothelial', 'Choroid-plexus',
                   'L4',  'L4-Lamp5+Kcnip+',
                   'Int-Lamp5', 'Int-Cnr1', 'Int-Kcnip2-Pthlh', 'Int-Crh', 'Int-Crhbp-Deep', 'Int-Vip-Pthlh', 'BQ_2',  'Int-Lat-CaudoPutamen', 'Int-VIP-Cnr1-Pthlh-Crh', 'Int-Crhbp-Cpne5-bleached?', 'Int-Cpne5', 'Int-Crhbp-Pthlh', 'Int-Crhbp-Superficial', 'Int-Med-CaudoPutamen',
                   'Olig-MF', 'Olig-Mature', 'Olig-Mature-BQ', 'Olig-NF', 'Olig-COP',
                   'L23', 'L5',
                   'BQ_4', 'L6_BQ', 'Hippocampus', 'Microglia', 'L5-Aldoc', 'Excitatory-Kcnip2',
                    
                 ]



#Sort the clusters manually
cluster_sort_labels = ['Int-Crhbp-Superficial','Int-Crhbp-Deep','Int-Crhbp-Pthlh', 'Int-Crhbp-Cpne5-bleached?', 'Int-Kcnip2-Pthlh','Int-Cnr1','Int-VIP-Cnr1-Pthlh-Crh',
                        'Int-Vip-Pthlh', 'Int-Cpne5','Int-Crh','Int-Lat-CaudoPutamen', 'Int-Med-CaudoPutamen','Int-Lamp5',
                      'L23', 'L4',  'L4-Lamp5+Kcnip+', 'L5', 'L5-Aldoc', 'L6','L6-Lamp5+Kcnip+', 'L6-Lamp5low', 'L6_BQ', 'Claustrum+', 'Hippocampus', 'Excitatory-Kcnip2',
                        'Astro1', 'Astro2_1', 'Astro2_2', 'Astro-Gfap+Mfge8+', 'Astro-Fimbria', 'Astro-ventricle',
                        'OPC', 'Olig-COP', 'Olig-NF', 'Olig-MF', 'Olig-Mature', 'Olig-Mature-BQ',
                       'Microglia',  
                       'Pericytes',
                       'Ependymal-dorals-ventricle','Ependymal-ventral-Ventricle', 
                       'Edothelial',
                       'Choroid-plexus',
                       'VLMC',
                         'BQ_0',  'BQ_2',  'BQ_4']

#Get the cluster numbers in order
cluster_sort = [cluster_labels.index(x) for x in cluster_sort_labels]

#Make dictionary linking cluster number with the name
cluster_labels = dict(zip(range(0, len(cluster_labels)), cluster_labels))

In [None]:
pickle.dump(label_cells, open( "label_cells_47.p", "wb" ))
pickle.dump(cell_labels, open( "cell_labels_47.p", "wb" ))

## Automatic itterative clustering

In [None]:
def iterative_clustering(df):
    #Make the label table
    initial_labels = np.array(['L0_C0' for i in range(len(df_fish.columns))]).reshape((1,len(df_fish.columns)))
    label_tree = pd.DataFrame(data=initial_labels, columns=df.columns)
    
    current_level = 0
    offset = 0
    
    #Iterate through the current sub clusters
    for superclust in np.unique(label_tree.loc[current_level]):
        current_level += 1
        print(superclust)
        filt = label_tree.loc[current_level] == superclust
        df_to_clust = df.loc[:,filt]
        
        #Split cells into two clusters
        split = AgglomerativeClustering(n_clusters=2).fit(df_to_clust.T)
        
        #Update the label_tree with new temporary labels (Format: Level X _ Cluster Y --> LxCy)
        for i, cell in enumerate(df_to_clust.columns):
            cluster_id = 'L{}_C{}'.format((current_level), (split.labels_[i]+offset))
            label_tree.loc[current_level, cell] = cluster_id
        
        
        
    

    
    return df_temp
qqq = iterative_clustering(df_fish)

In [None]:
qqq.shape

In [None]:
np.unique(qqq.loc[0])

In [None]:
df_test = pd.DataFrame(data=np.zeros((1,5)), columns=[0,1,2,3,4,])

In [None]:
df_test

In [None]:
df_test.loc[1,3] = 15

In [None]:
[0,1,25,3,4][2]

In [None]:
df_fish_log_major3[df_fish_log_major3<6] = 0.0000001

In [None]:
ac_it_3_1 = AgglomerativeClustering(n_clusters=5).fit(df_fish_log_major3.T)

In [None]:
cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_fish_log_major3, ac_it_3_1)
heat_map(df_fish_log_major3, labels_a)

In [None]:
plt.figure(figsize=(16,8))
gs = plt.GridSpec(5,8)

for i, gene in enumerate(gene_sort_fish):
    plt.subplot(gs[i])
    plt.hist(df_fish_size_sqrt.loc[gene].values, bins=25 )
    plt.ylim([0,500])
    plt.xlim([0,1])
    plt.title(gene, fontsize=8)
    
plt.tight_layout()

In [None]:
plt.figure()
gene= 26
plt.hist(df_fish_log.loc[gene_sort_fish[gene]].values, bins=400 )
plt.title(gene_sort_fish[gene])


In [None]:
#Cluster in 2
#check binaryzation of markers in clusters
#If binary:
    #stop
#If not:
    #repeat

In [None]:
pickle.dump(df_fish_size_major4.columns, open('oligodendrocytes.p', "wb" ))

In [None]:
def cluster_binarity(df, labels, cluster_of_interest, cutoff_expression, cutoff_percentage):
    """
    Calculates the percentage of cells, of a given cluster_of_interest in a df, 
    that have an expression above the cutoff_expression. It returns a pandas 
    Series with the percentages and it returns the names of the genes that have
    a percentage higher than the cutoff_percentage.
    Input:
    `df`(pandas dataframe): df of the cells that were clusterd.
    `labels`(numpy array): Cluster labels of the clustering, in same order as df.
    `cluster_of_interest`(int): Cluster of interest from the labels.
    `cutoff_expression`(float): used to make a binarization and is the same for
        all genes. Data normalization is very important!
    `cutoff_percentage`(float): Percentage of cells that need to be 'positive'
        in the cluster. 
    Returns:
    *Pandas series with the percentages for all genes
    *List of names of genes with a percentage higher than the cutoff_percentage
    
    """
    filt = labels == cluster_of_interest
    df = df.loc[:,filt]
    
    #Calculate percentage of cells that are above the cutoff
    positive = (df[df>cutoff_expression].T.count() / df.shape[1]) * 100
    positive_genes = [positive.index[i] for i,n in enumerate(positive) if n>cutoff_percentage]
    return positive, positive_genes

#qq = cluster_binarity(df_fish_log_major3, labels_a, 4, 3, 80)

#for i in range(5):
#    print(cluster_binarity(df_fish_log_major3, labels_a, i, 3, 80)[1])
    
#type(qq[0])

In [None]:
cl_0 =  AgglomerativeClustering(n_clusters=2).fit(df_fish_log_major3.T)
cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_fish_log_major3, cl_0)
heat_map(df_fish_log_major3, labels_a)
print(cluster_binarity(df_fish_log_major3, labels_a, 0, 3, 80)[1])
print(cluster_binarity(df_fish_log_major3, labels_a, 1, 3, 80)[1])

df_0_0 = df_fish_log_major3.loc[:, gen_labels(df_fish_log_major3, cl_0)[1][0]]
df_1_1 = df_fish_log_major3.loc[:, gen_labels(df_fish_log_major3, cl_0)[1][1]]

In [None]:
cl_0_0 =  AgglomerativeClustering(n_clusters=2).fit(df_0_0.T)
cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_0_0, cl_0_0)
heat_map(df_0_0, labels_a)
print(cluster_binarity(df_0_0, labels_a, 0, 3, 80)[1])
print(cluster_binarity(df_0_0, labels_a, 1, 3, 80)[1])

df_0_0_0 = df_fish_log_major3.loc[:, gen_labels(df_0_0, cl_0_0)[1][0]]
df_0_0_1 = df_fish_log_major3.loc[:, gen_labels(df_0_0, cl_0_0)[1][1]]

In [None]:
cl_0_0_0 =  AgglomerativeClustering(n_clusters=2).fit(df_0_0_0.T)
cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_0_0_0, cl_0_0_0)
heat_map(df_0_0_0, labels_a)
print(cluster_binarity(df_0_0_0, labels_a, 0, 3, 80)[1])
print(cluster_binarity(df_0_0_0, labels_a, 1, 3, 80)[1])

df_0_0_0_0 = df_fish_log_major3.loc[:, gen_labels(df_0_0_0, cl_0_0_0)[1][0]]
df_0_0_0_1 = df_fish_log_major3.loc[:, gen_labels(df_0_0_0, cl_0_0_0)[1][1]]

In [None]:
cl_0_0_0_0 =  AgglomerativeClustering(n_clusters=2).fit(df_0_0_0_0.T)
cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_0_0_0_0, cl_0_0_0_0)
heat_map(df_0_0_0_0, labels_a)
print(cluster_binarity(df_0_0_0_0, labels_a, 0, 3, 80)[1])
print(cluster_binarity(df_0_0_0_0, labels_a, 1, 3, 80)[1])

df_0_0_0_0_0 = df_fish_log_major3.loc[:, gen_labels(df_0_0_0_0, cl_0_0_0_0)[1][0]]
df_0_0_0_0_1 = df_fish_log_major3.loc[:, gen_labels(df_0_0_0_0, cl_0_0_0_0)[1][1]]

In [None]:
#Clustering of the bad cluster. 
cl_0_0_0_1 =  AgglomerativeClustering(n_clusters=2).fit(df_0_0_0_1.T)
cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_0_0_0_1, cl_0_0_0_1)
heat_map(df_0_0_0_1, labels_a)
print(cluster_binarity(df_0_0_0_1, labels_a, 0, 3, 80)[1])
print(cluster_binarity(df_0_0_0_1, labels_a, 1, 3, 80)[1])

df_0_0_0_1_0 = df_fish_log_major3.loc[:, gen_labels(df_0_0_0_1, cl_0_0_0_1)[1][0]]
df_0_0_0_1_1 = df_fish_log_major3.loc[:, gen_labels(df_0_0_0_1, cl_0_0_0_1)[1][1]]

## Over clustering

In [None]:
n_clusters=50
cl_over_clustering =  AgglomerativeClustering(n_clusters).fit(df_fish_log.T)
cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_fish_log, cl_over_clustering)
heat_map(df_fish_log, labels_a, sort=range(n_clusters))
for i in range(n_clusters):
    print(i,Counter(labels_a)[i], [i.split("_")[1] for i in cluster_binarity(df_fish_log, labels_a, i, 2.5, 80)[1]])


## Machine learning

In [None]:
# import model
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import StratifiedShuffleSplit

In [None]:
X_sc = df_sc_cort_olig.values.astype(np.float64).T
    #Normalize
X_sc_norm = X_sc.sum(0).mean() * (X_sc/X_sc.sum(0))
    #Log transform
X_sc_log = np.log2(X_sc_norm+1)

In [None]:
mean_expression_sc =  df_sc_cort_olig.loc[gene_sort,:].T.mean()
df_sc_fc = df_sc_cort_olig.divide(mean_expression_sc, axis='rows')
X_sc_fc = df_sc_fc.values.astype(np.float64).T

In [None]:
classesnames = pd.Series(sc_cell_label)[df_sc_cort_olig.columns].values
classes, y = np.unique(classesnames, return_inverse=True)
class_labels = dict(zip(range(len(classes)), classes))

In [None]:
# instantiate
#Optimze parameters (~20min 6cores)
logreg = LogisticRegressionCV(cv=StratifiedShuffleSplit(y, test_size=.25, n_iter=30, random_state=24,),
                             Cs=np.logspace(0.4, 0.8, 20,base=10), multi_class='multinomial',fit_intercept=False,
                             refit=True, solver='newton-cg', n_jobs=6)
logreg.fit(X_sc_fc, y)

On X_sc_log dataset:
logreg.Cs = array([ 2.51188643,  2.6366509 ,  2.76761237,  2.90507865,  3.04937283,
        3.20083405,  3.35981829,  3.52669921,  3.70186906,  3.88573952,
        4.07874276,  4.2813324 ,  4.49398459,  4.71719914,  4.95150067,
        5.19743987,  5.45559478,  5.72657215,  6.01100886,  6.30957344])

In [None]:
logreg.Cs

In [None]:
#Predict Identitiy 
y_pred = logreg.predict(X_fc.T)

In [None]:
Counter(y_pred)

In [None]:
cell_label_pred = dict(zip(df_fish.columns, y_pred))

## Cluster visualization

In [None]:
tSNE_points

In [None]:

def gen_labels(df, model):
    """
    Generate cell labels from model.
    Input:
    `df`: Panda's dataframe that has been used for the clustering. (used to get
    the names of colums and rows)
    `model`(obj): Clustering object
    Returns (in this order):
    `cell_labels` = Dictionary coupling cellID with cluster label
    `label_cells` = Dictionary coupling cluster labels with cellID
    `cellID` = List of cellID in same order as labels
    `labels` = List of cluster labels in same order as cells
    `labels_a` = Same as "labels" but in numpy array
    
    """
    if str(type(model)).startswith("<class 'sklearn.cluster"):
        cell_labels = dict(zip(df.columns, model.labels_))
        label_cells = {}
        for l in np.unique(model.labels_):
            label_cells[l] = []
        for i, label in enumerate(model.labels_):
            label_cells[label].append(df.columns[i])
        cellID = list(df.columns)
        labels = list(model.labels_)
        labels_a = model.labels_
    elif type(model) == np.ndarray:
        cell_labels = dict(zip(df.columns, model))
        label_cells = {}
        for l in np.unique(model):
            label_cells[l] = []
        for i, label in enumerate(model):
            label_cells[label].append(df.columns[i])
        cellID = list(df.columns)
        labels = list(model)
        labels_a = model
    else:
        print('Error wrong input type')
    
    return cell_labels, label_cells, cellID, labels, labels_a

def sort_df(df, labels_a, row_sort=True, sorted_row_names=gene_sort_fish):
    """
    Sort the dataframe columns based on the cluster labels (additional row sort is optional).
    Input:
    `df`: Panda's dataframe that has been used for the clustering. (or a df
        that has the EXACT same order)
    `cluster_model`: Results of the clustering
    `row_sort`(bool): If True it will sort the rows of the dataframe, acording to the probided list
    `sorted_row_names`(list): List of row names. Default = gene_sort_fish
    
    """
    #Sort the dataframe with the new clusters
    #df_sort = pd.DataFrame(data=X, columns=df_fish.columns, index=df_fish.index)
    df_sort = df
    #labels_a = cluster_model.labels_ #Array of cell labels
    new_column_order = df_sort.columns[labels_a.argsort()]
    if row_sort == True:
        df_sort = df_sort.loc[sorted_row_names,new_column_order]
    else:
        df_sort = df_sort.loc[:,new_column_order]
    return df_sort
    

def sort_dataset_df(data, df, labels_a):# , cluster_model):
    """
    Sort a dataset based on the cluster labels. The data can be a normalized 
    np array. 
    Input:
    `data`(np array): Any normalized array, in the EXACT same order as the df on
        which the clustering algoritm ran.
    `df`: Panda's dataframe that has been used for the clustering. (used to get
        the names of colums and rows)
    Uses df_fish as basis.
    #`cluster_model`: Results of the clustering
    
    """    
    #Filter the datasetet to use the cells in the df
    data_filt = np.zeros((len(df.index), len(df.columns)))
    for i, n in enumerate(df.columns):
        #Assumes that data and df_fish are in the same format
        data_filt[:,i] = data[:,df_fish.columns.get_loc(n)]
    
    df_sort = pd.DataFrame(data=data_filt, columns=df.columns, index=df.index)
    new_column_order = df_sort.columns[labels_a.argsort()]
    
    df_sort = df_sort.loc[gene_sort_fish,new_column_order]
    return df_sort

def plot_labels(df, labels_a, standalone=True):
    if standalone == True:
        plt.figure(figsize=(10,10))
    points = np.zeros((len(df.columns),2))
    for i, n in enumerate(df.columns):
        points[i,:] = tSNE_points[df_fish.columns.get_loc(n),:]
    print(len(points))
    #plt.scatter(points[:,0], points[:,1],c=plt.cm.jet(labels_a/max(labels_a)), lw=0, alpha=1, s=15)
        #Color labels: See below, part 10
    plt.scatter(points[:,0], points[:,1],c=label_colors_hex, lw=0, alpha=1, s=15)


def plot_cells_pos_labels(labels_a):
    
    coord_df_sort = sort_df(coord_df.loc[:,cellID], labels_a, row_sort=False)
    color_labels_sort =plt.cm.jet(np.sort(labels_a)/max(np.sort(labels_a)))    
    #plot_cell_pos(coord_df_sort, cell_ids=None, color=color_labels_sort, s=10, standalone=False, mode='tsne')
        #Color labels: See below, part 10
    plot_cell_pos(coord_df_sort, cell_ids=None, color=label_colors_hex, s=10, standalone=False, mode='tsne')
    
    

def tSNE_and_pos(df, labels_a, save=False):
    """
    Plot the tSNE and cell positions with the cluster colors
    
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14,7))
    plt.sca(axes[0])
    plot_labels(df, labels_a, standalone=False)
    plt.title('tSNE')
    plt.axis('off')
    plt.sca(axes[1])
    plot_cells_pos_labels(labels_a)
    plt.title('cell positon')
    plt.axis('off')
    if save == True:
        plt.savefig('Cluster_tSNE_and_Position.png', dpi=600)

    #ONE STEP CLUSTERING
    
    
    
    
    
#model = ac_size
#df_used = df_fish_size_sqrt
#cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df_used, model)

    #TWO STEP CLUSTERING
#cell_labels, label_cells, df_used, cellID, labels, labels_a = combine_sub_clusters([ac_major0_5, ac_major1_1, ac_major2_10, ac_major3_5, ac_major4_12, ac_major5_2], 
#                     [df_fish_log_major0, df_fish_log_major1, df_fish_log_major2, df_fish_log_major3, df_fish_log_major4, df_fish_log_major5])

    #TWO STEP CLUSTERING 47 types
cell_labels, label_cells, df_used, cellID, labels, labels_a = combine_sub_clusters([size0_7, size1_11, size2_2, size3_14, size4_5, size5_2, size6_6 ], 
                    [df_fish_size_major0,df_fish_size_major1,df_fish_size_major2,df_fish_size_major3,df_fish_size_major4,df_fish_size_major5,df_fish_size_major6,])


df_sort = sort_df(df_used.loc[:,], labels_a)
#df_sort_count = sort_dataset_df(X, df_used, labels_a)
df_sort_norm = sort_dataset_df(X_norm, df_used, labels_a)
df_sort_log = sort_dataset_df(X_log, df_used, labels_a)
df_sort_fc = sort_dataset_df(X_fc, df_used, labels_a)
df_sort_count = sort_dataset_df(X, df_used, labels_a)
print('Generated labels, df_sort, df_sort_log, df_sort_fc')
    #Make sure you have the right color settings
tSNE_and_pos(df_used, labels_a, save=True)


In [None]:
plt.figure(figsize=(9,9))
coord_df_sort = sort_df(coord_df.loc[:,cellID], labels_a, row_sort=False)
plot_cell_pos(coord_df_sort, cell_ids=None, color=label_colors_hex, s=15, standalone=False, mode='tsne')
plt.gca().set_axis_off()
#plt.savefig('cell_type_position_47.png', dpi=300)

In [None]:
def plot_cluster_size(labels): 
    plt.figure()
    count = np.array(Counter(labels).most_common())
    left = np.arange(count.shape[0])
    plt.bar(left, count[:,1])
    plt.xticks(left, count[:,0], rotation=90)
    plt.xlim(-0.5,len(np.unique(labels)))
    plt.title('Sorted Cluster Size')
    plt.xlabel('Cluster')
    plt.ylabel('Number of cells')

plot_cluster_size(labels)

In [None]:
#Backup version heat map
data = df_sort #_log
save = False


fig, axHM = plt.subplots(figsize=(22,10))
z = data.values
z = z/np.percentile(z, 99, 1)[:,None]
axHM.pcolor(z,cmap='viridis', vmax=1)

plt.yticks(np.arange(0.5, len(data.index), 1), data.index)
plt.gca().invert_yaxis()
plt.xlim(xmax=len(data.columns))

divider = make_axes_locatable(axHM)
axLabel = divider.append_axes("top", .5, pad=0, sharex=axHM)

labels_sort = np.sort(labels_a)
axLabel.pcolor(labels_sort[None,:]/labels_sort.max(), cmap='jet')
axLabel.set_xlim(xmax=len(data.columns))
axLabel.axis('off')

if save == True:
    plt.savefig('/home/lars/storage/Documents/Cortex_FISH/Heatmap_{}clusters_counts.png'.format(len(np.unique(labels_a))), dpi=600)

In [None]:
def mean_expression(df, labels):
    """
    Make dataframe with mean expression
    
    """
    #Make df with count averages per cluster
    df_count_average = pd.DataFrame(index=df.index, columns=np.sort(np.unique(labels))) #np.unique(labels_a))
    for l in np.unique(labels):
        filt = np.sort(labels) == l
        mean = np.array(df.loc[:,filt].T.mean())
        #std = np.array(np.std(df_sort.loc[:,filt], axis=1))
        if np.isnan(np.sum(mean)) == False: #In case some clusters do not have cells
            df_count_average[l] = mean
    return df_count_average

#df_count_average = mean_expression(df_sort)

In [None]:
np.unique(sc_cort_olig_label_int)

# Heat map cells

In [None]:
import matplotlib

In [None]:
#Choose the count or log-corrected dataset


def heat_map(df, labels, sort=None, save=False):
    """
    Plot heat_map of a sorted dataframe
    
    """
    #Find the name of the input df, for logging
    df_input_name =[x for x in globals() if globals()[x] is df][0]
    print('DF used for plot: {}'.format(df_input_name))
    
    if sort == None:
        #Make df with count averages per cluster
        df_count_average = mean_expression(df, labels)
        #Make optimal sort on average expression of each cluster
        #Transpose, otherwise you are doing it on genes instead of clusters
        D = pdist(df_count_average.T, 'cityblock') #Working well: 'correlation', 'cityblock', 'seuclidean', 'canberra', 'cosine'
        Z = linkage(D, 'ward')
        optimal_Z = optimal_leaf_ordering(Z, D)
        optimal_o = polo.polo.leaves_list(optimal_Z) 
        #In case some clusters are missing
        optimal_order = []
        for i in optimal_o:
            optimal_order.append(df_count_average.columns[i])
    else:
        optimal_order = sort
    print('Order of clusters: {}'.format(optimal_order))
    
    #Sort the cells according to the optimal cluster order
    optimal_sort_cells = []
    for i in optimal_order:
        optimal_sort_cells.extend(label_cells[i])
    
    
    #Create a list of optimal sorted cell labels
    optimal_sort_labels = []
    for i in optimal_sort_cells:
        optimal_sort_labels.append(cell_labels[i])
    
    fig, axHM = plt.subplots(figsize=(14,6))
    #z = df.loc[:,optimal_sort_cells].values
    #z = z/np.percentile(z, 99.5, 1)[:,None]
    
    df_full = df # _fish_log
    z = df_full.values
    z = z/np.percentile(z, 99, 1)[:,None]
    z = pd.DataFrame(z, index=df_full.index, columns=df_full.columns)
    z = z.loc[:,optimal_sort_cells].values
    print(z.shape)
    
    im = axHM.pcolor(z, cmap='viridis', vmax=1)

    plt.yticks(np.arange(0.5, len(df.index), 1), gene_sort, fontsize=8)
    plt.gca().invert_yaxis()
    plt.xlim(xmax=len(labels))
    print(len(labels))
    plt.title(df_input_name)

    divider = make_axes_locatable(axHM)
    axLabel = divider.append_axes("top", .3, pad=0, sharex=axHM)

    optimal_sort_labels = np.array(optimal_sort_labels)
    axLabel.pcolor(optimal_sort_labels[None,:]/max(optimal_sort_labels), cmap='prism')
        #Colors, see below:
    #axLabel.pcolor(label_colors_rgb) #label_colors_hex
    
    
    
    axLabel.set_xlim(xmax=len(labels))
    axLabel.axis('off')
    
    cax = fig.add_axes([.91, 0.13, 0.01, 0.22])
    colorbar = fig.colorbar(im, cax=cax, ticks=[0,1])
    colorbar.set_ticklabels(['0', 'max'])

    if save == True:
        plt.savefig('/home/lars/storage/Documents/Cortex_FISH/Heatmap_{}clusters_{}.png'.format(len(np.unique(labels_a)), df_input_name), dpi=300)

order = [14, 6, 10, 7, 8, 11, 12, 9, 13, 15, 
33, 34,   5, 2, 1, 4,
31, 22,  28,  23,
30, 18, 20, 17, 16, 19,
 24, 29,26, 25, 27, 21, 32,
                     0,3,]


heat_map(df_sort, labels_a, sort=cluster_sort, save=True) 


In [None]:
#Semi binarization backup
#Make df with count averages per cluster
    df_count_average = pd.DataFrame(index=df.index, columns=np.unique(labels_a))
    
    #Semi binarize count average (all below mean is set to 0)
    df_count_average_bin = df_count_average
    
    for index in df_count_average_bin.index: 
        binarized_mean = []
        for c in df_count_average_bin.loc[index]:
            if c < df_count_average_bin.loc[index].mean():
                c = 0
            binarized_mean.append(c)
        df_count_average_bin.loc[index] = binarized_mean

In [None]:
#Heat map clusters
def cluster_expression(df, cluster_of_interest, dataset='Count', sort=None):
    """
    Plot tSNE colored based on clusters and plot average gene experssion plot.
    
    """
    df_part=df
    
    #Make df with count averages per cluster
    df_count_average = pd.DataFrame(index=df.index, columns=np.unique(labels_a))
    
    for index in df_count_average.index: 
        mean_exp = []
        for c in df_count_average.loc[index]:
            if c < df_count_average.loc[index].mean():
                c = 0
            mean_exp.append(c)
        df_count_average.loc[index] = mean_exp


    for l in range(len(np.unique(labels_a))):
        filt = np.sort(labels) == l
        mean = np.array(df.loc[:,filt].T.mean())
        #std = np.array(np.std(df_sort.loc[:,filt], axis=1))
        df_count_average[l] = mean
        
    #Make df with count averages per cluster
    df_count_average = mean_expression(df, labels_a)
    
    if sort == None:
        #Transpose, otherwise you are doing it on genes instead of clusters
        D = pdist(df_count_average.T, 'cityblock') #Working well: 'correlation', 'cityblock', 'seuclidean', 'canberra', 'cosine'
        Z = linkage(D, 'ward')
        optimal_Z = optimal_leaf_ordering(Z, D)
        optimal_o = polo.polo.leaves_list(optimal_Z)    
        #In case some clusters are missing
        optimal_order = []
        for i in optimal_o:
            optimal_order.append(df_count_average.columns[i])
    else:
        optimal_order = sort
        
    #Plot averages heat map all clusters
    plt.figure(figsize=(14,8))
    
    if sort == None:
        ax0 = plt.subplot2grid((4,7), (0,0), rowspan=1, colspan=3)
        hierarchy.dendrogram(optimal_Z, truncate_mode='lastp', p=40, show_contracted=True, orientation='top')
    
    ax1 = plt.subplot2grid((4,7), (1,0), rowspan=3, colspan=3)
    #Sort according to the optimal order
    z = df_count_average.loc[:,optimal_order].values
    z = z/np.percentile(z, 99.5, 1)[:,None]
    ax1.pcolor(z)
    y_pos = np.arange(len(mean))
    x_pos = np.arange(len(df_count_average.columns))
    ax1.set_xlim(0, len(df_count_average.columns))
    ax1.set_xticks(x_pos+0.0)
    ax1.set_xticklabels(optimal_order, rotation=-90, fontsize=8)
    ax1.set_yticks(y_pos+0.4)
    ax1.set_yticklabels(df.index, fontsize=8)
    ax1.invert_yaxis()
    
    divider = make_axes_locatable(ax1)
    axLabel = divider.append_axes("top", .5, pad=0, sharex=ax1)
    optimal_order = np.array(optimal_order)
    axLabel.pcolor(optimal_order[None,:]/max(optimal_order), cmap='jet')
    axLabel.axis('off')

    #Cluster expression
    ax2 = plt.subplot2grid((4,7), (0,3), rowspan=3, colspan=3)
    gene_colors = ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red','blue', 'blue', 'blue', 'blue','orange', 'orange', 'orange', 'orange',
    'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green',
    'lightblue', 'lightblue', 'darkblue', 'darkred','darkred','red','lightgreen','red','lightblue']
    if dataset.lower() == 'log':
        df = df_sort_log.loc[:,df.columns]
        print('Swiched to using the Log normalized dataset for the expression plot: df_sort_log')
        y_label= '(Log-normalized)'
    elif dataset.lower() == 'fc':
        df = df_sort_fc.loc[:,df.columns]
        print('Swiched to using the Fold Change dataset for the expression plot: df_sort_log')
        y_label= 'Fold Change'
    elif dataset.lower() == 'count':
        df = df_sort_count.loc[:,df.columns]
        print('Swiched to using the Count dataset for the expression plot: df_sort_count')
        y_label= 'Count'
    else:
        y_label=''
        
    filt = np.sort(labels) == cluster_of_interest
    ax2.bar(y_pos, np.array(df.loc[:,filt].T.mean()), color=gene_colors, alpha=0.5, yerr=np.array(np.std(df.loc[:,filt], axis=1)),error_kw=dict(ecolor='gray'))
    if dataset.lower() == 'fc':
        ax2.axhline(1, color='gray')
    top = ax2.get_ylim()[1]
    ax2.set_ylim(0,top)
    left_pos = np.arange(df.shape[0])
    ax2.set_xticks(left_pos+0.0)
    ax2.set_xticklabels(gene_sort, rotation='vertical', fontsize=8)
    ax2.set_ylabel('Count {}'.format(y_label))
    ax2.set_title('Cluster: {} Number of cells: {}/{}'.format(cluster_of_interest, np.count_nonzero(filt==True), len(filt)))
    
    
    #Cluster location
    ax3 = plt.subplot2grid((4,7), (0,6), rowspan=2, colspan=1)
    filt = np.sort(labels) == cluster_of_interest
    plt.sca(ax3)
    plot_cell_pos(coord_df, cell_ids=list(df_sort.columns[filt]), cell_of_interest=None, s=.5, standalone=False)
    for tl in ax3.get_xticklabels() + ax3.get_yticklabels():
                tl.set_visible(False)
    ax3.set_title('Cluster Location')

    #Cluster area
    ax4 = plt.subplot2grid((4,7), (3,3), rowspan=1, colspan=1)
    ax4.hist((np.sqrt(ds.CellArea)*0.065)**2, bins=100, range=[8,254], color='grey',  lw=0)
    cluster_cell_size = [cell_size[i] for i in label_cells[cluster_of_interest]]
    ax4.hist((np.sqrt(cluster_cell_size)*0.065)**2, bins=100, range=[8,254], color='red', lw=0)
    #ax4.hist((np.sqrt(ds.CellArea[np.where(filt)])*0.065)**2, bins=100, range=[8,254], color='red', lw=0) #ds is not sorted so it does not work
    plt.xticks(rotation=-45)
    ax4.set_title('Cell Size', fontsize=12)
    #ax4.set_xlabel('Square um')
    ax4.set_ylim([0,50])

    #Molecule count
    ax5 = plt.subplot2grid((4,7), (3,4), rowspan=1, colspan=1)
    ax5.hist(df_fish.sum(axis=0), bins=100, range=[20,600], color='grey', lw=0)
    ax5.hist(df_sort_count.loc[:,filt].sum(axis=0), bins=100, range=[20,600], color='red', lw=0)
    plt.xticks(rotation=-45)
    ax5.set_title('Total molecule count', fontsize=12)
    ax5.set_ylim([0,100])
    
    #tSNE
    ax6 = plt.subplot2grid((4,7), (3,5), rowspan=1, colspan=1)
    ax6.scatter(tSNE_points[:,0], tSNE_points[:,1], lw=0, s=2, alpha = 0.5, c = 'grey')
    points = np.zeros((len(df.columns),2))
    for i, n in enumerate(df.columns):
        points[i,:] = tSNE_points[df_fish.columns.get_loc(n),:]
    print(points.shape)
    ax6.scatter(points[np.sort(labels_a) == cluster_of_interest][:,0], points[np.sort(labels_a) == cluster_of_interest][:,1], lw=0, s=3, alpha = 0.5, c = 'r') #input is df_sort so labels should be sorted as well
    print(len([labels_a == cluster_of_interest]) )
    ax6.set_aspect('equal')
    ax6.set_xlim(-12,12)
    ax6.set_ylim(-12,12)
    ax6.set_xticks([])
    ax6.set_yticks([])
    ax6.set_title('tSNE position', fontsize=12)
    
    #plt.subplots_adjust(wspace=0)
    plt.tight_layout()

 #Added the selection of a subset of columns of the count and fc datsets on line 76 and 80   
    
cluster_expression(df_sort, 7, 'fc', sort=None)#cluster_sort)

In [None]:
cluster_labels

In [None]:
for  i in cluster_sort:
    cluster_expression(df_sort, i, 'fc', sort=None)
    plt.savefig('Clusters/Cluster{}_{}.png'.format(i, cluster_labels[i]))

In [None]:
cluster_of_interest = 43

plt.figure(figsize=(8,4))
ax0 = plt.subplot2grid((1,5), (0,0), rowspan=1, colspan=3)

gene_colors = ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red','blue', 'blue', 'blue', 'blue','orange', 'orange', 'orange', 'orange',
'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green',
'lightblue', 'lightblue', 'darkblue', 'darkred','darkred','red','lightgreen','red','lightblue']
filt = np.sort(labels) == cluster_of_interest
ax0.bar(y_pos, np.array(df_sort_fc.loc[:,filt].T.mean()), color=gene_colors, alpha=0.5, yerr=np.array(np.std(df_sort_fc.loc[:,filt], axis=1)),error_kw=dict(ecolor='gray'))
#if dataset.lower() == 'fc':
ax0.axhline(1, color='gray')
top = ax0.get_ylim()[1]
ax0.set_ylim(0,top)
left_pos = np.arange(df_sort.shape[0])
ax0.set_xticks(left_pos+0.0)
ax0.set_xticklabels(gene_sort, rotation='vertical', fontsize=8)
ax0.set_ylabel('Fold change')
ax0.set_title('Cluster: {} Number of cells: {}/{}'.format(cluster_of_interest, np.count_nonzero(filt==True), len(filt)))


ax1 = plt.subplot2grid((1,5), (0,3), rowspan=1, colspan=2)
plt.sca(ax1)
plot_cell_pos(coord_df, cell_ids=list(df_sort.columns[filt]), cell_of_interest=None, s=2, standalone=False)

ax1.set_title('Cluster {} Location'.format(cluster_of_interest))
plt.gca().set_axis_off()
plt.tight_layout()
plt.savefig('Cluster_{}_expression_location.png'.format(cluster_of_interest), dpi=500)

In [10]:

def cluster_compare(df, cluster_1, cluster_2, dataset='Count'):
    plt.figure(figsize=(12,6))

    #cell_labels, label_cells, cellID, labels, labels_a = gen_labels(df, model)
        
    gene_colors = ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red','blue', 'blue', 'blue', 'blue','orange', 'orange', 'orange', 'orange',
    'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green', 'green',
    'lightblue', 'lightblue', 'darkblue', 'darkred','darkred','red','lightgreen','red','lightblue']
    if dataset.lower() == 'log':
        df = df_sort_log
        print('Swiched to using the Log normalized dataset for the expression plot: df_sort_log')
        y_label= '(Log-normalized)'
    elif dataset.lower() == 'fc':
            df = df_sort_fc
            print('Swiched to using the Fold Change dataset for the expression plot: df_sort_log')
            y_label= 'Fold Change'
    else:
        y_label=''
    
    y_pos = np.arange(len(df.index))
    
    #Cluster 1
    ax1 = plt.subplot2grid((2,6), (0,0), rowspan=1, colspan=3)
    filt = np.sort(labels) == cluster_1
    n = sum(filt)
    ax1.bar(y_pos, np.array(df.loc[:,filt].T.mean()), color=gene_colors, alpha=0.5, yerr=np.array(np.std(df.loc[:,filt], axis=1))/np.sqrt(n),error_kw=dict(ecolor='gray'))
    if dataset.lower() == 'fc':
        ax1.axhline(1, color='gray')
    left_pos = np.arange(df.shape[0])
    ax1.set_xlim(-1, df.shape[0])
    ax1.set_xticklabels([])
    ax1.set_ylabel('Count {}'.format(y_label))
    ax1.set_title('Cluster: {} Number of cells: {}/{}'.format(cluster_1, np.count_nonzero(filt==True), len(filt)))
    
    #Cluster 2
    ax2 = plt.subplot2grid((2,6), (1,0), rowspan=1, colspan=3)
    filt = np.sort(labels) == cluster_2
    n = sum(filt)
    ax2.bar(y_pos, np.array(df.loc[:,filt].T.mean()), color=gene_colors, alpha=0.5, yerr=np.array(np.std(df.loc[:,filt], axis=1))/np.sqrt(n),error_kw=dict(ecolor='gray'))
    if dataset.lower() == 'fc':
        ax2.axhline(1, color='gray')
    left_pos = np.arange(df.shape[0])
    ax2.set_xticks(left_pos+0.0)
    ax2.set_xlim(-1, df.shape[0])
    ax2.set_xticklabels(df_sort.index, rotation='vertical')
    ax2.set_ylabel('Count {}'.format(y_label))
    ax2.set_title('Cluster: {} Number of cells: {}/{}'.format(cluster_2, np.count_nonzero(filt==True), len(filt)))
    
    #Scale y limit equally
    ax1.set_ylim(max([ax1.get_ylim(), ax2.get_ylim()]))
    ax2.set_ylim(max([ax1.get_ylim(), ax2.get_ylim()]))
    
    #Cluster 1 location
    ax3 = plt.subplot2grid((2,6), (0,3), rowspan=1, colspan=1)
    filt1 = np.sort(labels) == cluster_1
    plt.sca(ax3)
    plot_cell_pos(coord_df, cell_ids=list(df_sort.columns[filt1]), cell_of_interest=None, s=1, standalone=False)
    for tl in ax3.get_xticklabels() + ax3.get_yticklabels():
                tl.set_visible(False)
    ax3.set_title('Cluster {}'.format(cluster_1))
    
    #Cluster 2 location
    ax4 = plt.subplot2grid((2,6), (0,4), rowspan=1, colspan=1)
    filt2 = np.sort(labels) == cluster_2
    plt.sca(ax4)
    plot_cell_pos(coord_df, cell_ids=list(df_sort.columns[filt2]), cell_of_interest=None, s=1, standalone=False, color_highlight1='b')
    for tl in ax4.get_xticklabels() + ax4.get_yticklabels():
                tl.set_visible(False)
    ax4.set_title('Cluster {}'.format(cluster_2))
    
    #Clusters location
    ax5 = plt.subplot2grid((2,6), (0,5), rowspan=1, colspan=1)
    filt = np.sort(labels) == cluster_2
    plt.sca(ax5)
    plot_cell_pos(coord_df, cell_ids=list(df_sort.columns[filt1]), cell_of_interest=None, s=1, standalone=False)
    ax5.scatter(coord_df.loc[:,list(df_sort.columns[filt2])].loc['X'], coord_df.loc[:,list(df_sort.columns[filt2])].loc['Y'], s=1, color='b')
    for tl in ax5.get_xticklabels() + ax5.get_yticklabels():
                tl.set_visible(False)
    ax5.set_title('Red{}, Blue{}'.format(cluster_1,cluster_2))
    
    #Cluster 1 tSNE
    ax6 = plt.subplot2grid((2,6), (1,3), rowspan=1, colspan=1)
    ax6.scatter(tSNE_points[:,0], tSNE_points[:,1], lw=0, s=2, alpha = 0.5, c = 'grey')
    points = np.zeros((len(df.columns),2))
    for i, n in enumerate(df.columns):
        points[i,:] = tSNE_points[df_fish.columns.get_loc(n),:]
    ax6.scatter(points[labels_a == cluster_1][:,0], points[labels_a == cluster_1][:,1], lw=0, s=3, alpha = 0.5, c = 'r')
    ax6.set_aspect('equal')
    ax6.set_xlim(-12,12)
    ax6.set_ylim(-12,12)
    ax6.set_xticks([])
    ax6.set_yticks([])
    
    #Cluster 2 tSNE
    ax7 = plt.subplot2grid((2,6), (1,4), rowspan=1, colspan=1)
    ax7.scatter(tSNE_points[:,0], tSNE_points[:,1], lw=0, s=2, alpha = 0.5, c = 'grey')
    points = np.zeros((len(df.columns),2))
    for i, n in enumerate(df.columns):
        points[i,:] = tSNE_points[df_fish.columns.get_loc(n),:]
    ax7.scatter(points[labels_a == cluster_2][:,0], points[labels_a == cluster_2][:,1], lw=0, s=3, alpha = 0.5, c = 'b')
    ax7.set_aspect('equal')
    ax7.set_xlim(-12,12)
    ax7.set_ylim(-12,12)
    ax7.set_xticks([])
    ax7.set_yticks([])
    
    #Clusters tSNE
    ax8 = plt.subplot2grid((2,6), (1,5), rowspan=1, colspan=1)
    ax8.scatter(tSNE_points[:,0], tSNE_points[:,1], lw=0, s=2, alpha = 0.5, c = 'grey')
    points = np.zeros((len(df.columns),2))
    for i, n in enumerate(df.columns):
        points[i,:] = tSNE_points[df_fish.columns.get_loc(n),:]
    ax8.scatter(points[labels_a == cluster_1][:,0], points[labels_a == cluster_1][:,1], lw=0, s=3, alpha = 0.5, c = 'r')
    ax8.scatter(points[labels_a == cluster_2][:,0], points[labels_a == cluster_2][:,1], lw=0, s=3, alpha = 0.5, c = 'b')
    ax8.set_aspect('equal')
    ax8.set_xlim(-12,12)
    ax8.set_ylim(-12,12)
    ax8.set_xticks([])
    ax8.set_yticks([])
    
    
    
    
    
cluster_compare(df_sort, 7, 13, dataset='fc')

NameError: name 'df_sort' is not defined

In [None]:
np.sqrt(47)

In [None]:
def all_clusters():
    plt.figure(figsize=(20,10))
    gs = plt.GridSpec(7,7)

    for i, label in enumerate(cluster_sort):
        plt.subplot(gs[i])
        filt = np.sort(labels) == label

        plot_cell_pos(coord_df, cell_ids=list(df_sort.columns[filt]), cell_of_interest=None, s=1, standalone=False)
        plt.title('{} {}'.format(label, cluster_labels[label]))
        plt.gca().set_axis_off()
        
    plt.tight_layout()
    #plt.savefig('Cluster_location.png')
        
all_clusters()

In [None]:
layer4 = [19, 18]
layer23 = [39]
layer5 = [40,45]
layer6 = [0,3,5]


fig, ax = plt.subplots(figsize=(6,6))

ax.scatter(tSNE_points[:,0], tSNE_points[:,1], lw=0, s=8, alpha = 1, c ='grey' )#[128/255,163/255,250/255]
points = np.zeros((len(df_sort.columns),2))
for i, n in enumerate(df_sort.columns):
    points[i,:] = tSNE_points[df_fish.columns.get_loc(n),:]

#ax.scatter(points[np.sort(labels_a) == cluster_of_interest][:,0], points[np.sort(labels_a) == cluster_of_interest][:,1], lw=0, s=10, alpha = 1, c = [180/255,4/255,38/255])
#for i in layer23:
#    ax.scatter(points[np.sort(labels_a) == i][:,0], points[np.sort(labels_a) == i][:,1], lw=0, s=10, alpha = 1, c = [19/255,216/255,226/255])
#for i in layer4:
#    ax.scatter(points[np.sort(labels_a) == i][:,0], points[np.sort(labels_a) == i][:,1], lw=0, s=10, alpha = 1, c = [1,0,1])
for i in layer5:
    ax.scatter(points[np.sort(labels_a) == i][:,0], points[np.sort(labels_a) == i][:,1], lw=0, s=10, alpha = 1, c = 'yellow')
#for i in layer6:
#    ax.scatter(points[np.sort(labels_a) == i][:,0], points[np.sort(labels_a) == i][:,1], lw=0, s=10, alpha = 1, c = [0,1,0])#[180/255,4/255,38/255])#input is df_sort so labels should be sorted as well
ax.set_aspect('equal')
ax.set_xlim(-12,12)
ax.set_ylim(-12,12)
ax.set_axis_off()


#ax.set_title('tSNE cluster: {}'.format(cluster_of_interest), fontsize=12)
#plt.savefig('Cluster_py5_location_47_clusters.png'.format(cluster_of_interest), dpi=600)

In [None]:
#Endothelial clusters 12 and 13 Foxj1 expression
stats.ttest_ind(df_sort_count.loc['Hybridization1_Foxj1',np.sort(labels) == 12], df_sort_count.loc['Hybridization1_Foxj1',np.sort(labels) == 13], )

In [None]:
df_sort_count.loc['Hybridization7_Rorb',np.sort(labels) == 18]

In [None]:
cell_info('7195', post_clustering=True)

# Pickle

In [None]:
pickle.dump(coord_df, open('coord_df.p', "wb" ))
pickle.dump(df_fish, open('df_fish.p', "wb" ))
pickle.dump(df_sort, open('df_sort.p', "wb" ))
pickle.dump(labels_a, open('labels_a_47.p', "wb" ))
pickle.dump(label_colors_hex, open('label_colors_hex_47.p', "wb" ))
pickle.dump(label_cells, open('label_cells_47.p', "wb" ))
pickle.dump(cell_labels, open('cell_labels_47.p', "wb" ))
pickle.dump(cluster_labels, open('cluster_labels_47.p', "wb"))
pickle.dump(df_fish_log, open('df_fish_log.p', "wb" ))
pickle.dump(df_fish_size_sqrt, open('df_fish_size_sqrt.p', "wb" ))


In [None]:
pickle.dump(cluster_labels, open('cluster_labels_47.p', "wb"))

# Cluster comparison

In [None]:
mean_fish = mean_expression(df_sort_count, labels_a)

In [None]:
#Convert single cell labels from text to a number
sc_cluster_labels = {}
for n, i in enumerate(np.unique(np.array(list(sc_cort_olig_label.values())))):
    sc_cluster_labels[n] = i
#sc_cluster_labels[39] = 'Vlmc'
#sc_cluster_labels[35] = 'Pvmf'
#Reverse the dictionary to match the text with the numbers
sc_label_clusters = {v: k for k, v in sc_cluster_labels.items()}
#Make a list of the labels for each cell
sc_cort_olig_label_int =[]
for i in df_sc_cort_olig.columns:
    sc_cort_olig_label_int.append(sc_label_clusters[sc_cort_olig_label[i]])

In [None]:
#From the marker pannel plot function
mean_sc = df_count_average_sc

In [None]:
df_type_correlation = pd.DataFrame(data=np.zeros((len(cluster_sort),len(mean_sc.columns))), columns=mean_sc.columns, index=cluster_sort)
for i in df_type_correlation.index:
    df_type_correlation.loc[i] = [np.corrcoef(mean_fish[i], mean_sc[x])[0,1] for x in df_type_correlation.columns]

In [None]:
fig, ax = plt.subplots(figsize=(6,6))

corr_max = np.max(df_type_correlation.max(axis=0))
im = ax.pcolor(df_type_correlation, cmap='viridis', vmin=0, vmax=corr_max)

y_pos = np.arange(len(df_type_correlation.index))
x_pos = np.arange(len(df_type_correlation.columns))
ax.set_xticks(x_pos+0.5)
ax.set_xticklabels(df_type_correlation.columns, rotation='vertical', fontsize=6)
ax.set_yticks(y_pos+0.5)
ax.set_yticklabels(df_type_correlation.index, fontsize=6)


ax.set_aspect('equal')
ax.set_title('Cell type correlation')
cax = fig.add_axes([.91, 0.13, 0.03, 0.4])
colorbar = fig.colorbar(im, cax=cax, ticks=[0,corr_max], )
colorbar.set_ticklabels(['0', 'max'])
ax.set_ylim((0,len(df_type_correlation.index)))
ax.set_xlim((0,len(df_type_correlation.columns)))
ax.invert_yaxis()
plt.tight_layout()

plt.savefig('Cell_type_correlation.png')

# Spatial organization

In [None]:
np.array(coord_df.loc[:,label_cells[0]].T)

## Cell type regionalization

### Ripley's K

In [None]:
from astropy.stats import RipleysKEstimator

In [None]:
#Choose a range for the radii to use for the Ripley's K
radius = 7000
plot_cell_pos(coord_df, cell_ids=None, cell_of_interest=None, s=10, standalone=True, mode='Highlight')
circle1 = plt.Circle((25000,35000), radius, lw=0, color='red', alpha=0.4)
circle2 = plt.Circle((10000,17000), radius, lw=0, color='red', alpha=0.4)
ax = plt.gca()
ax.add_artist(circle1)
ax.add_artist(circle2)
ax.set_title('Radius = {}'.format(radius))

In [None]:
np.argmin(np.abs(np.linspace(1000,20000, 100) - 3000))

In [None]:
np.linspace(1000,20000, 100)

In [None]:
###

In [None]:
x_min = coord_df.loc['X'].min()
x_max = coord_df.loc['X'].max()
y_min = coord_df.loc['Y'].min()
y_max = coord_df.loc['Y'].max()
area = ((x_max-x_min)*(y_max-y_min))
Kest = RipleysKEstimator(area, x_max, y_max, x_min, y_min)
radii = np.linspace(1000,20000, 100)

plt.figure()
colors = np.arange(0, 1, 1/len(np.unique(labels_a)))
cmap = matplotlib.cm.get_cmap('prism')

RK_3000 = []
RK_10000 = []
for i in np.sort(np.unique(labels_a)):
    estimate =  Kest(data=np.array(coord_df.loc[:,label_cells[i]].T), radii=radii, mode='none')
    RK_3000.append(estimate[np.argmin(np.abs(radii - 3000))]) #Get the estimate of the radii closest to 3000 and append list
    RK_10000.append(estimate[np.argmin(np.abs(radii - 10000))]) #Get the estimate of the radii closest to 10000 and append list
    plt.plot(radii, estimate, color=cmap(colors[i]))

plt.title("Ripley's K for different radii")
plt.xlabel('Radius in pixels')
    

In [None]:
plt.figure(figsize=(20,12))
n_clusters = len(np.unique(labels_a))
gs = plt.GridSpec(5, int(n_clusters/5) + (n_clusters % 5 > 0))

for i, c in enumerate(np.argsort(RK_10000)[::-1]):
    plt.subplot(gs[i])
    filt = np.sort(labels) == c
    plot_cell_pos(coord_df, cell_ids=list(df_sort.columns[filt]), cell_of_interest=None, s=1, standalone=False, color_highlight1='r')
    ax = plt.gca()
    for tl in ax.get_xticklabels() + ax.get_yticklabels():
        tl.set_visible(False)
    #ax.set_title('Cluster {}'.format(c))
    ax.set_title('{}\n{} {}'.format("{:.2e}".format(RK_10000[c]),c, cluster_labels[c]), fontsize=6)

In [None]:
def create_connectivity(mask, kind="queen"):
    """ Create a connectivity matrix of the pixels in a image

    Args
    ----
    mask: np.2darray
        Square image of side N
    kind: str default 'queen
        The kind of connectivity to apply. Can be: rook, bishop, queen (as in chess)

    Returns
    -------
    connectivity_matrix: np.ndarray
        A connectivity matrix (N^2, N^2) where N is the side of mask
    """

    ll = mask.shape[0]
    ll += 2  # add a 1 pixel margin all around the image to simplify the construction (will be removed as last step)
    cstm_connectivity = np.zeros((ll**2, ll**2))  # initialize empty
    pixel_ixs = np.arange(ll**2)

    # Compute the indexes of the fake edges
    real_pixel_bool = np.ones((ll, ll), dtype=bool)
    real_pixel_bool[0, :] = False
    real_pixel_bool[-1, :] = False
    real_pixel_bool[:, 0] = False
    real_pixel_bool[:, -1] = False
    real_pixel_bool = real_pixel_bool.flat[:]
    real_pixel_ixs = pixel_ixs[real_pixel_bool]

    # Neighbour rule
    if kind == "rook":
        neig_relative_ix_pos = np.array([+1, -1, -ll, ll])
    elif kind == "bishop":
        neig_relative_ix_pos = np.array([-ll + 1, ll + 1, -ll - 1, ll - 1])
    elif kind in ["queen", "king"]:
        neig_relative_ix_pos = np.array([+1, -1, -ll, ll, -ll + 1, ll + 1, -ll - 1, ll - 1])

    # Set True at where there is connectivity
    cstm_connectivity[real_pixel_ixs[:, None], real_pixel_ixs[:, None] + neig_relative_ix_pos] = True
    # Is the same as following but using broadcasting
    # for i in real_pixel_ixs:
    #     cstm_connectivity[i, neig_relative_ix_pos+i] = True

    # Remove connectivity entry corresponding to the dummy 1 pixel edges
    cstm_connectivity = cstm_connectivity[real_pixel_ixs[:, None], real_pixel_ixs[None, :]]
    
    return cstm_connectivity

In [None]:
plt.figure()
plt.pcolor(create_connectivity(np.array([[1,1,1],[1,0,1],[1,1,1]])))

In [None]:
plt.figure()
plt.pcolor(create_connectivity(np.array([[1,1,1],[0,0,0],[1,1,1]]).astype(bool)))

In [None]:
a = np.array([[1,1,1],[0,0,0],[1,1,1]])
af = a.flat[:]

In [None]:
a

In [None]:
af[None,:]

In [None]:
af[:,None]

In [None]:
q = create_connectivity(a)

In [None]:
(q*af[None,:])*af[:,None]

### Tissue regionaliztion (MOVED TO "CORTEX SPATIAL ANALYSIS.IPYNB")

In [None]:
from sklearn import svm

In [None]:
X_loc = coord_df.loc[:,df_sort.columns].values.T
y_loc = labels_a

In [None]:
[x*10000 for x in [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]]

In [None]:
np.random.normal(0, scale=1000)

In [None]:
cloud = 200
grid_c = np.zeros([16, 2])

for i, p in enumerate(grid_c):
    x = (int(i/4))*10000
    y = [x*10000 for x in [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]]
    y = y[i]
    grid_c[i] = x,y

    
grid_p = np.zeros([len(grid_c)*cloud, 2])
label_p = []
for i, p in enumerate(grid_c):
    print(p)
    for n in range(cloud):
        label_p.append(i)
        pos = i*cloud + n
        x_shift = np.random.normal(0, scale=1000)
        y_shift = np.random.normal(0, scale=1000)
        grid_p[pos] = p[0]+x_shift, p[1]+y_shift
        
def cell_scatter(arr2D, color='blue'):
    plt.figure()
    for i in range(len(arr2D)):
        plt.scatter(arr2D[i][0], arr2D[i][1], c=color)
        
cell_scatter(grid_p)

In [None]:
np.unique(label_p)

In [None]:
X_loc.max()

In [None]:
model = svm.SVC(C=1000, kernel='rbf', degree=3, gamma=0.00000001, coef0=0.0, shrinking=True, 
                probability=False, tol=0.001, cache_size=200, class_weight=None, 
                verbose=False, max_iter=-1, decision_function_shape=None, random_state=None)
#clf = model.fit(X_loc, y_loc)
clf = model.fit(grid_p, label_p)

In [None]:
X_loc.shape

In [None]:
def make_meshgrid(x, y, h=1000):
    """Create a mesh of points to plot in

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_min, x_max = x.min() , x.max()
    y_min, y_max = y.min() , y.max() 
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy


def plot_contours(ax, clf, xx, yy, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

In [None]:
X0, X1 = X_loc[:, 0], X_loc[:, 1]
xx, yy = make_meshgrid(X0, X1, h=1000)

In [None]:
X0, X1 = grid_p[:, 0], grid_p[:, 1]
xx, yy = make_meshgrid(X0, X1, h=100)

In [None]:
Z

In [None]:
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
fig, ax = plt.subplots(figsize=(5,5))
ax.contourf(xx, yy, Z, cmap=plt.cm.RdBu)

In [None]:
np.unique(Z)

In [None]:
clf.predict([[10000,10000]])

In [None]:
import pysal #Look into regionalization for clustering of areas of the tissue
http://pysal.readthedocs.io/en/latest/users/tutorials/region.html

In [None]:
import clusterpy # Clustering the areas of the tissue
http://www.rise-group.org/risem/clusterpy/index.html

In [None]:
pysal.

# Ordering of leaves

In [None]:
#Make df with count averages per cluster
df_count_average = pd.DataFrame(index=df_sort.index, columns=np.unique(labels_a))

#Semi binarize count average (all below mean is set to 0)
df_count_average_bin = df_count_average
for index in df_count_average_bin.index: 
    binarized_mean = []
    for c in df_count_average_bin.loc[index]:
        if c < df_count_average_bin.loc[index].mean():
            c = 0
        binarized_mean.append(c)
    df_count_average_bin.loc[index] = binarized_mean


for l in range(len(np.unique(labels_a))):
    filt = np.sort(labels) == l
    mean = np.array(df_sort.loc[:,filt].T.mean())
    #std = np.array(np.std(df_sort.loc[:,filt], axis=1))
    df_count_average[l] = mean

#Transpose, otherwise you are doing it on genes instead of clusters
D = pdist(df_count_average.T, 'euclidean')
#On semi-binarized data # It does not really work
#D = pdist(df_count_average_bin.T, 'euclidean')
Z = linkage(D, 'ward')

optimal_Z = optimal_leaf_ordering(Z, D)

In [None]:
polo.polo.leaves_list(optimal_Z)

In [None]:
df_count_average.loc[:,list(polo.polo.leaves_list(optimal_Z))].head(5)

In [None]:
for index in df_count_average.index: 
    binarized_mean = []
    for c in df_count_average.loc[index]:
        if c < df_count_average.loc[index].mean():
            c = 0
        binarized_mean.append(c)
    df_count_average.loc[index] = binarized_mean

df_count_average

In [None]:
fig, ax = plt.subplots()
hierarchy.dendrogram(optimal_Z, truncate_mode='lastp', p=40, show_contracted=True, orientation='top', );

# Colors

In [None]:
gene_color = [
'#850088',
'#9d19d4',
'#ff50b4',
'#ff83f9',
'#d270ff',
'#d270ff',
'#d4abff',
'#7b49f7',
'#ff8498',
'#cdc1ff',
'#003d5e',
'#003d5e',    
'#000070',
'#6e83ff',
'#01afea',
'#d84b00',
'#543400',
'#ff7043',
'#ef9800',
'#005227',
'#436000',
'#436000',
'#01d134',
'#9aff81',
'#95c17d',
'#5da600',
'#436000',
'#ccff21',
'#203500',
'#e0d674',
'#ffe57a',
'#00ac72',
'#4dffc1',
'#01b0af',
'#01b0af',
'#dd006a',
'#bb0020',
'#760022',
'#94fcd8']

gene_color_dict = dict(zip(gene_sort_fish, gene_color))

n = len(gene_sort_fish)
ncols = 1
nrows = n // ncols + 1

fig, ax = plt.subplots(figsize=(8, 5))

# Get height and width
X, Y = fig.get_dpi() * fig.get_size_inches()
h = Y / (nrows + 1)
w = X / ncols

for i, name in enumerate(gene_sort_fish):
    col = i % ncols
    row = i // ncols
    y = Y - (row * h) - h

    xi_line = w * (col + 0.05)
    xf_line = w * (col + 0.25)
    xi_text = w * (col + 0.3)

    ax.text(xi_text, y, name, fontsize=(h * 0.6),
            horizontalalignment='left',
            verticalalignment='center')

    ax.hlines(y + h * 0.1, xi_line, xf_line,
              color=gene_color_dict[name], linewidth=(h * 0.6))

ax.set_xlim(0, X)
ax.set_ylim(0, Y)
ax.set_axis_off()

fig.subplots_adjust(left=0, right=1,
                    top=1, bottom=0,
                    hspace=0, wspace=0)
plt.savefig('Gene_colors.svg')
plt.show()

In [None]:
cluster_labels

In [None]:
type_color = {
    0:[8,11,135],
    1:[0,192,255],
    2:[0,119,0],
    3:[0,227,215],
    4:[0,141,111],
    5:[16,177,186],
    6:[255,110,215],
    7:[0,73,193],
    8:[63,118,241],
    9:[250,0,25],
    10:[255,99,28],
    11:[90,90,90],
    12:[0,130,235],
    13:[222,1,191],
    14:[236,64,242],
    15:[129,239,239],
    16:[68,133,35],
    17:[187,103,194],
    18:[0,143,193],
    19:[215,160,60],
    20:[181,64,242],
    21:[46,195,47],
    22:[105,222,103],
    23:[104,214,150],
    24:[95,0,159],
    25:[180,141,255],
    26:[36,155,104],
    27:[99,85,215],
    28:[255,254,187],
    29:[101,160,0]
}


type_color_35 = {
    0: [192,192,192],
 1: [0,79,154],
 2: [65,96,208],
 3: [105,105,105],
 4: [0,160,216],
 5: [14,206,255],
    6:[103,72,239],
7:[230,0,158],
8:[39,28,175],
9:[242,96,255],
10:[107,0,167],
11:[210,76,254],
12:[166,102,255],
13:[151,8,205],
14:[193,86,229],
 15: [224,173,249],
 16: [0,132,3],
 17: [0,193,26],
 18: [192,225,60],
 19: [81,167,29],
 20: [117,227,55],
 21: [189,0,43],
 22: [243,73,0],
 23: [255,106,88],
 24: [1,161,128],
 25: [231,0,1],
 26: [255,249,109],
 27: [238,144,136],
 28: [255,177,0],
 29: [255,199,0],
 30: [123,255,167],
 31: [255,138,53],
 32: [218,255,178],
 33: [0,86,247],
 34: [118,145,255]    }
#[0,0,193],
type_color_47 = {
    0: [0,66,234],
    1: [138,138,138],
    2: [0,203,251],
    3: [0,81,227],
    4: [186,61,0],
    5: [0,45,237],
    6: [168,225,172],
    7: [188,93,17],
    8: [236,224,138],
    9: [222,210,229],
    10: [229,105,5],
    11: [225,149,55],
    12: [255,229,3],
    13: [225,201,34],
    14: [229,83,11],
    15: [225,114,0],
    16: [255,0,0],
    17: [0,77,100],
    18: [0,196,222],
    19: [0,222,249],
    20: [220,64,161],
    21: [203,161,178],
    22: [116,56,203],
    23: [255,100,122],
    24: [165,109,218],
    25: [126,67,76],
    26: [100,100,100],
    27: [204,74,216],
    28: [216,111,131],
    29: [119,50,144],
    30: [194,154,216],
    31: [155,47,100],
    32: [214,122,192],
    33: [114,75,119],
    34: [53,206,58],
    35: [53,163,58],
    36: [53,142,58],
    37: [53,235,58],
    38: [103,255,104],
    39: [0,147,253],
    40: [0,99,118],
    41: [75,75,75],
    42: [0,0,193],
    43: [0,30,155],
    44: [254,41,90],
    45: [0,114,130],
    46: [0,21,106]
}

#pickle.dump(type_color_47, open( "type_color_47.p", "wb" ))

In [None]:
color_dict = type_color_47

def rgb_to_hex(red, green, blue):
    """Return color as #rrggbb for the given color values."""
    return '#%02x%02x%02x' % (red, green, blue)

#Convert the dictionary to HEX
type_color_hex = {}
for k in color_dict:
    type_color_hex[k] = rgb_to_hex(color_dict[k][0], color_dict[k][1], color_dict[k][2])


#Make label list with HEX values for all cells
label_colors_hex = []
label_colors_rgb = []
for l in labels:
    hex_color = rgb_to_hex(color_dict[l][0], color_dict[l][1], color_dict[l][2])
    #label_colors.append(type_color[l])
    label_colors_hex.append(hex_color)
    label_colors_rgb.append(color_dict[l])


In [None]:
#Same as for heatmap
#type_order = [14, 6, 10, 7, 8, 11, 12, 9, 13, 15, 
#33, 34,   5, 2, 1, 4,
#31, 22,  28,  23,
#30, 18, 20, 17, 16, 19,
# 24, 29,26, 25, 27, 21, 32,
#                     0,3,]

n = len(color_dict)
ncols = 1
nrows = n // ncols + 1

fig, ax = plt.subplots(figsize=(8, 5))

# Get height and width
X, Y = fig.get_dpi() * fig.get_size_inches()
h = Y / (nrows + 1)
w = X / ncols

for i, label in enumerate(cluster_sort):
    col = i % ncols
    row = i // ncols
    y = Y - (row * h) - h

    xi_line = w * (col + 0.02)
    xf_line = w * (col + 0.25)
    xi_text = w * (col + 0.3)

    ax.text(xi_text, y, '{} {}'.format(label, cluster_labels[label]), fontsize=(h * 0.7),
            horizontalalignment='left',
            verticalalignment='center')

    ax.hlines(y + h * 0.1, xi_line, xf_line,
              color=type_color_hex[label], linewidth=(h * 0.6))

ax.set_xlim(0, X)
ax.set_ylim(0, Y)
ax.set_axis_off()

fig.subplots_adjust(left=0, right=1,
                    top=1, bottom=0,
                    hspace=0, wspace=0)
plt.savefig('Type_labels_47clusters.svg')
plt.show()

In [None]:
# Old 30label version
type_names = ['Glut Claustrum', 'Glut L6 Lamp5-', 'Olig Mature 1', 'Glut L4 Lamp5+', 'Ependymal/Plexus', 'Glut CA1', 'GABA Cnr1 Kcnip-low', 'Glut L2/3 Cpne5-',
             'Glut L5', 'Endothelial/pericytes', 'Astrocytes', 'Bad cells', 'Glut L2/3 Cpne5+', 'GABA Kcnip Pthlh', 'GABA Crh', 'Glut L4 Lamp5-', 
             'Olig Mature 2', 'GABA Kcnip', 'Glut L6 Lamp5+', 'Microglia', 'GABA Cnr1 Vip', 'Olig Newly Formed2', 'Olig Committed precursor', 'OPC', 
             'GABA Lamp5', 'GABA Crhbp', 'Olig Myelin Forming', 'GABA Lamp5 Cnr1', 'VLMC', 'Olig Newly formed1']



type_names_sort = ['GABA Crh','GABA Crhbp','GABA Kcnip','GABA Kcnip Pthlh', 'GABA Cnr1 Kcnip-low','GABA Lamp5','GABA Lamp5 Cnr1','GABA Cnr1 Vip', 
    'Glut L2/3 Cpne5+', 'Glut L2/3 Cpne5-', 'Glut L4 Lamp5+','Glut L4 Lamp5-', 'Glut L5', 'Glut L6 Lamp5+', 'Glut L6 Lamp5-', 'Glut CA1', 'Glut Claustrum',
                   'OPC','Olig Committed precursor', 'Olig Newly formed1','Olig Newly Formed2','Olig Myelin Forming','Olig Mature 1','Olig Mature 2',
                   'Astrocytes',
                   'Ependymal/Plexus', 'Endothelial/pericytes', 'Microglia','VLMC', 'Bad cells']

type_names_labels =dict(zip(type_names, list(type_color.keys())))



n = len(gene_sort_fish)
ncols = 1
nrows = n // ncols + 1

fig, ax = plt.subplots(figsize=(8*2, 5*2))

# Get height and width
X, Y = fig.get_dpi() * fig.get_size_inches()
h = Y / (nrows + 1)
w = X / ncols

for i, name in enumerate(type_names_sort):
    col = i % ncols
    row = i // ncols
    y = Y - (row * h) - h

    xi_line = w * (col + 0.02)
    xf_line = w * (col + 0.25)
    xi_text = w * (col + 0.3)

    ax.text(xi_text, y, type_names_sort[i], fontsize=(h * 0.8),
            horizontalalignment='left',
            verticalalignment='center')

    ax.hlines(y + h * 0.1, xi_line, xf_line,
              color=type_color_hex[type_names_labels[name]], linewidth=(h * 0.6))

ax.set_xlim(0, X)
ax.set_ylim(0, Y)
ax.set_axis_off()

fig.subplots_adjust(left=0, right=1,
                    top=1, bottom=0,
                    hspace=0, wspace=0)
#plt.savefig('Type_labels_30clusters.svg')
plt.show()


In [None]:
tSNE_points

In [None]:
fig, ax = plt.subplots(figsize=(10,10))

points = np.zeros((len(df_sort.columns),2))
for i, n in enumerate(df_sort.columns):
    points[i,:] = tSNE_points[df_fish.columns.get_loc(n),:]
    


ax.scatter(points[:,0], points[:,1],c=label_colors_hex,lw=0, alpha=1, s=25)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')

#fig.savefig('tSNE_30clusters_colored_600dpi.png', dpi=600)

# Size normalization
Data normalized by cell area instead of max count

In [None]:
#Build cell_area dictionary of all included cells
cell_area = {}
for i in df_fish.columns:
    cell_area[i] = ds.col_attrs['CellArea'][list(ds.col_attrs['CellID']).index(i)]

In [None]:
X = df_fish.values.astype(np.float64)
#X_norm = X.sum(0).mean() * (X/X.sum(0))
X_norm_size = X * (median / np.array(list(cell_area.values())))
X_log_size = np.log2(X_norm_size+1)
X_log_size

In [None]:
ac30_Nsize = AgglomerativeClustering(n_clusters=30).fit(X_log_size.T)
#Does not make the clustering better. But size normalization might not be correct

# Violin

In [None]:
def LB_violin(data_frame, label,types, probes, save=False):
    '''
    Function to plot violin plots from single cell gene expression data. 
    Input: 
        -Data frame with cells as columns and genes as rows. 
        -Dictionary coupling cellID with their cell type
        -List of types to plot.
        -List of genes to plot.
        -Set save to True, to save the plot as a .SVG with transparent background.
    Author: Lars Borm            
    '''
    
    #nrows = shape(data_frame)[0]
    nrows = len(probes)
    figsize_x = 0.5 * len(types)
    figsize_y = 1 * len(probes)
    fig, axes = plt.subplots(nrows=nrows, ncols=1, figsize=(figsize_x, figsize_y), sharex=True)
    
    #iterate through rows/genes of the data
    for n in range(nrows):
        #Make list of n lists, where n = number of types
        data =[[] for item in types]
        
        #Fill the lists
        gene_name = probes[n]
        for cell in data_frame:
            cell_type = label[cell]
            if (cell_type in types) == True: 
                data[types.index(cell_type)].append(data_frame.loc[gene_name, cell])
        
        #Set figure parameters
        sns.set_style("white")
        sns.violinplot(data=data, palette="Set2", bw=0.2, cut=0, linewidth=1, scale="width", ax = axes[n], spines='left')
        sns.despine(top=True, right=True)
    
    for i, ax in enumerate(axes):

        #ax.yaxis.grid(True)
        ax.set_ylabel(probes[i], rotation='horizontal', size='large', horizontalalignment = 'right')

        #set the limits between zero and the end
        start, end = ax.get_ylim()
        ax.set_ylim(0, end)
        ax.set_yticks(np.arange(end/2,end+1, end/2))
        
    # add x-tick labels
    plt.xticks(rotation=-70, size='large') 
    plt.setp(axes,xticklabels=types)

    if save == True:
        name = input('Will save as transparent SVG. Carefull with overwriting.\nSpecify name: ')
        name = name + '.svg'
        plt.savefig(name, transparent = True)
        
    plt.show()

In [None]:
list(np.unique(labels))

In [None]:
import seaborn as sns
LB_violin(df_log, cell_labels, list(np.unique(labels)), gene_sort_fish)