In [None]:
# import modules, define some functions for loading, saving and processing a gene-barcode matrix
%matplotlib inline
import collections
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp_sparse
import h5py
from scipy import stats
import csv
from scipy import ndimage
from scipy import spatial


np.random.seed(0)

FeatureBCMatrix = collections.namedtuple('FeatureBCMatrix', ['feature_ids', 'feature_names', 'barcodes', 'matrix'])

def get_matrix_from_h5(filename):
    """Load a feature-barcode matrix from an HDF5 file.

    The file is expected to carry a ``version`` attribute and a ``matrix``
    group holding ``features/id``, ``features/name``, ``barcodes`` and the
    CSC components ``data``, ``indices``, ``indptr`` and ``shape``.

    Parameters
    ----------
    filename : str
        Path to the HDF5 file.

    Returns
    -------
    FeatureBCMatrix
        ``feature_ids`` (list of str), ``feature_names`` (ndarray of str),
        ``barcodes`` (ndarray, values as stored in the file) and ``matrix``
        (``scipy.sparse.csc_matrix`` with features as rows, barcodes as columns).

    Raises
    ------
    ValueError
        If the file has no ``version`` attribute, or its version is > 2.
    """
    # Open read-only: this function never writes, and an explicit mode avoids
    # depending on the h5py default (which differs between h5py releases).
    with h5py.File(filename, 'r') as f:
        if u'version' not in f.attrs:
            # No version number is available here, so the message must not try to format one.
            raise ValueError('Matrix HDF5 file has no version attribute; it is an older format version that is not supported by this function.')
        version = f.attrs['version']
        if version > 2:
            raise ValueError('Matrix HDF5 file format version (%d) is an newer version that is not supported by this function.' % version)

        matrix_group = f['matrix']
        feature_ids = [x.decode('ascii', 'ignore') for x in matrix_group['features']['id']]
        feature_names = np.asarray([x.decode('ascii', 'ignore') for x in matrix_group['features']['name']])
        barcodes = np.asarray(list(matrix_group['barcodes'][:]))
        # Materialise the datasets as in-memory arrays before the file is closed.
        matrix = sp_sparse.csc_matrix(
            (matrix_group['data'][:], matrix_group['indices'][:], matrix_group['indptr'][:]),
            shape=tuple(matrix_group['shape'][:]))
        return FeatureBCMatrix(feature_ids, feature_names, barcodes, matrix)

def get_expression(fbm, gene_name):
    """Return the expression vector (one value per barcode) for a gene.

    Parameters
    ----------
    fbm : FeatureBCMatrix-like
        Object with ``feature_names`` (list or ndarray of names) and
        ``matrix`` (sparse matrix with features as rows).
    gene_name : str
        Name to look up in ``fbm.feature_names``; the first match is used.

    Returns
    -------
    numpy.ndarray
        Dense row of ``fbm.matrix`` for that gene.

    Raises
    ------
    Exception
        If ``gene_name`` is not present in ``fbm.feature_names``.
    """
    try:
        # feature_names may be a NumPy array (which has no .index method),
        # so go through a list to support both lists and arrays.
        gene_index = list(fbm.feature_names).index(gene_name)
    except ValueError:
        raise Exception("%s was not found in list of gene names." % gene_name)
    return fbm.matrix[gene_index, :].toarray().squeeze()
        
def subsample_matrix(gbm, barcode_indices):
    """Build a new FeatureBCMatrix restricted to the given barcode columns.

    Feature ids and names are passed through unchanged; ``barcodes`` and the
    columns of ``matrix`` are both indexed with ``barcode_indices``.
    """
    kept_barcodes = gbm.barcodes[barcode_indices]
    kept_columns = gbm.matrix[:, barcode_indices]
    return FeatureBCMatrix(
        feature_ids=gbm.feature_ids,
        feature_names=gbm.feature_names,
        barcodes=kept_barcodes,
        matrix=kept_columns,
    )

In [None]:
# Dataset selection and output location
folder = 'FLARE2'
project = 'Thy1M_Aggr'
dataset = 'Thy1M_Aggr'
params = ''
savefolder = "/media/storage/ckk/genomics/scripts/figs/NAc/"

analysis_dir = "/media/storage/ckk/genomics/data/{0}/reanalysis/{1}/{2}_QC{3}/outs/analysis/".format(
    folder, project, dataset, params)
tsne_file = "{0}tsne/2_components/projection.csv".format(analysis_dir)

# Only the 20190110_Thy1LM project uses the 9-cluster k-means results;
# every other project uses the graph-based clustering.
if project == '20190110_Thy1LM':
    clustering_subdir = "kmeans_9_clusters"
    print('9 clusters')
else:
    clustering_subdir = "graphclust"
    print('Graph-based')
clusters_file = "{0}clustering/{1}/clusters.csv".format(analysis_dir, clustering_subdir)
diffexp_file = "{0}diffexp/{1}/differential_expression.csv".format(analysis_dir, clustering_subdir)

# Load the t-SNE projection and the cluster assignment tables
tsne = pd.read_csv(tsne_file)
clusters = pd.read_csv(clusters_file)

# Plot every cell in t-SNE space, coloured by cluster id
plt.figure(figsize=(10, 10))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=clusters['Cluster'], linewidths=0)
plt.title('Graph-Based Clustering: RAW')
plt.savefig(savefolder + project + '_tsneclust.eps', format='eps', dpi=1000)

# Barcodes flagged for removal; filled in by the outlier-removal cells
del_barcodes = np.array([])


In [None]:
# Round 1: remove outliers (scale based on size of cluster)
#
# For every cluster id 1..num_clusters the t-SNE coordinates of its cells are
# rescaled to the cluster's own min/max range, the per-axis median is taken as
# the cluster centre, and cells farther than 0.5 (in scaled units) from that
# centre are flagged as outliers.
#
# NOTE: this cell rebinds `clusters` and `tsne` to the filtered tables, so it
# is not idempotent -- re-run the loading cell before re-running this one.
num_clusters=max(clusters['Cluster'])
comx=np.array([])
comy=np.array([])
outlier_rows_round1=[]
outlier_barcodes_round1=[]
for x in range(0,num_clusters):
    # positional row indices of the cells in cluster x+1
    curr_clust=np.where(clusters['Cluster']==x+1)[0]

    # scale to min and max
    clust_tsne1=tsne['TSNE-1'].iloc[curr_clust]
    clust_tsne2=tsne['TSNE-2'].iloc[curr_clust]
    xmin=np.min(clust_tsne1)
    xmax=np.max(clust_tsne1)
    ymin=np.min(clust_tsne2)
    ymax=np.max(clust_tsne2)

    # .values drops the pandas index so everything below is purely positional
    # (label-based lookups only work while the index is the default RangeIndex).
    curr_x=((clust_tsne1-xmin)/(xmax-xmin)).values
    curr_y=((clust_tsne2-ymin)/(ymax-ymin)).values

    comx=np.append(comx,np.median(curr_x))
    comy=np.append(comy,np.median(curr_y))

    # Euclidean distance of every cell to the cluster centre, vectorised
    dist=np.sqrt((curr_x-comx[x])**2+(curr_y-comy[x])**2)
    outliers=np.where(dist>0.5)[0]
    outlier_rows_round1.append(curr_clust[outliers])
    outlier_barcodes_round1.append(clusters['Barcode'].iloc[curr_clust[outliers]].values)

if outlier_rows_round1:
    del_outliers_round1=np.concatenate(outlier_rows_round1).astype(int)
    del_barcodes_round1=np.concatenate(outlier_barcodes_round1)
else:
    del_outliers_round1=np.array([],dtype=int)
    del_barcodes_round1=np.array([])

clusters=clusters.drop(clusters.index[del_outliers_round1])
tsne=tsne.drop(tsne.index[del_outliers_round1])

# Plot Round 1 results
plt.figure(figsize=(10, 10))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=clusters['Cluster'], linewidths=0)
plt.title('Graph-Based Clustering: Outliers Round 1')

del_barcodes=del_barcodes_round1


In [None]:
# Round 2 remove outliers
#
# Same procedure as round 1, applied to the tables that round 1 already
# filtered: rescale each cluster to its own min/max range, use the per-axis
# median as centre and flag cells farther than 0.5 scaled units away.
#
# NOTE: this cell rebinds `clusters` and `tsne`, so it is not idempotent.
num_clusters=max(clusters['Cluster'])
comx=np.array([])
comy=np.array([])
outlier_rows_round2=[]
outlier_barcodes_round2=[]
for x in range(0,num_clusters):
    # positional row indices of the cells in cluster x+1
    curr_clust=np.where(clusters['Cluster']==x+1)[0]

    # scale to min and max
    clust_tsne1=tsne['TSNE-1'].iloc[curr_clust]
    clust_tsne2=tsne['TSNE-2'].iloc[curr_clust]
    xmin=np.min(clust_tsne1)
    xmax=np.max(clust_tsne1)
    ymin=np.min(clust_tsne2)
    ymax=np.max(clust_tsne2)

    # .values drops the pandas index: after round 1 the index has gaps, so
    # only positional access is valid here.
    curr_x=((clust_tsne1-xmin)/(xmax-xmin)).values
    curr_y=((clust_tsne2-ymin)/(ymax-ymin)).values

    comx=np.append(comx,np.median(curr_x))
    comy=np.append(comy,np.median(curr_y))

    # Euclidean distance of every cell to the cluster centre, vectorised
    dist=np.sqrt((curr_x-comx[x])**2+(curr_y-comy[x])**2)
    outliers=np.where(dist>0.5)[0]
    outlier_rows_round2.append(curr_clust[outliers])
    outlier_barcodes_round2.append(clusters['Barcode'].iloc[curr_clust[outliers]].values)

if outlier_rows_round2:
    del_outliers_round2=np.concatenate(outlier_rows_round2).astype(int)
    del_barcodes_round2=np.concatenate(outlier_barcodes_round2)
else:
    del_outliers_round2=np.array([],dtype=int)
    del_barcodes_round2=np.array([])

clusters=clusters.drop(clusters.index[del_outliers_round2])
tsne=tsne.drop(tsne.index[del_outliers_round2])

# Plot Round 2 results
plt.figure(figsize=(10, 10))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=clusters['Cluster'], linewidths=0)
plt.title('Graph-Based Clustering: Outliers Round 2')
# NOTE(review): same path as the figure saved when the raw clustering was
# plotted, so this overwrites that file -- confirm this is intended.
plt.savefig(savefolder+project+'_tsneclust.eps', format='eps', dpi=1000)

# All barcodes removed by either round
del_barcodes=np.append(del_barcodes_round1,del_barcodes_round2)





In [None]:
# Locate the filtered feature-barcode matrix for this dataset and load it
matrices_dir = "/media/storage/ckk/genomics/data/{0}/counts/{1}/{2}/outs/".format(folder, project, dataset)
file_matrix_h5 = "{0}filtered_feature_bc_matrix.h5".format(matrices_dir)

gene_bc_matrix = get_matrix_from_h5(file_matrix_h5)

In [None]:
# Paths of the per-dataset QC barcode list and index file
barcodes_dir = "/media/storage/ckk/genomics/data/{0}/reanalysis/{1}/Barcodes/".format(folder, project)
barcodes_file = "{0}{1}_QC_barcodes.csv".format(barcodes_dir, dataset)
indices_file = "{0}{1}_QC_indices.npy".format(barcodes_dir, dataset)

In [None]:
# Read the barcode list: first CSV column, encoded to bytes; the first row is
# a header and is dropped. The `with` block closes the file.
with open(barcodes_file) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    barcode_rows = [row[0].encode('UTF-8') for row in csv_reader]
neuron_barcodes=np.array(barcode_rows[1:])

# Map every listed barcode to its column in the count matrix with a single
# dictionary lookup instead of scanning all matrix barcodes per entry.
barcode_to_column = {bc: col for col, bc in enumerate(gene_bc_matrix.barcodes)}
missing_barcodes = [bc for bc in neuron_barcodes if bc not in barcode_to_column]
if missing_barcodes:
    raise KeyError('%d barcode(s) from %s are not present in the count matrix (first: %r)'
                   % (len(missing_barcodes), barcodes_file, missing_barcodes[0]))
neuron_cells=np.array([barcode_to_column[bc] for bc in neuron_barcodes],dtype=int)

print('Number of cells:',np.shape(neuron_cells))

# Delete outlier barcodes
# del_barcodes holds text barcodes, so encode them to match neuron_barcodes.
del_barcode_set = {bc.encode('UTF-8') for bc in del_barcodes}
del_outliers=np.array([i for i, bc in enumerate(neuron_barcodes) if bc in del_barcode_set],dtype=int)
neuron_cells=np.delete(neuron_cells,del_outliers)
neuron_barcodes=np.delete(neuron_barcodes,del_outliers)

# The downstream plots index expression vectors and the tsne/clusters tables
# in parallel, so these two row counts should agree.
print(np.shape(neuron_cells))
print(np.shape(clusters))

In [None]:
subset_matrix=subsample_matrix(gene_bc_matrix, neuron_cells)

# Dense count matrix (features as rows, selected barcodes as columns)
subset_matrix_exp = subset_matrix.matrix.toarray().squeeze()

# Calculate total and median UMI counts per cell
total_umicounts_per_cell = np.sum(subset_matrix_exp, axis=0)
median_umicount_across_cells = np.median(total_umicounts_per_cell)
scale_factor = total_umicounts_per_cell/median_umicount_across_cells

# Choose the transformation explicitly instead of computing one result and
# immediately overwriting it. 'none' reproduces what this cell did before:
# subset_matrix_norm is the raw count matrix (the very same array object).
#   'none'   -> raw counts
#   'median' -> counts divided by each cell's total-UMI / median-total-UMI
#   'log2'   -> log2(counts + 1)
NORMALIZATION = 'none'
if NORMALIZATION == 'none':
    subset_matrix_norm = subset_matrix_exp
elif NORMALIZATION == 'median':
    subset_matrix_norm = subset_matrix_exp/scale_factor
elif NORMALIZATION == 'log2':
    subset_matrix_norm = np.log2(subset_matrix_exp+1)
else:
    raise ValueError("Unknown NORMALIZATION %r (expected 'none', 'median' or 'log2')" % NORMALIZATION)

# Mean-center and scale variance per-gene
#mean_vals=np.nanmean(subset_matrix_norm,axis=1)
#mean_vals=np.reshape(mean_vals,(len(mean_vals),1))
#std_vals=np.nanstd(subset_matrix_norm,axis=1)
#std_vals=np.reshape(std_vals,(len(std_vals),1))
#subset_matrix_norm = (subset_matrix_norm - mean_vals)/std_vals

In [None]:
barcodes = subset_matrix.barcodes
def splitGEMs(n):
    """Return the code point of the last character of barcode ``n``.

    Indexing a bytes barcode already yields an int; for a text barcode the
    last character is converted with ``ord`` so both give the same codes.
    """
    last = n[-1]
    return ord(last) if isinstance(last, str) else last

# Code points of the barcode suffix characters '1', '2' and '3'
# (presumably the sample / GEM-well suffix of an aggregated run -- TODO confirm)
SUFFIX_1 = ord('1')  # 49
SUFFIX_2 = ord('2')  # 50
SUFFIX_3 = ord('3')  # 51

barcodes_last = np.asarray(list(map(splitGEMs, barcodes)))
group1=np.where(barcodes_last==SUFFIX_1)[0]
group2=np.where(barcodes_last==SUFFIX_2)[0]
group3=np.where(barcodes_last==SUFFIX_3)[0]

# Colour value per cell: suffix 1 -> -1, suffix 2 -> 1, suffix 3 -> 0.
# Work on a copy so barcodes_last keeps the original suffix codes.
groups=barcodes_last.copy()
groups[barcodes_last==SUFFIX_1]=-1
groups[barcodes_last==SUFFIX_2]=1
groups[barcodes_last==SUFFIX_3]=0

# plot all clusters in TSNE space by group
plt.figure(figsize=(5, 5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=groups, linewidths=0, s=5, vmin=-1, vmax=1)
plt.title('Graph-Based Clustering')
plt.savefig(savefolder+project+'_groups.eps', format='eps', dpi=1000)

In [None]:
# Identify top most differentiating genes per cluster
# Builds `gene_list` (up to 10 genes per cluster), `cell_list` (cell positions
# grouped by cluster id) and `expression_matrix` (one z-scored row per gene).
num_clusters=max(clusters['Cluster'])

# load the differential expression table
diffexp = pd.read_csv(diffexp_file)

gene_list=np.array([])
for x in range(0,num_clusters):
    # rank by this cluster's log2 fold change, highest first
    diffexp_sort=diffexp.sort_values(by=['Cluster '+str(x+1)+' Log2 fold change'],ascending=False)
    # keep rows with mean counts > 1 and adjusted p value < 0.1 for this cluster
    diffexp_nonzero=diffexp_sort[diffexp_sort['Cluster '+str(x+1)+' Mean Counts']>1]
    diffexp_sig=diffexp_nonzero[diffexp_nonzero['Cluster '+str(x+1)+' Adjusted p value']<0.1]
    # first 10 rows of the second column (presumably the gene name column -- TODO confirm)
    currgenes=diffexp_sig.iloc[0:10,1]
    gene_list=np.append(gene_list,currgenes)
    # x is the 0-based loop index, i.e. cluster id minus 1
    print(x)
    print(currgenes)

# positional indices of all cells, ordered cluster 1, 2, ..., num_clusters
cell_list=np.array([])
for x in range(0,num_clusters):
    curr_clust=np.where(clusters['Cluster']==x+1)[0]
    cell_list=np.append(cell_list,curr_clust).astype(int)
    
expression_matrix=np.zeros([len(gene_list),len(cell_list)])
for x in range(0,len(gene_list)):    
    gene_index = np.where(subset_matrix.feature_names == gene_list[x])[0]
    currexpr = subset_matrix_norm[gene_index,:]
    # row 0 = first feature matching the name; columns reordered by cluster
    currexpr = currexpr[0,cell_list]
    expression_matrix[x,:]=stats.zscore(currexpr)

print(np.shape(expression_matrix))
print(num_clusters)

In [None]:
# seaborn is only needed from here on; ideally this import lives in the first cell
import seaborn as sns

sns.set()
custom_map=sns.diverging_palette(240,10, sep=50, as_cmap=True)

# Heatmap of the z-scored expression matrix, colour range clipped to [-2, 2].
# plt.figure must be called (a bare `plt.figure` is a no-op expression).
plt.figure()
sns.heatmap(expression_matrix, cmap=custom_map, vmin=-2, vmax=2)
plt.savefig(savefolder+project+'_heatmap.eps', format='eps', dpi=300)

#plt.figure()
#sns.heatmap(group_matrix, cmap=custom_map, vmin=-1, vmax=1)
#plt.savefig('Thy1_Aggr_neurons_groupheatmap.eps', format='eps', dpi=300)


In [None]:
# Expression of the 'mCherry' feature in every cell
mCherry_gene=np.where(subset_matrix.feature_names == 'mCherry')[0]
mCherry_expr=np.asarray(subset_matrix_norm[mCherry_gene, :])[0]
print(max(mCherry_expr))

# plot mCherry expression in TSNE space
plt.figure(figsize=(5,5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=mCherry_expr, linewidths=0, s=5,cmap=plt.cm.Reds)
plt.title('Graph-Based Clustering')
plt.savefig(savefolder+project+'_mCherry.eps', format='eps', dpi=1000)

# threshold mCherry expression
thresh=1.5
# Copy before zeroing so mCherry_expr keeps the un-thresholded values
# (plain assignment would make both names refer to the same array).
mCherry_expr_thresh=mCherry_expr.copy()
mCherry_expr_thresh[mCherry_expr<thresh]=0
mCherry_cells=np.where(mCherry_expr>=thresh)[0]
print(len(mCherry_cells))

# plot thresholded mCherry expression in TSNE space
plt.figure(figsize=(5,5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=mCherry_expr_thresh, linewidths=0, s=5,cmap=plt.cm.Reds,vmin=0,vmax=thresh)
plt.title('Graph-Based Clustering')
#plt.savefig(savefolder+project+'_thresh.eps', format='eps', dpi=1000)




In [None]:
# Expression of the 'TEV' feature in every cell
TEV_gene=np.where(subset_matrix.feature_names == 'TEV')[0]
TEV_expr=np.asarray(subset_matrix_norm[TEV_gene, :])[0]
thresh=1.5
# Copy before zeroing: without it TEV_expr itself is thresholded and the
# "raw" plot below would show the thresholded values.
TEV_expr_thresh=TEV_expr.copy()
TEV_expr_thresh[TEV_expr<thresh]=0
TEV_cells=np.where(TEV_expr>=thresh)[0]
print(len(TEV_cells))

# plot un-thresholded TEV expression in TSNE space
plt.figure(figsize=(5,5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=TEV_expr, linewidths=0, s=5,cmap=plt.cm.Reds)
plt.title('Graph-Based Clustering')
# NOTE(review): no '_' separator before 'TEV', unlike the other figure names.
plt.savefig(savefolder+project+'TEV.eps', format='eps', dpi=1000)

# plot thresholded TEV expression in TSNE space
plt.figure(figsize=(5,5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=TEV_expr_thresh, linewidths=0, s=5,cmap=plt.cm.Reds,vmin=0,vmax=thresh)
plt.title('Graph-Based Clustering')

#plt.savefig(savefolder+project+'_thresh.eps', format='eps', dpi=1000)


In [None]:
# Expression of the 'tTA' feature in every cell
tTA_gene=np.where(subset_matrix.feature_names == 'tTA')[0]
tTA_expr=np.asarray(subset_matrix_norm[tTA_gene, :])[0]
thresh=1.5
# Copy before zeroing so tTA_expr keeps the un-thresholded values, and build
# the mask from tTA_expr itself (it was previously built from TEV_expr).
tTA_expr_thresh=tTA_expr.copy()
tTA_expr_thresh[tTA_expr<thresh]=0
tTA_cells=np.where(tTA_expr>=thresh)[0]
print(len(tTA_cells))

# plot un-thresholded tTA expression in TSNE space
plt.figure(figsize=(5,5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=tTA_expr, linewidths=0, s=5,cmap=plt.cm.Reds)
plt.title('Graph-Based Clustering')
# NOTE(review): no '_' separator before 'tTA', unlike the other figure names.
plt.savefig(savefolder+project+'tTA.eps', format='eps', dpi=1000)

# plot thresholded tTA expression in TSNE space
plt.figure(figsize=(5,5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=tTA_expr_thresh, linewidths=0, s=5,cmap=plt.cm.Reds,vmin=0,vmax=thresh)
plt.title('Graph-Based Clustering')

#plt.savefig(savefolder+project+'_thresh.eps', format='eps', dpi=1000)


In [None]:
# Plot UMI counts across clusters

def _cluster_mean_sem(expr, cluster_labels, n_clusters):
    """Mean and SEM (np.std / sqrt(n)) of ``expr`` for cluster ids 1..n_clusters.

    ``cluster_labels`` is compared position-by-position with ``expr``.
    Returns two arrays of length ``n_clusters``.
    """
    means=np.zeros(n_clusters)
    sems=np.zeros(n_clusters)
    for k in range(0,n_clusters):
        members=np.where(cluster_labels==k+1)[0]
        means[k]=np.mean(expr[members])
        sems[k]=np.std(expr[members])/np.sqrt(len(members))
    return means, sems

def _plot_cluster_umis(title, means, sems, outfile):
    """Plot per-cluster means with SEM error bars and save the figure as EPS."""
    cluster_ids=np.arange(len(means))+1
    # plt.figure must be called; a bare `plt.figure` does nothing
    plt.figure()
    plt.title(title)
    plt.plot(cluster_ids,means,'o',label='True')
    plt.errorbar(cluster_ids,means,sems,marker='o',linestyle='none')
    plt.savefig(outfile, format='eps', dpi=300)
    plt.show()

mCherry_gene=np.where(subset_matrix.feature_names == 'mCherry')[0]
mCherry_expr_norm=np.asarray(subset_matrix_norm[mCherry_gene, :])[0]
mCherry_clusters_norm_mean,mCherry_clusters_norm_sem=_cluster_mean_sem(mCherry_expr_norm,clusters['Cluster'],num_clusters)
_plot_cluster_umis('mCherry',mCherry_clusters_norm_mean,mCherry_clusters_norm_sem,savefolder+project+'_mCherryUMIs_clusters.eps')

TEV_gene=np.where(subset_matrix.feature_names == 'TEV')[0]
TEV_expr_norm=np.asarray(subset_matrix_norm[TEV_gene, :])[0]
TEV_clusters_norm_mean,TEV_clusters_norm_sem=_cluster_mean_sem(TEV_expr_norm,clusters['Cluster'],num_clusters)
_plot_cluster_umis('TEV',TEV_clusters_norm_mean,TEV_clusters_norm_sem,savefolder+project+'_TEVUMIs_clusters.eps')

tTA_gene=np.where(subset_matrix.feature_names == 'tTA')[0]
tTA_expr_norm=np.asarray(subset_matrix_norm[tTA_gene, :])[0]
tTA_clusters_norm_mean,tTA_clusters_norm_sem=_cluster_mean_sem(tTA_expr_norm,clusters['Cluster'],num_clusters)
_plot_cluster_umis('tTA',tTA_clusters_norm_mean,tTA_clusters_norm_sem,savefolder+project+'_tTAUMIs_clusters.eps')

In [None]:
# mCherry expression in cells expressing 'Adora2a' vs cells expressing 'Npy'
D2_gene=np.where(subset_matrix.feature_names == 'Adora2a')[0]
D2_expr=np.asarray(subset_matrix_norm[D2_gene, :][0])

# NOTE(review): these variables are named Slc17a7 but the feature looked up
# is 'Npy' -- confirm which gene is intended.
Slc17a7_gene=np.where(subset_matrix.feature_names == 'Npy')[0]
Slc17a7_expr=np.asarray(subset_matrix_norm[Slc17a7_gene, :][0])

thresh=1.5
# Copies keep the un-thresholded vectors intact
D2_expr_thresh=D2_expr.copy()
D2_expr_thresh[D2_expr<thresh]=0
D2_cells=np.where(D2_expr>=thresh)[0]
# mCherry_gene comes from an earlier cell
D2_mCherry=np.asarray(subset_matrix_norm[mCherry_gene, D2_cells])

Slc17a7_expr_thresh=Slc17a7_expr.copy()
Slc17a7_expr_thresh[Slc17a7_expr<thresh]=0
Slc17a7_cells=np.where(Slc17a7_expr>=thresh)[0]
Slc17a7_mCherry=np.asarray(subset_matrix_norm[mCherry_gene, Slc17a7_cells])

# Bar = mean mCherry per group, error bar = np.std / sqrt(n).
# plt.figure must be called; a bare `plt.figure` does nothing.
plt.figure()
plt.bar([0,1],[np.mean(D2_mCherry),np.mean(Slc17a7_mCherry)])
plt.errorbar([0,1],[np.mean(D2_mCherry),np.mean(Slc17a7_mCherry)],[np.std(D2_mCherry)/np.sqrt(len(D2_mCherry)),np.std(Slc17a7_mCherry)/np.sqrt(len(Slc17a7_mCherry))],marker='.',linestyle='none')
plt.show()

In [None]:
# 'Drd1' and 'Adora2a' expression within the mCherry-positive cells
# (mCherry_cells is defined by an earlier cell).
D1_gene=np.where(subset_matrix.feature_names == 'Drd1')[0]
D2_gene=np.where(subset_matrix.feature_names == 'Adora2a')[0]

mCherry_D1=np.asarray(subset_matrix_norm[D1_gene, mCherry_cells])
mCherry_D2=np.asarray(subset_matrix_norm[D2_gene, mCherry_cells])

# Bar = mean expression, error bar = np.std / sqrt(n).
# plt.figure must be called; a bare `plt.figure` does nothing.
plt.figure()
plt.bar([0,1],[np.mean(mCherry_D1),np.mean(mCherry_D2)])
plt.errorbar([0,1],[np.mean(mCherry_D1),np.mean(mCherry_D2)],[np.std(mCherry_D1)/np.sqrt(len(mCherry_D1)),np.std(mCherry_D2)/np.sqrt(len(mCherry_D2))],marker='.',linestyle='none')
plt.show()

In [None]:
# Histogram of (Drd1 - Adora2a) expression within the TEV-positive cells.
# D1_gene, D2_gene and TEV_cells are defined by earlier cells.
TEV_D1=np.asarray(subset_matrix_norm[D1_gene, TEV_cells])
# negated so that the sum below is D1 minus D2
TEV_D2=-np.asarray(subset_matrix_norm[D2_gene,TEV_cells])

#TEV_D2_D1=np.concatenate((TEV_D1, TEV_D2),axis=0)
TEV_D2_D1=TEV_D1+TEV_D2

#print(np.mean(TEV_D1))
#print(np.mean(TEV_D2))
print(np.mean(TEV_D2_D1))

plt.hist(TEV_D2_D1, bins=9)
# NOTE(review): the normalization cell assigns the raw counts to
# subset_matrix_norm (its log2 transform is commented out), so the
# 'log2 UMI' unit in this label may be stale -- confirm.
plt.xlabel('D2>0>D1 expression (log2 UMI)')
plt.ylabel('Frequency')
plt.title('TEV+ cells')
plt.show()

print(np.shape(TEV_D2_D1))

In [None]:
# Histogram of (Drd1 - Adora2a) expression over all cells, plus t-SNE maps of
# each gene. D1_gene and D2_gene are defined by an earlier cell.
D1_expr=np.asarray(subset_matrix_norm[D1_gene, :][0])
# D2_expr holds the NEGATED Adora2a expression from here on
D2_expr=-np.asarray(subset_matrix_norm[D2_gene, :][0])

all_D2_D1=(D1_expr+D2_expr)
# number of cells with D1 > 1, and with (un-negated) D2 > 1
print(len(np.where(D1_expr>1)[0]))
print(len(np.where(D2_expr<-1)[0]))

plt.hist(all_D2_D1,bins=16)
# NOTE(review): the normalization cell assigns the raw counts to
# subset_matrix_norm (its log2 transform is commented out), so the
# 'log2 UMI' unit in this label may be stale -- confirm.
plt.xlabel('D2>0>D1 expression (log2 UMI)')
plt.ylabel('Frequency')
plt.title('all cells')
plt.show()

# plot D1 expression in TSNE space
plt.figure(figsize=(5,5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=D1_expr, linewidths=0, s=5,cmap=plt.cm.Reds)
plt.title('Graph-Based Clustering')
#plt.savefig('Habenula_Calb1.eps', format='eps', dpi=1000)

# plot D2 expression in TSNE space (sign flipped back to positive)
plt.figure(figsize=(5,5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=-D2_expr, linewidths=0, s=5,cmap=plt.cm.Reds)
plt.title('Graph-Based Clustering')
#plt.savefig('Habenula_Calb1.eps', format='eps', dpi=1000)


In [None]:
# Re-derive the mCherry expression vector from the matrix
is_mCherry_feature = subset_matrix.feature_names == 'mCherry'
mCherry_gene = np.flatnonzero(is_mCherry_feature)
mCherry_rows = np.asarray(subset_matrix_norm[mCherry_gene, :])
mCherry_expr = mCherry_rows[0]

# Scatter every cell in t-SNE space, coloured by mCherry expression
plt.figure(figsize=(5, 5))
plt.scatter(
    tsne['TSNE-1'],
    tsne['TSNE-2'],
    c=mCherry_expr,
    linewidths=0,
    s=5,
    cmap=plt.cm.Reds,
)
plt.title('Graph-Based Clustering')
plt.savefig(savefolder + project + '_mCherry.eps', format='eps', dpi=1000)

In [None]:
# Violin plots of marker genes per cluster: one x position per entry of
# clust_order, one vertical band (offset y*10) per entry of gene_list.
# Only the LAST assignment of clust_order / gene_list below takes effect;
# the earlier ones are kept as a record of other datasets.
#clust_order=np.array([2,3,4,6,7,9,10,1,5,8]) #file 1
#clust_order=np.array([3,4,5,10,8,7,9,1,2,6]) #file 2
#clust_order=np.array([2,3,5,7,8,9,1,4,6]) #file 3
#clust_order=np.array([1,6,5,8,9,7,2,3,4]) #file 4 DAL
#clust_order=np.array([1,6,5,8,9,7,2,3,4]) #file 5 wt
clust_order=np.array([3,4,5,7,8,10,9,1,2,6]) #Thy1L Aggr
# NOTE(review): this list of 18 ids contains 2 twice and never 3 -- confirm
# whether one of the 2s should be 3.
clust_order=np.array([2,6,5,16,17,10,11,12,14,1,2,4,8,9,13,7,15,18])

#gene_list=np.array(["Opalin","Gfap","Lyz2","Ccl5","Cldn5","Dcn","Olig1","Scn4b","Tac1","Slc17a7"]) # files 1-2
#gene_list=np.array(["Opalin","Gfap","Lyz2","Cldn5","Dcn","Olig1","Scn4b","Tac1","Slc17a7"]) # file 3
#gene_list=np.array(["Opalin","Gfap","Lyz2","Cldn5","Dcn","Ccl5","Scn4b","Tac1","Slc17a7"]) # file 4
gene_list=np.array(["Opalin","Gfap","Lyz2","Cldn5","Dcn","Ccl5","Pdgfra","Scn4b","Tac1","Slc17a7"]) # file 4
gene_list=np.array(["Opalin","Lyz2","Cldn10","Ccl5","Pdgfra","Bcas1","Sox9","Slco1a4","Ccl11","Scn4b","Foxp2","Adora2a","Penk","Calb1","Npy","Slc17a7","Foxp1","Pcp4"])

for x in range(0,len(clust_order)):
    # positional indices of the cells in this cluster
    curr_clust=np.where(clusters['Cluster']==clust_order[x])[0]
    for y in range(0,len(gene_list)): 
        gene_index = np.where(subset_matrix.feature_names == gene_list[y])[0]
        currexpr = subset_matrix_norm[gene_index,:]
        # row 0 = first feature matching the name, restricted to this cluster
        currexpr = currexpr[0,curr_clust]
        # shift each gene by 10 so the genes stack vertically on one axis
        plt.violinplot(currexpr+y*10,[x],showextrema=False)
        
plt.savefig(savefolder+project+'_violinplots.eps', format='eps', dpi=1000)

In [None]:
# Violin plots of the three reporter features per cluster, using the cluster
# order set by the preceding violin-plot cell.
gene_list = np.array(["TEV", "tTA", "mCherry"])

for x, cluster_id in enumerate(clust_order):
    curr_clust = np.flatnonzero(clusters['Cluster'] == cluster_id)
    for y, gene_name in enumerate(gene_list):
        gene_index = np.flatnonzero(subset_matrix.feature_names == gene_name)
        # first matching feature row, restricted to the cells of this cluster
        currexpr = subset_matrix_norm[gene_index, :][0, curr_clust]
        # each gene is shifted up by 10 so the genes stack on one axis
        plt.violinplot(currexpr + y * 10, [x], showextrema=False)

plt.savefig(savefolder + project + '_FLARE_violinplots.eps', format='eps', dpi=1000)

In [None]:
# Barcodes of the mCherry-positive cells, written as a one-column CSV
mCherry_barcodes = neuron_barcodes[mCherry_cells]

analysis_dir = "/media/storage/ckk/genomics/data/{0}/reanalysis/{1}".format(folder, project)

csvsavefile = "{0}/Barcodes/{1}_QC_mCherry_barcodes.csv".format(analysis_dir, dataset)
with open(csvsavefile, 'w', newline='', encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerow(["Barcode"])
    # barcodes are stored as bytes; decode each one for the text file
    writer.writerows([barcode.decode('UTF-8')] for barcode in mCherry_barcodes)

In [None]:
# Cluster ids treated as neurons; alternatives for other datasets are kept
# commented out for reference.
#neuron_clusters=np.asarray([1,5,8]) # file 1
#neuron_clusters=np.asarray([1,2,6]) # file 2
#neuron_clusters=np.asarray([1,6,8]) # file 3
#neuron_clusters=np.asarray([2,3,4]) # file 4 DA
#neuron_clusters=np.asarray([1,2,6,8]) # file 5 WT
#neuron_clusters=np.asarray([1,2,5]) # file 6 WT
#neuron_clusters=np.asarray([1,4]) # DAL Aggr
#neuron_clusters=np.asarray([1,4,5,6]) # DAM Aggr
#neuron_clusters=np.asarray([1,2,6]) # Thy1L Aggr
neuron_clusters = np.asarray([1, 3, 5, 7]) # Thy1M Aggr

# Boolean mask over cells: True where the cell's cluster is a neuron cluster
neurons = np.isin(clusters['Cluster'], neuron_clusters)
print(np.count_nonzero(neurons))

curr_barcodes = neuron_barcodes[neurons]

analysis_dir = "/media/storage/ckk/genomics/data/{0}/reanalysis/{1}".format(folder, project)

csvsavefile = "{0}/Barcodes/{1}_QC_neurons_barcodes.csv".format(analysis_dir, dataset)
with open(csvsavefile, 'w', newline='', encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerow(["Barcode"])
    # barcodes are stored as bytes; decode each one for the text file
    writer.writerows([barcode.decode('UTF-8')] for barcode in curr_barcodes)

In [None]:
# Cells expressing TEV or tTA at or above the threshold, restricted to the
# neuron clusters, plotted in t-SNE space and exported as barcodes.
gene_list=np.array(["TEV","tTA"])
thresh=1.5

FLARE_cells=np.array([],dtype=int)
for x in range(0,len(gene_list)):
    gene=np.where(subset_matrix.feature_names == gene_list[x])[0]
    gene_expr=np.asarray(subset_matrix_norm[gene, :])[0]

    # plot un-thresholded expression of this gene in TSNE space
    plt.figure(figsize=(5, 5))
    plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=gene_expr, linewidths=0, s=5,cmap=plt.cm.Reds)
    plt.title('Graph-Based Clustering')

    # copy so gene_expr keeps the un-thresholded values
    gene_expr_thresh=gene_expr.copy()
    gene_expr_thresh[gene_expr<thresh]=0
    cells=np.where(gene_expr>=thresh)[0]

    # plot thresholded expression of this gene in TSNE space
    plt.figure(figsize=(5, 5))
    plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=gene_expr_thresh, linewidths=0, s=5,cmap=plt.cm.Reds,vmin=0,vmax=thresh)
    plt.title('Graph-Based Clustering')

    FLARE_cells=np.append(FLARE_cells,np.asarray(cells)).astype(int)

# union of the positive cells of all genes, then keep those in neuron clusters
FLARE_cells=np.unique(FLARE_cells)
FLARE_neurons=FLARE_cells[np.where(np.isin(clusters['Cluster'].iloc[FLARE_cells],neuron_clusters))]

# All cells (small markers) with the selected cells overlaid (larger markers).
# No `c` is given, so no cmap is passed: matplotlib ignores it with a warning.
plt.figure(figsize=(5, 5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], linewidths=0, s=2)
plt.scatter(tsne['TSNE-1'].iloc[FLARE_neurons], tsne['TSNE-2'].iloc[FLARE_neurons], linewidths=0, s=5)
plt.title('Graph-Based Clustering')

FLARE_barcodes=neuron_barcodes[FLARE_neurons]

# Use the configured `folder` like the other export cells instead of a
# hard-coded data folder name.
analysis_dir = "/media/storage/ckk/genomics/data/"+folder+"/reanalysis/"+project

csvsavefile = analysis_dir+"/Barcodes/"+dataset+'_QC_neurons_FLARE_barcodes.csv'
with open(csvsavefile, 'w', newline='', encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerow(["Barcode"])
    for x in range(0,len(FLARE_barcodes)):
        writer.writerow([FLARE_barcodes[x].decode('UTF-8')])

In [None]:
# Cells expressing mCherry at or above the threshold, restricted to the
# neuron clusters, plotted in t-SNE space and exported as barcodes.
# NOTE: this cell rebinds mCherry_cells and mCherry_barcodes, which earlier
# cells also define; re-running those cells afterwards uses the new values.
gene_list=np.array(["mCherry"])
thresh=1.5

mCherry_cells=np.array([],dtype=int)
for x in range(0,len(gene_list)):
    gene=np.where(subset_matrix.feature_names == gene_list[x])[0]
    gene_expr=np.asarray(subset_matrix_norm[gene, :])[0]

    # plot un-thresholded expression of this gene in TSNE space
    plt.figure(figsize=(5, 5))
    plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=gene_expr, linewidths=0, s=5,cmap=plt.cm.Reds)
    plt.title('Graph-Based Clustering')

    # copy so gene_expr keeps the un-thresholded values
    gene_expr_thresh=gene_expr.copy()
    gene_expr_thresh[gene_expr<thresh]=0
    cells=np.where(gene_expr>=thresh)[0]

    # plot thresholded expression of this gene in TSNE space
    plt.figure(figsize=(5, 5))
    plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], c=gene_expr_thresh, linewidths=0, s=5,cmap=plt.cm.Reds,vmin=0,vmax=thresh)
    plt.title('Graph-Based Clustering')

    mCherry_cells=np.append(mCherry_cells,np.asarray(cells)).astype(int)

# unique positive cells, then keep those in neuron clusters
mCherry_cells=np.unique(mCherry_cells)
mCherry_neurons=mCherry_cells[np.where(np.isin(clusters['Cluster'].iloc[mCherry_cells],neuron_clusters))]

# All cells (small markers) with the selected cells overlaid (larger markers).
# No `c` is given, so no cmap is passed: matplotlib ignores it with a warning.
plt.figure(figsize=(5, 5))
plt.scatter(tsne['TSNE-1'], tsne['TSNE-2'], linewidths=0, s=2)
plt.scatter(tsne['TSNE-1'].iloc[mCherry_neurons], tsne['TSNE-2'].iloc[mCherry_neurons], linewidths=0, s=5)
plt.title('Graph-Based Clustering')

mCherry_barcodes=neuron_barcodes[mCherry_neurons]

# Use the configured `folder` like the other export cells instead of a
# hard-coded data folder name.
analysis_dir = "/media/storage/ckk/genomics/data/"+folder+"/reanalysis/"+project

csvsavefile = analysis_dir+"/Barcodes/"+dataset+'_QC_neurons_mCherry_barcodes.csv'
with open(csvsavefile, 'w', newline='', encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerow(["Barcode"])
    for x in range(0,len(mCherry_barcodes)):
        writer.writerow([mCherry_barcodes[x].decode('UTF-8')])