/
proc_coral_samap.py
1207 lines (1019 loc) · 55.1 KB
/
proc_coral_samap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import commp as cp
import numpy as np
import pandas as pd
import scipy as sp
from itertools import groupby
from scipy.spatial import distance
#import matplotlib.pyplot as plt
try:
import scanpy as sc
from samap.mapping import SAMAP, prepare_SAMap_loadings
from samap.analysis import (get_mapping_scores, GenePairFinder, sankey_plot)
from samap.utils import save_samap, load_samap
from samalg import SAM
from bokeh.plotting import show
import holoviews as hv
except ImportError:
cp._info('ignore absent libraries')
class dotdict(dict):
"""dot.notation access to dictionary attributes"""
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
_cscheme_seurat_dimplot=['#f8766d', '#7cae00', '#00bfc4', '#c77cff', '#e68613', '#0cb702', '#00b8e7', '#ed68ed', '#cd9600', '#00be67', '#00a9ff', '#ff61cc', '#aba300', '#00c19a', '#8494ff', '#ff68a1']
############################################################################################
# fucntions for gene annotations --------------------------------------------------
############################################################################################
# extract gene name, taxon ID from curl download strings
# - adig prost annotations human and drosophila
# $ awk '{printf "curl \"https://rest.uniprot.org/uniprotkb/search?query=%s&fields=gene_names,organism_id\"\necho\n",$1}' 03.uniprot_ID.tsv > batch_curl_all.sh
# $ split -l 140000 batch_curl_all.sh sub.dl. --additional-suffix=.sh
# $ cat *.out > append_info.json.txt
def append_gn_tax(args):
assert len(args)==2, 'Usage: python proc_coral_samap.py append_gn_tax append_info.json.txt append_info.uid_tax_gn.tsv'
import json
from tqdm import tqdm
infile = args[0]
outfile = args[1]
outlist = []
jsonline=''
fp = open(infile,'r')
content = fp.read()
for line in tqdm(content.split('{"results":'), desc='Processing'):
if line=='': continue
jsonline = '{"results":'+line
#print('New json: '+jsonline)
jo = json.loads(jsonline)
try:
uid = jo['results'][0]['primaryAccession']
tax = jo['results'][0]['organism'].get('taxonId', '_NA_')
gid = jo['results'][0].get('genes', '_NA_')
gn = gid[0].get('geneName', '_NA_') if gid!='_NA_' else '_NA_'
gnv = gn.get('value', '_NA_') if gn!='_NA_' else '_NA_'
except Exception as e:
print(e)
print(jsonline)
outlist.append('%s\t%s\t%s' % (uid, tax, gnv))
jsonline=''
fp.close()
with open(outfile, 'w') as fout:
fout.write('%s\n' % '\n'.join(outlist))
cp._info('save to %s' % outfile)
# extract human, drome, top1 from *allhits.cell.uid.pn.gd.gn.tax.tn.dis.tsv
def ann_merge(args):
assert len(args)==2, 'Usage: python proc_coral_samap.py ann_merge in.allhits.cell.uid.pn.gd.gn.tax.tn.dis.tsv outfile'
infile = args[0]
outfile = args[1]
# load data
df = pd.read_csv(infile, sep='\t', names=['Query_gene_ID','Uniprot_ID','Protein_name','Gene_name','Taxon_ID','Taxon_name','PROST_distance'])
# filter _NA_ gene names
df=df.loc[df['Gene_name']!='_NA_']
f=lambda x: ','.join(list(x))
# extract human hits
dh = df.loc[df['Taxon_ID']== 9606]
dh = dh.groupby('Query_gene_ID')['Gene_name'].apply(f).reset_index()
dh.set_index('Query_gene_ID', inplace=True)
dh.rename(columns={'Gene_name': 'PROST_gene_name(Human)'}, inplace=True)
# extract drome hits
dd = df.loc[df['Taxon_ID']== 7227]
dd = dd.groupby('Query_gene_ID')['Gene_name'].apply(f).reset_index()
dd.set_index('Query_gene_ID', inplace=True)
dd.rename(columns={'Gene_name': 'PROST_gene_name(Drome)'}, inplace=True)
# extract gene name, tax name from the top1 hit
#da = df.groupby('Query_gene_ID').apply(lambda x: sorted(list(zip(x['Uniprot_ID'], x['Protein_name'], x['Gene_name'], x['Taxon_ID'], x['Taxon_name'], x['PROST_distance'])), key=lambda tup: tup[5])[0])
da = df.groupby('Query_gene_ID').apply(lambda x: sorted(list(zip(x['Query_gene_ID'], x['Uniprot_ID'], x['Protein_name'], x['Gene_name'], x['Taxon_ID'], x['Taxon_name'], x['PROST_distance'])), key=lambda tup: tup[6])[0])
da = da.reset_index(name='PROST_gene_name(Top1)')
# output top1 flat tsv
dt = pd.DataFrame(da['PROST_gene_name(Top1)'].tolist(), columns=['Query_gene_ID', 'Uniprot_ID','Protein_name','Gene_name','Taxon_ID','Taxon_name','PROST_distance'])
dt.set_index('Query_gene_ID', inplace=True)
dt.to_csv('out.top1.tsv', sep='\t', index=True, header=True)
cp._info('save top1 df to out.top1.tsv')
da['PROST_gene_name(Top1)'] = da['PROST_gene_name(Top1)'].apply(lambda x: '%s, %s' % (x[3], x[5]))
da.set_index('Query_gene_ID', inplace=True)
# merge dh,dd,da
outdf = da.merge(dh, how='left', left_index=True, right_index=True).merge(dd, how='left', left_index=True, right_index=True)
outdf.fillna('_NA_', inplace=True)
outdf.to_csv(outfile, sep='\t', index=True, header=True)
cp._info('save merged df to %s' % outfile)
############################################################################################
# fucntions for homology graph comparison --------------------------------------------------
############################################################################################
# tb: tuples from blast [(ad1, sp1), ...,]
# tp: tuples from prost [(ad1, sp1), ...,]
def _hits_comparison(tb, tp):
# get full set of protein ids
bids = set([t[0] for t in tb])
pids = set([t[0] for t in tp])
#all_ids = list(bids.union(pids))
all_ids = list(bids.intersection(pids))
all_ids.sort()
# get [id: set(), ... ]
# adig-s0001.g2:
# {'Hvul_g19544_1', 'Hvul_g24627_1', 'Hvul_g23353_1', 'Hvul_g15694_1', 'Hvul_1_1', 'Hvul_g10035_1', 'Hv
# ul_g15458_1', 'Hvul_g24294_1', 'Hvul_g28163_1', 'Hvul_g30764_1', 'Hvul_g30759_1', 'Hvul_g7895_1', 'Hv
# ul_2_1', 'Hvul_g11044_1', 'Hvul_g27920_1', 'Hvul_g28153_1', 'Hvul_g15015_1', 'Hvul_g24626_1', 'Hvul_g
# 30756_1', 'Hvul_g24628_1', 'Hvul_g10034_1', 'Hvul_g22665_1'}
dd = dict((x[0], set([i[1] for i in x[1]])) for x in groupby(tb, lambda t: t[0]))
dp = dict((x[0], set([i[1] for i in x[1]])) for x in groupby(tp, lambda t: t[0]))
# iterate all ids
return [(i,cp.jaccard(dd.get(i, set([])), dp.get(i,set([])))) for i in all_ids]
# 1. blast map
# 2. prost map
def maps_comparison(args):
assert len(args)==3, 'Usage: python maps_comparsion map.blast.txt map.prost.txt outfile'
tb = [(t[0],t[1]) for t in cp.loadtuples(args[0], delimiter='\t')]
tp = [(t[0],t[1]) for t in cp.loadtuples(args[1], delimiter='\t')]
with open(args[2], 'w') as fp:
fp.write('\n'.join(['%s %.8f' % (t[0], t[1]) for t in _hits_comparison(tb, tp)]))
cp._info('save jaccard distances to %s' % args[2])
############################################################################################
# apc functions ---------------------------------------------------------------
############################################################################################
#$ python proc_coral_samap.py ut_gene_corr _corr.txt
def ut_gene_corr(args):
assert len(args) == 1, 'Usage: python proc_coral_samap.py ut_gene_corr t.txt'
out=_proc_gene_corr_apc(args[0])
print(out)
# input: gene correlation flat file
# output: np.array: g1, g2, corr_value, apc_value
# outflat = pc._proc_gene_corr_apc('_corr.txt')
# import matplotlib.pyplot as plt
# plt.scatter(outflat[:,2].astype(float),outflat[:,3].astype(float)
# plt.show()
# no sign of apc
def _proc_gene_corr_apc(corrflat):
# load flat data
flatdata = np.genfromtxt(corrflat, delimiter=' ', dtype='str')
r = flatdata[:,0]
c = flatdata[:,1]
v = flatdata[:,2].astype(float)
# calculate apc in matrix form
ur, iur = np.unique(r, return_inverse=True) # ur: unique row label; iur: replace each r element with their unique label index
uc, iuc = np.unique(c, return_inverse=True) # ur: unique row label; iur: replace each r element with their unique label index
import scipy as sp
outmat = sp.sparse.coo_matrix((v, (iur, iuc)), shape=(len(ur), len(uc))).tocsr()
apcmat = _apc_rc(outmat.A)
# output flat format for visualization
#apcflat = np.array(['%.8f %s %s' % (apcmat[i[0], i[1]],ur[i[0]], uc[i[1]]) for i in zip(iur, iuc)])
apcflat = np.array([apcmat[i[0], i[1]] for i in zip(iur, iuc)])
outflat = np.hstack((flatdata, apcflat[:,None]))
return outflat
# bipartite apc procedure
# apc_rc = ((row_sum/col_n) outer (col_sum/row_n))/(total_sum/(col_n*row_n))
# sm (score matrix): numpy array score matrix; (pandas.to_numpy())
# r row by c column score matrix
# import matplotlib.pyplot as plt;plt.figure(figsize=(8,6));plt.scatter(sm.flatten(), apc.flatten());plt.show()
def _apc_rc(sm):
r,c = sm.shape
'''
row_mean = sm.sum(axis=1) / c # individual row mean
col_mean = sm.sum(axis=0) / r # individual column mean
apc = np.outer(row_mean, col_mean) / (sm.sum() / (r*c))
'''
return np.outer(sm.sum(axis=1)/c, sm.sum(axis=0)/r) / (sm.sum()/(r*c))
# symmetrical matrix apc
# apc_ij = mean_i * mean_j / mean_overall
# apc = (col_sum / (row_n-1)) outer (col_sum / (row_n-1)) / (total_sum / ((col_n^2-col_n))
# input: sm (score matrix): numpy array score matrix
# validation:
# awk '{print $5,$6,$7,$13,$7-$13}' PF01245-6TPQ_i-r2m2i2-mizp3-dcazp3-mipzp3-dcapzp3-dist.vec19 > PF01245-6TPQ_i.i2.mi.mip.apc.txt
# python utils_ce.py cflat2ccmat PF01245-6TPQ_i.i2.mi.mip.apc.txt 0,1 2 PF01245
# sm = = np.loadtxt('PF01245.ccmat')
# apc = _apc_sym(sm)
# $ grep "0.240928" PF012*.txt
# 1 2 0.571829 0.330901 0.240928
# apc values agree with APC column in PF01245-6TPQ_i.i2.mi.mip.apc.txt
# import matplotlib.pyplot as plt;plt.figure(figsize=(8,6));plt.scatter(mi, apc);plt.show()
def _apc_sym(sm):
return np.outer(sm.sum(axis=0)/(sm.shape[0]-1), sm.sum(axis=0)/(sm.shape[0]-1)) / (sm.sum()/(np.square(sm.shape[0])-sm.shape[0]))
############################################################################################
# h5ad assembling functions ---------------------------------------------------------------
############################################################################################
# generate sam h5ad file from expression counts
def _samh5ad(h5file,outfile):
sam = SAM()
sam.load_data(h5file)
sam.preprocess_data(
sum_norm="cell_median",
norm="log",
thresh_low=0.0,
thresh_high=0.96,
min_expression=1,
)
sam.run(
preprocessing="StandardScaler",
npcs=100,
weight_PCs=False,
k=20,
n_genes=3000,
weight_mode='rms'
)
sam.leiden_clustering(res=3)
if "PCs_SAMap" not in sam.adata.varm.keys():
prepare_SAMap_loadings(sam)
sam.save_anndata(outfile)
cp._info('save sam h5ad to %s' % outfile)
# slice adata obj by labels of cluster_name in obs
# all columns in var will be removed
# h5: input anndata object
# target_obs_column:'Jake'
# labels = ['Jake_10', 'Jake_17', 'Jake_20', 'Jake_22', 'Jake_27', 'Jake_46', 'Jake_54']; corresponds to xe_gastrodermis cell type family
# return: new anndata object contains expression counts for selected cells
def _slice_by_cluster(h5, target_obs_column, labels, obs_droplist=None):
c = h5.obs[target_obs_column]
filters = c[c.isin(labels)].index
sub_h5 = h5[filters]
obs = sub_h5.obs.drop(columns=obs_droplist) if obs_droplist is not None else sub_h5.obs
var = sub_h5.var.drop(columns=sub_h5.var.columns) if sub_h5.var.columns!=0 else sub_h5.var
return sc.AnnData(X=sub_h5.X, obs=obs, var=var)
# slice data by randomly sample n cells
# n = uniformly sample without replacement (no repeat)
def _slice_by_random(h5, n_cell, obs_droplist):
filters = np.random.sample(list(range(n_cell)))
sub_h5 = h5[filters]
return sc.AnnData(X=sub_h5.X, obs=sub_h5.obs.drop(columns=obs_droplist), var=sub_h5.var.drop(columns=sub_h5.var.columns))
# append clustering assignments to an h5ad file
# nan will be renamed by unclustered_label
def _append_assignments(h5, clusterfile, cluster_colnames=['cell_type'], index_name='index', unclustered_label='unclustered', delimiter='\t'):
cluster_assignments = pd.read_csv(clusterfile, header=None, sep=delimiter, index_col=0, names=cluster_colnames)
cluster_assignments.index.name = index_name
h5.obs=h5.obs.merge(cluster_assignments, how='left', left_index=True, right_index=True)
for c in cluster_colnames:
h5.obs[c].fillna(unclustered_label, inplace=True)
# convert RDS converted .mtx {rowname,colname}.tsv files into an h5ad object
# clusterfile: .tsv file without header line
# the h5 and cluster info have the same index name
def _mtx2h5ad(mtxfile, cellname_file, genename_file, index_name='index'):
import scanpy as sc
# generating h5ad object
h5 = sc.read(mtxfile).transpose()
cellnames = pd.read_csv(cellname_file, header=None, sep='\t', index_col=0)
cellnames.index.name = index_name
genenames = pd.read_csv(genename_file, header=None, sep='\t', index_col=0)
genenames.index.name = index_name
h5.obs = cellnames
h5.var = genenames
#h5.write_h5ad(adig.counts.h5ad.h5ad')
return h5
# generate h5ad file from mtx file for coral project
# python proc_coral_samap.py h5gen_mtx_cluster adig.counts.mtx adig.colnames.txt adig.rownames.txt ad_cluster_jake.tsv jake adig.counts.c_jake.h5ad
# >>> from proc_coral_samap import _mtx2h5ad, _append_assignments
# >>> h = _mtx2h5ad('adig.counts.mtx', 'adig.colnames.txt', 'adig.rownames.txt')
# >>> _append_assignments(h, 'ad_cluster_jake.tsv', ['jake'])
# h5.write_h5ad('adig.counts.c_jake.h5ad'
def h5gen_mtx_cluster(args):
assert len(args) == 6, 'Usage: python proc_coral_samap h5adgen mtxfile cellname_file genename_file cluster_file cluster_id outfile'
h5 = _mtx2h5ad(args[0], args[1], args[2])
_append_assignments(h5, args[3], [args[4]])
h5.write_h5ad(args[5])
cp._info('save to %s' % args[5])
############################################################################################
# alignment score functions --------------------------------------------------------------
############################################################################################
# For generate heatmap
# k=len(np.unique(s.sams['ad'].adata.obs['leiden_clusters']))
# $ ls *prost*family*|awk 'BEGIN {printf "python proc_coral_samap.py combine_scorefiles \""} {printf "%s ", $1} END{printf "\" k outfile"}'
# $ ls *blast*family*|awk 'BEGIN {printf "python proc_coral_samap.py combine_scorefiles \""} {printf "%s ", $1} END{printf "\" k outfile"}'
# outfile: tsv file of combined alignment scores with column ordered by cell type (family) labels
# visualize using the following R code
'''
library(pheatmap)
infile="outfile"
h=pheatmap(t(read.table(infile, row.names=1, header=TRUE, sep='\t')),cluster_rows=F)
source("save_pheatmap.R")
save_pheatmap(h, paste(infile,".png",sep=""), width=12,height=8)
'''
# separator indicate the number of Jake's clusters; all data it is 72; gastrodermis cells: 9 (['Jake_10', 'Jake_17', 'Jake_20', 'Jake_22', 'Jake_27', 'Jake_46', 'Jake_54', | 'Jake_64', 'Jake_25'])
def combine_scorefiles(args):
assert len(args)==3, 'Usage: python proc_coral_samap.py combine_scores_tsv "list of .tsv files" separator{72} outfile'
tsvfiles = [fn.strip() for fn in args[0].split(' ') if fn!='']
cp._info('%s files loaded.' % len(tsvfiles))
s=int(args[1])
outfile = args[2]
#scores = pd.concat([pd.read_csv(fn, sep='\t', index_col=0).iloc[:72,72:] for fn in tsvfiles], axis=1)
scores = pd.concat([pd.read_csv(fn, sep='\t', index_col=0).iloc[:s,s:] for fn in tsvfiles], axis=1)
c=list(scores.columns)
c.sort(key=lambda n: n.split('_')[1:]) # sort by call type families
scores[c].to_csv(outfile, sep='\t', index=True, header=True)
cp._info('save all scores %s to %s' % (scores[c].shape, outfile))
# python proc_coral_samap.py reorder_mat 04.sl2all.alignment_scores.blast.cell_type_sele_cell_type_family.tsv heatmap.x.sci.stub na out
# reorder the combined alignment scores to generate heatmap
# pheatmap(t(read.table(infile, row.names=1, header=TRUE, sep='\t')), cluster_rows=F, cluster_cols=F)
def reorder_mat(args):
assert len(args)==4, 'Usage: python proc_coral_samap.py reorder_mat mat_with_label.tsv x_order.stub y_order.stub outfile'
# row: x-axis; column: y-axis in the heatmap
scores = pd.read_csv(args[0], sep='\t', index_col=0, header=0)
x_labels = cp.loadlines(args[1]) if args[1]!= 'na' else list(scores.index)
y_labels = cp.loadlines(args[2]) if args[2]!= 'na' else list(scores.columns)
scores_ro = scores.reindex(index=x_labels, columns=y_labels)
scores_ro.to_csv(args[3], sep='\t', index=True, header=True)
cp._info('save re-ordered scores to %s' % args[3])
############################################################################################
# samap general pipeline functions -------------------------------------------------------
############################################################################################
# call samap main procedure
# sn1,sn2: species names
# sf1,sf2: sam object file names .h5ad (with pre-calculated clustering assignments)
# map_p: homolog graph path
# resolutions: leiden clustering resolution for each species
# assignments: = {"bf":"cluster", "mm":"tissue_celltype"}
# _run_samap_with_h5('ad', '00.adig.counts.Jake_xe_gastodermis.h5ad', 'xe', '00.xesp.counts.Jake_xe_gastodermis.h5ad', samobj=False, map_p='maps.blast/', assignments=assignments)
def _run_samap_with_h5(sn1, fn1, sn2, fn2, samobj=False, map_p='maps/', gnnm=None, resolutions=None, assignments=None, res_dk=1.0, gnnm_transform=True, umap=False):
from samap.mapping import SAMAP
from samap.analysis import (get_mapping_scores, GenePairFinder, sankey_plot)
from samap.utils import save_samap, load_samap
from samalg import SAM
import pandas as pd
if samobj:
cp._info('loading SAM objects.')
sam1 = SAM()
sam1.load_data(fn1)
sam2 = SAM()
sam2.load_data(fn2)
sams = {sn1: sam1, sn2: sam2}
else:
cp._info('loading from non-SAM h5 files')
sams = {sn1:fn1, sn2:fn2}
cp._info('running SAMap %s%s...' % (sn1,sn2))
# keys = {"bf":"cluster", "mm":"tissue_celltype"}
#sm = SAMAP(sams, f_maps=map_p, resolutions=resolutions, keys=assignments)
sm = SAMAP(sams, f_maps=map_p, gnnm=gnnm, resolutions=resolutions, keys=assignments, res_dk=res_dk, gnnm_transform=gnnm_transform)
# neigh_from_keys = {'bf':True, 'mm':True}
#neigh_from_keys = dict((k,True) for k in assignments.keys()) if assignments!=None else None
if assignments!=None:
neigh_from_keys={}
for k in assignments:
neigh_from_keys[k] = False if assignments[k]=='leiden_clusters' else True
print(k, neigh_from_keys[k])
else:
neigh_from_keys=None
#sm.run(pairwise=True, neigh_from_keys=neigh_from_keys, umap=False)
sm.run(pairwise=True, neigh_from_keys=neigh_from_keys, umap=umap) # umap is false for debugging
#save_samap(sm, 'ad_nv.samap.pkl')
cp._info('SAMap %s%s done.' %(sn1,sn2))
return sm
# iterate resolution parameters for leiden_clusters
# comparison between ad_['Jake_10', 'Jake_17', 'Jake_20', 'Jake_22', 'Jake_27', 'Jake_46', 'Jake_54'] and xe_gastrodermis
# seq = np.arange(1,20,4)/10
# for s in seq: _iter_res(s)
def _iter_res(c1='leiden_clusters', c2='leiden_clusters', res1=3, res2=3, output='png'):
cp._info('resolution iteration: ad: %s res %.2f, xe: %s res %.2f' % (c1, res1, c2, res2))
assignments={'ad': c1, 'xe': c2}
resolutions={'ad':res1,'xe':res2}
sm = _run_samap_with_h5('ad', '00.adig.counts.Jake_xe_gastodermis.h5ad', 'xe', '00.xesp.counts.Jake_xe_gastodermis.h5ad', samobj=False, map_p='maps.blast/', resolutions=resolutions, assignments=assignments)
D,MappingTable = get_mapping_scores(sm,{'ad':c1,'xe':c2},n_top = 0)
tsvfile = '02.adxe.xe_gastrodermis.leiden.ad_%.2f_xe_%.2f.tsv' % (res1,res2)
#MappingTable.to_csv(tsvfile, sep='\t', index=True, header=True)
#cp._info('save to %s' % tsvfile)
sk = sankey_plot(MappingTable, align_thr=0.00, species_order = ['ad','xe'])
hv.save(sk, tsvfile+'.'+output, fmt=output)
cp._info('save sankey to %s.%s' % (tsvfile, output))
return (tsvfile, MappingTable)
# prost homology fucntions -------------------------------------------------------------
# convert prost distance to similarity score
# input format: sid1\tsid2\tdistance\te-value
# target_column: 0-based column index, used by pd.iloc
#
# $ python proc_coral_samap.py prost2similarity 01.prost.adnt.tsv 2 02.prost.adnt.scores.tsv
# $ python proc_coral_samap.py prost2similarity 01.prost.ntad.tsv 2 02.prost.ntad.scores.tsv
# use awk to format it as SAMap mapping
# $ awk -F '\t' '{printf "%s\t%s\tp\tp\tp\tp\tp\tp\tp\tp\t0.0\t%s\n",$1,$2,$5}' 02.prost.adnt.scores.tsv > ad_to_nt.prost.txt
# $ awk -F '\t' '{printf "%s\t%s\tp\tp\tp\tp\tp\tp\tp\tp\t0.0\t%s\n",$1,$2,$5}' 02.prost.ntad.scores.tsv > nt_to_ad.prost.txt
def prost2similarity(args):
assert len(args) == 3, 'Usage: python proc_coral_samap.py prost2similarity prost.out.tsv target_column ad_to_hy.txt'
infile = args[0]
tc = int(args[1])
outfile = args[2]
df = pd.read_csv(infile, header=None, sep='\t')
# ignore all Nan columns
df = df.dropna(axis=1, how='all')
# rename previous column names
df = df.rename(columns=dict((n,i) for i,n in enumerate(df.columns)))
# append score columns
i = len(df.columns)
# inversed log
df[i+1]= 1/(np.log(df.iloc[:,tc]+1)+1)
# inversed sqrt
df[i+2]= 1/(np.sqrt(df.iloc[:,tc]+1)+1)
df.to_csv(outfile, sep='\t', index=False, header=False)
cp._info('apprend %d: log, %d: sqrt scores to %s' % (i+1, i+2, outfile))
##-------------------------------------------------------------------------------------
# gene_align_*: remove abnormal AA alphabets gaps = ['.','-',' ','*']
# : reformat fasta headers; unique ID + transcription ID
##-------------------------------------------------------------------------------------
def _seq_stat(proteome_file, ab_checklist):
from tqdm import tqdm
clean_flag = False
cp._info('checking for sequences contain: %s' % (' '.join(['[%s]' % c for c in ab_checklist])))
ab_seq = []
i=0
for s in tqdm(cp.fasta_iter(proteome_file)):
for c in ab_checklist:
if c in s[1]:
ab_seq.append('[ab]: %s [header]: %s [seq]: %s' % (c, s[0], s[1]))
i+=1
print('%d / %d ab sequences found' % (len(ab_seq), i))
if len(ab_seq)!=0:
abfile = proteome_file+'.ab.list'
cp._info('save ab list to %s' % abfile)
with open(abfile, 'w') as fout:
fout.write('%s\n' % '\n'.join(ab_seq))
clean_flag = True
return clean_flag
# for adig
# s[0]: adig-s0001.g1.t2
def _ad_parser(slist, arg='adig.proteome.rename.fas'):
_fn_h = lambda sarr: '%s.%s' % (sarr[0], sarr[1])
seqs = [[_fn_h(s[0].split('.')), s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# for new nematostella dataset
# input: cleaned sequences slist from proteome.fas
# output: pd type longest sequences
def _nt_parser(slist, arg='genes.LUT.tsv'):
mapping_df = pd.read_csv(arg, sep='\t', names=['id', 'nv2', 'desc', 'old'])
m = mapping_df.set_index('nv2')['desc'].to_dict()
# NV2.1 == Nv2g000001001 (transcript 1), Nv2g000001002 (transcript 2)
seqs = [[m['NV2.' + str(int(s[0][4:10]))], s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
# find longest one amount sequences with identical transcript id
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# Spongilla_proteome_70AA.fasta
# map from "c100000_g1_i1_m.41809" to "c100000-g1"
def _sl_parser(slist, arg='Spongilla_proteome_70AA.fasta'):
_fn_h = lambda sarr: '%s-%s' % (sarr[0], sarr[1])
seqs = [[_fn_h(s[0].split('_')), s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# sm=load_samap('02.adxe.samap.prost.jake_cell_type_family.pkl')
# sm.sams['xe'].adata.var: xe_Xesp_000001
# _fn_get_var: Xesp_000001.t1 to Xesp_000001
def _xe_parser(slist, arg='xesp.proteome.rename.fas'):
_fn_get_var = lambda sarr: sarr[0]
seqs = [[_fn_get_var(s[0].split('.')), s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# same with xe
# sm = load_samap('02.adhy.samap.prost.jake_cell_type_family.pkl')
# sm.sams['hy'].adata.var: hy_Hvul_g10012_1
# _fn_get_var: Hvul_g10012_1.t1 to hy_Hvul_g10012_1
def _hy_parser(slist, arg='hvul.proteome.rename.fas'):
_fn_get_var = lambda sarr: sarr[0]
seqs = [[_fn_get_var(s[0].split('.')), s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# no need to change
def _sp_parser(slist, arg='spis.proteome.fas'):
seqs = [[s[0], s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# no need to change
def _nv_parser(slist, arg='nvec.proteome.rename.fas'):
seqs = [[s[0], s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# fa header:
# >TriadP8101 pep scaffold:ASM15027v1:scaffold_41:90438:90993:1 gene:TriadG8101 transcript:TriadT8101 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:SYT15 description:Synaptotagmin
# 15 [Source:UniProtKB/TrEMBL;Acc:B3SDG8]
#
# c1 is the same as c10
# TriadP55444 TriadG55444
#
# kjia@DESKTOP-L0MMU09 ~/workspace/samap_general/data/tr/stage.clean 2024-03-12 10:54:21
# $ awk -F '_' '{print $1}' tr_genes.tsv |sort|uniq -c
# 1 DLX
# 1 RPS5
# 1 SECP1
# 1 SYT15
# 1 SYTNTM
# 1 SYTWLL
# 16380 Tadh
# >>> h5.X.shape: (13236, 16386)
# 16380 (Tadh_TriadG51255) + manual convert 6 (convert gene_symbol to gene)
def _tr_parser(slist, arg='Trichoplax_adhaerens.ASM15027v1.pep.all.fa'):
symlist = ['DLX', 'RPS5', 'SECP1', 'SYT15', 'SYTNTM', 'SYTWLL']
def _fn_get_var(h):
sym=''
if 'gene_symbol' in h:
hsub = h.split('gene_symbol:')
sym = hsub[1].split(' ')[0]
return sym if sym in symlist else 'Tadh_TriadG%s' % h.split(' ')[0][6:]
seqs = [[_fn_get_var(s[0]), s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# no need to change
def _pd_parser(slist, arg='pdumv021.xloc.pep'):
seqs = [[s[0], s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
def _default_parser(slist, arg):
cp._info('use default parser.')
seqs = [[s[0], s[0], s[1], len(s[1])] for s in slist]
df = pd.DataFrame(seqs, columns=['output_id','old_id', 'seq', 'length'])
df.to_csv(arg+'.stat.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
idx = df.groupby(['output_id'])['length'].idxmax()
return df.loc[idx]
# process proteome data
# 0. remove abnormal alphabets, save stat to *.ab.list
# 1. map header to andata.obs names
# 2. keep the longest sequence for duplicated transcripts
def process_proteome(args):
assert len(args) == 4, 'Usage: python proc_coral_samap.py process_proteome proteome.fas parser_id gene_name.tsv/na outfile'
_parser_list = {'def':_default_parser, 'nt': _nt_parser, 'ad': _ad_parser, 'sl': _sl_parser, 'xe': _xe_parser, 'hy': _hy_parser, 'sp': _sp_parser, 'nv': _nv_parser, 'tr': _tr_parser, 'pd': _pd_parser}
proteome_file = args[0]
_fn_s_parser=_parser_list[args[1]]
cp._info('Parser %s selected' % _fn_s_parser.__name__)
outfile = args[3]
# get sequence statistics
ab_checklist = set(cp.illab).union(set(cp.gaps))
_seq_stat(proteome_file, ab_checklist)
# clean sequence and find the longest amount transcripts
slist=[]
for h, s in cp.fasta_iter(proteome_file):
ts = s.translate(str.maketrans('','',''.join(ab_checklist)))
slist.append((h,ts))
# processing header and output processed proteome
cp._info('Filtering short transcripts')
df_filtered = _fn_s_parser(slist, proteome_file)
print(df_filtered.shape, df_filtered.iloc[:5])
#df_filtered.to_csv(proteome_file+'.filtered.list', columns=['output_id', 'old_id', 'length'], sep='\t', header=False, index=False)
cp._info('save filtered header mapping to %s' % proteome_file+'.stat.list')
if args[2]!='na':
cp._info('gene sym file: %s provided. checking matches' % args[2])
gene_syms = set(cp.loadlines(args[2]))
fa_syms = set(df_filtered['output_id'])
matches = gene_syms.intersection(fa_syms)
print('\n------- gene name mapping statistics ------')
cp._info('%d matches' % len(matches))
missing = pd.Series([g for g in gene_syms if g not in fa_syms])
cp._info('%d missed' % len(missing))
if len(missing)!=0:
missing.to_csv(args[0]+'.missing.tsv', header=False, index=False)
cp._info('save missed gene sym to %s.missing.tsv' % args[0])
else:
cp._info('skip checking gene sym matches')
# output the converted proteome
print('------------------------------------------\n')
outlist = ['>%s\n%s' % (df_filtered.loc[i]['output_id'], df_filtered.loc[i]['seq']) for i in df_filtered.index]
with open(outfile, 'w') as fout:
fout.write('%s\n' % ('\n'.join(outlist)))
cp._info('save cleaned proteome to %s' % outfile)
# kjia@DESKTOP-L0MMU09 ~/workspace/library/dataset/new.nematostella
# for new nematostella dataset
# proteome
# https://simrbase.stowers.org/files/pub/nematostella/Nvec/genomes/Nvec200/aligned/tcs_v2/20240221/NV2g.20240221.protein.fa
#
# gene name mapping
# proteome header format in NV2g.20240221.protein.fa
# NV2t025884002.1
# NV2 dataset identifier
# t stands for transcript
# 025884 gene number
# 002 transcript number
# .1 version 1
#
# gene name format in GSE200198_alldata.genes.csv
# NVE25884 from NV2t025884002.1
# NVE + strip(025884) = NVE25884
def gene_align_nv2(args):
assert len(args) == 2, 'Usage: python proc_coral_samap.py gene_align_nv2 NV2g.20240221.protein.fa 00.nv2.proteome.fa'
infile = args[0]
outfile = args[1]
outlist = []
for h,s in cp.fasta_iter(infile):
ts = s.translate(str.maketrans('','',''.join(cp.gaps))) # remove '.' at the end of some sequences
# h: NV2t025884002.1
new_h = 'NVE' + str(int(h[4:10])) + '_' + h[-5:]
outlist.append('>%s\n%s' % (new_h, ts))
with open(outfile, 'w') as fout:
fout.write('%s\n' % '\n'.join(outlist))
cp._info('save to %s' % outfile)
# kjia@DESKTOP-L0MMU09 ~/workspace/samap_coral/stage.nv_ad 2023-12-16 17:23:01
# $ awk '{print substr($1,1,5)}' nvec.genes.tsv|sort|uniq -c
# 33973 Nvec_
# change fasta header to align with nvec.genes.txt
# from:">v1g152167" to: ">Nvec_v1g152167"
def gene_align_nvec(args):
assert len(args) == 2, 'Usage: python proc_coral_samap.py gene_align_nvec xenSp1.proteins.fa xesp.proteome.rename.fa'
infile = args[0]
outfile = args[1]
outlist = []
for h,s in cp.fasta_iter(infile):
ts = s.translate(str.maketrans('','',''.join(cp.gaps))) # remove '.' at the end of some sequences
# h = '>v1g152167'
th = 'Nvec_%s' % h
outlist.append('>%s\n%s' % (th,ts))
with open(outfile, 'w') as fout:
fout.write('%s\n' % '\n'.join(outlist))
cp._info('save to %s' % outfile)
# kjia@DESKTOP-L0MMU09 ~/workspace/samap_coral/stage.xe_ad 2023-12-15 15:19:35
# $ awk '{print substr($1,1,4)}' *genes.tsv|sort|uniq -c
# 29015 Xesp
# 1795 orph
# Xesp records have the same length
# $ grep Xesp *genes.tsv|awk '{print length($1)}'|sort |uniq -c
# 29015 12
# kjia@DESKTOP-L0MMU09 ~/workspace/samap_coral/stage.xe_ad 2023-12-15 15:21:08
# $ grep ">" xenSp1.proteins.fa | awk '{print length($1)}'|sort|uniq -c
# 28010 23
# change fasta header to align with adig.rownames.txt
# from:">Xe_029129-T2 Xe_029129" to: "Xesp_029129.t2"
def gene_align_xesp(args):
assert len(args) == 2, 'Usage: python proc_coral_samap.py gene_align_xesp xenSp1.proteins.fa xesp.proteome.rename.fa'
infile = args[0]
outfile = args[1]
import re
outlist = []
for h,s in cp.fasta_iter(infile):
ts = s.translate(str.maketrans('','',''.join(cp.gaps))) # remove '.' at the end of some sequences
# h = '>Xe_029168-T1 Xe_029168'
sarr = re.split(r'[_\-\s]',h) # ['Xe', '029168', 'T1', 'Xe', '029168']
#print(sarr)
th = 'Xesp_%s.%s' % (sarr[1], sarr[2].lower())
outlist.append('>%s\n%s' % (th,ts))
with open(outfile, 'w') as fout:
fout.write('%s\n' % '\n'.join(outlist))
cp._info('save to %s' % outfile)
# change fasta header to align with adig.rownames.txt
# from:">Sc4wPfr_165.g10029.t1" to: "Hvul_g10029_1.t1"
def gene_align_hvul(args):
assert len(args) == 2, 'Usage: python proc_coral_samap.py gene_align_hvul hydra2.0_genemodels.aa hvul.proteome.rename.fasta'
infile = args[0]
outfile = args[1]
outlist = []
for h,s in cp.fasta_iter(infile):
ts = s.translate(str.maketrans('','',''.join(cp.gaps))) # remove '.' at the end of some sequences
#print(">%s\n%s\n" % (h,ts))
sarr0 = h.split('.') # ['>Sc4wPfr_165', 'g10029', 't1']
th = 'Hvul_%s_1.%s' % (sarr0[1], sarr0[2]) # Hvul_g10029_1.t1
outlist.append('>%s\n%s' % (th,ts))
with open(outfile, 'w') as fout:
fout.write('%s\n' % '\n'.join(outlist))
cp._info('save to %s' % outfile)
# change fasta header to align with adig.rownames.txt
# from:">adig_s0001.g1.t2 gene=adig_s0001.g1" to: "adig-s0001.g1.t2"
def gene_align_adig(args):
assert len(args) == 2, 'Usage: python proc_coral_samap.py gene_align_adig adig.pep.fasta output.fastas'
infile = args[0]
outfile = args[1]
outlist = []
for h,s in cp.fasta_iter(infile):
ts = s.translate(str.maketrans('','',''.join(cp.gaps))) # remove '.' at the end of some sequences
#print(">%s\n%s\n" % (h,ts))
sarr0 = h.split(' ') # ['>adig_s0001.g1.t2' 'gene=adig_s0001.g1']
sarr1 = sarr0[0].split('_') # ['adig', 's0001.g1.t2']
th = '%s-%s' % (sarr1[0], sarr1[1]) # adig-s0001.g1.t2
outlist.append('>%s\n%s' % (th,ts))
with open(outfile, 'w') as fout:
fout.write('%s\n' % '\n'.join(outlist))
cp._info('save to %s' % outfile)
# append meta_cell assignments to ../stage2.blast/01.xxxx.counts_pr.cluster_info.h5ad
# change assignment name to have cell type family appended at the end
# slice expression data by labels=['gastrodermis', 'alga-hosting_cells']
# save to h5ad file for samap ppl
def _proc_stage4_prepare_h5(h5file, meta_assignment_file, outfile):
s=sc.read_h5ad(h5file)
_append_assignments(s,meta_assignment_file,cluster_colnames=['metacell'])
# change cluster name
# s.obs['cell_type']=s.obs['cell_type'].astype(str)+' ('+s.obs['cell_type_family'].astype(str)+')'
s.obs['metacell']=s.obs['metacell'].astype(str)+' ('+s.obs['cell_type'].astype(str)+')'
# slice expression data
target = 'cell_type_family'
labels=['gastrodermis', 'alga-hosting_cells']
n=_slice_by_cluster(s, target, labels, ['leiden_clusters'])
# save to h5 file
n.write_h5ad(outfile)
cp._info('save to %s' % outfile)
# append cell type label to metacell labels
def _proc_anno_metacell():
sam = SAM()
sam.load_data('01.hvul.sam.gastodermis.all_info.h5ad')
sam.adata.obs['metacell']=sam.adata.obs['metacell'].astype(str)+' ('+sam.adata.obs['cell_type'].astype(str)+')'
sam.save_anndata('01.hvul.sam.gastodermis.all_info.h5ad-1')
sam = SAM()
sam.load_data('01.nvec.sam.gastodermis.all_info.h5ad')
sam.adata.obs['metacell']=sam.adata.obs['metacell'].astype(str)+' ('+sam.adata.obs['cell_type'].astype(str)+')'
sam.save_anndata('01.nvec.sam.gastodermis.all_info.h5ad-1')
sam = SAM()
sam.load_data('01.spis.sam.gastodermis.all_info.h5ad')
sam.adata.obs['metacell']=sam.adata.obs['metacell'].astype(str)+' ('+sam.adata.obs['cell_type'].astype(str)+')'
sam.save_anndata('01.spis.sam.gastodermis.all_info.h5ad-1')
sam = SAM()
sam.load_data('01.xesp.sam.gastodermis.all_info.h5ad')
sam.adata.obs['metacell']=sam.adata.obs['metacell'].astype(str)+' ('+sam.adata.obs['cell_type'].astype(str)+')'
sam.save_anndata('01.xesp.sam.gastodermis.all_info.h5ad-1')
# prepend sid to all the clusters
def _proc_prepend_sid(samap_file):
sm = load_samap(samap_file)
for sid in sm.sams.keys():
for col in sm.sams[sid].adata.obs:
sm.sams[sid].adata.obs[col]= sid +'_'+ sm.sams[sid].adata.obs[col].astype(str)
return sm
# append cell_type annotation to leiden clusters
# target_label: 'leiden_clusters'
# anno_label: 'cell_type'
def _proc_append_anno(samap_file, target_label, anno_label):
# 02.adhy.samap.blast.gastrodermis.leiden.pkl
sm=load_samap(samap_file)
pid = samap_file.split('.')[1]
cp._info('pid: %s' % pid)
s1 = sm.sams[pid[:2]].adata.obs
if anno_label in s1:
s1[target_label]=s1[target_label].astype(str)+' ('+s1[anno_label].astype(str)+')'
s2 = sm.sams[pid[2:]].adata.obs
if anno_label in s2:
s2[target_label]=s2[target_label].astype(str)+' ('+s2[anno_label].astype(str)+')'
return sm
# alns = dict((pc._proc_calc_alignments(pc._proc_anno_leiden(sf), sf+'.alignment.tsv')) for sf in samap_files)
# alns for plot sankey plot
def _proc_calc_alignments(sm, outfile):
ks = list(sm.sams.keys())
ks.sort()
pid = ''.join(ks)
assignments = dict((k,'leiden_clusters') for k in sm.sams.keys())
D,MappingTable = get_mapping_scores(sm,assignments,n_top = -1)
MappingTable.to_csv(outfile, sep='\t', index=True, header=True)
return pid, (D, MappingTable)
# for stage3.gastrodermis comparison
# run samap for all ad vs {sp, hy, xe, nv}
# label: {'cell_type', 'metacell', 'cell_type_family'}
# map_opt: {'blast', 'prost'}
def _proc_gastrodermis(map_opt='blast', label='cell_type'):
maps= 'maps.%s/' % map_opt
alignments={}
assignments = {'ad':'Jake', 'sp':label} if label!=None else None
outprefix = '02.adsp.samap.%s.gastrodermis.jake_%s' % (map_opt, label) if label!=None else '02.adsp.samap.%s.gastrodermis.leiden' % (map_opt)
sm = _run_samap_with_h5('ad', '01.adig.sam.gastodermis.all_info.h5ad', 'sp', '01.spis.sam.gastodermis.all_info.h5ad', samobj=True, map_p=maps, assignments=assignments)
save_samap(sm, outprefix+'.pkl')
if label==None:
assignments={'ad':'leiden_clusters', 'sp': 'leiden_clusters'}
D,MappingTable = get_mapping_scores(sm,assignments,n_top = -1)
MappingTable.to_csv(outprefix+'.alignment_score.tsv', sep='\t', index=True, header=True)
alignments['adsp'] = (D, MappingTable)
assignments = {'ad':'Jake', 'hy':label} if label!=None else None
outprefix = '02.adhy.samap.%s.gastrodermis.jake_%s' % (map_opt, label) if label!=None else '02.adhy.samap.%s.gastrodermis.leiden' % (map_opt)
sm = _run_samap_with_h5('ad', '01.adig.sam.gastodermis.all_info.h5ad', 'hy', '01.hvul.sam.gastodermis.all_info.h5ad', samobj=True, map_p=maps, assignments=assignments)
save_samap(sm, outprefix+'.pkl')
if label==None:
assignments={'ad':'leiden_clusters', 'hy': 'leiden_clusters'}
D,MappingTable = get_mapping_scores(sm,assignments,n_top = -1)
MappingTable.to_csv(outprefix+'.alignment_score.tsv', sep='\t', index=True, header=True)
alignments['adhy'] = (D, MappingTable)
assignments = {'ad':'Jake', 'xe':label} if label!=None else None
outprefix = '02.adxe.samap.%s.gastrodermis.jake_%s' % (map_opt, label) if label!=None else '02.adxe.samap.%s.gastrodermis.leiden' % (map_opt)
sm = _run_samap_with_h5('ad', '01.adig.sam.gastodermis.all_info.h5ad', 'xe', '01.xesp.sam.gastodermis.all_info.h5ad', samobj=True, map_p=maps, assignments=assignments)
save_samap(sm, outprefix+'.pkl')
if label==None:
assignments={'ad':'leiden_clusters', 'xe': 'leiden_clusters'}
D,MappingTable = get_mapping_scores(sm,assignments,n_top = -1)
MappingTable.to_csv(outprefix+'.alignment_score.tsv', sep='\t', index=True, header=True)
alignments['adxe'] = (D, MappingTable)
assignments = {'ad':'Jake', 'nv':label} if label!=None else None
outprefix = '02.adnv.samap.%s.gastrodermis.jake_%s' % (map_opt, label) if label!=None else '02.adnv.samap.%s.gastrodermis.leiden' % (map_opt)
sm = _run_samap_with_h5('ad', '01.adig.sam.gastodermis.all_info.h5ad', 'nv', '01.nvec.sam.gastodermis.all_info.h5ad', samobj=True, map_p=maps, assignments=assignments)
save_samap(sm, outprefix+'.pkl')
if label==None:
assignments={'ad':'leiden_clusters', 'nv': 'leiden_clusters'}
D,MappingTable = get_mapping_scores(sm,assignments,n_top = -1)
MappingTable.to_csv(outprefix+'.alignment_score.tsv', sep='\t', index=True, header=True)
alignments['adnv'] = (D, MappingTable)
return alignments
# save: samap.adata.obs["leiden_clusters_dk"] = pd.Categorical(pc._leiden(samap.adata.uns['Dk'],res=3))
def _leiden(X, res=1, method="modularity", seed = 0):
if not sp.sparse.isspmatrix_csr(X):
X = sp.sparse.csr_matrix(X)
import igraph as ig
import leidenalg
sources, targets = X.nonzero()
weights = X[sources, targets]
if isinstance(weights, np.matrix):
weights = weights.A1
g = ig.Graph(directed=True)
g.add_vertices(X.shape[0])
g.add_edges(list(zip(sources, targets)))
try:
g.es["weight"] = weights
except BaseException:
pass
if method == "significance":
cl = leidenalg.find_partition(g, leidenalg.SignificanceVertexPartition,seed=seed)
else:
cl = leidenalg.find_partition(
g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=res,seed=seed
)
return np.array(cl.membership)
# calculate leiden cluster using given res
# compare new calculated leiden clusters with leiden_clusters_dk
# ARI and AMI are used for cluster similarity comparison
def _leiden_comp(sam, res):
import sklearn.metrics as sk
sam.leiden_clustering(res=res)
# np conversion is not necessary for now
a = np.array(sam.adata.obs['leiden_clusters'])
b = np.array(sam.adata.obs['leiden_clusters_dk'])
#return [sk.adjusted_mutual_info_score(a,b), sk.adjusted_rand_score(a,b)]
#return (res, sk.adjusted_rand_score(a,b))
return (res, sk.adjusted_mutual_info_score(a,b))
############################################################################################
# visualization related --------------------------------------------------------------
############################################################################################
def _heatmap(m):
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=0.5)
fig, ax = plt.subplots(figsize=(14,11))