/
pipeline.cwl
202 lines (201 loc) · 5.63 KB
/
pipeline.cwl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env cwl-runner
# Document header: a CWL v1.0 Workflow plus the optional workflow features
# it declares.
class: Workflow
cwlVersion: v1.0
label: scRNA-seq pipeline using Salmon and Alevin
requirements:
  # Neither requirement takes parameters, hence the empty mappings.
  # NOTE(review): the step files under steps/ are not visible from this
  # document; confirm at least one `run:` target is itself a Workflow,
  # otherwise SubworkflowFeatureRequirement is unnecessary.
  SubworkflowFeatureRequirement: {}
  # Used by the `fastqc` step, which scatters over `fastq_dir`.
  ScatterFeatureRequirement: {}
# Workflow-level inputs. Each key is an input id that steps reference through
# `source:`.
inputs:
  # An array of directories (Directory[]), despite the singular label.
  # Passed whole to adjust_barcodes, trim_reads, salmon and annotate_cells;
  # the fastqc step scatters over it.
  fastq_dir:
    label: "Directory containing FASTQ files"
    type: Directory[]
  # Forwarded unchanged to adjust_barcodes, trim_reads, salmon,
  # annotate_cells, scanpy_analysis and compute_qc_results.
  assay:
    label: "scRNA-seq assay"
    type: string
  # Also forwarded to trim_reads and fastqc, not only to salmon.
  threads:
    label: "Number of threads for Salmon"
    type: int
    default: 1
  # Optional (`int?`); only the salmon step consumes it.
  expected_cell_count:
    type: int?
# Workflow-level outputs. Every `outputSource` has the form
# `<step id>/<step output id>` and refers to an entry in that step's `out`
# list.
outputs:
  # --- salmon / alevin -----------------------------------------------------
  salmon_output:
    outputSource: salmon/output_dir
    type: Directory
    label: "Full output of `salmon alevin`"
  # Sourced from annotate_cells, i.e. after cell annotation has been applied
  # to alevin_to_anndata's `expr_h5ad`.
  count_matrix_h5ad:
    outputSource: annotate_cells/annotated_h5ad_file
    type: File
    label: "Unfiltered count matrix from Alevin, converted to H5AD, spliced and unspliced counts"
  # Sourced directly from alevin_to_anndata; does not pass through
  # annotate_cells.
  raw_count_matrix:
    outputSource: alevin_to_anndata/raw_expr_h5ad
    type: File
    label: "Unfiltered count matrix from Alevin, converted to H5AD, with intronic regions"

  # --- quality control -----------------------------------------------------
  # Array-valued because the fastqc step is scattered over `fastq_dir`.
  fastqc_dir:
    outputSource: fastqc/fastqc_dir
    type: Directory[]
    label: "Directory of FastQC output files, mirroring input directory structure"
  scanpy_qc_results:
    outputSource: compute_qc_results/scanpy_qc_results
    type: File
    label: "Quality control metrics from Scanpy"
  qc_report:
    outputSource: compute_qc_results/qc_metrics
    type: File
    label: "Quality control report in JSON format"

  # --- scanpy secondary analysis -------------------------------------------
  dispersion_plot:
    outputSource: scanpy_analysis/dispersion_plot
    type: File
    label: "Gene expression dispersion plot"
  umap_plot:
    outputSource: scanpy_analysis/umap_plot
    type: File
    label: "UMAP dimensionality reduction plot"
  umap_density_plot:
    outputSource: scanpy_analysis/umap_density_plot
    type: File
    label: "UMAP dimensionality reduction plot, colored by cell density"
  # Optional (`File?`): the workflow does not require this file to exist.
  slideseq_plot:
    outputSource: scanpy_analysis/slideseq_plot
    type: File?
    label: "Slide-seq bead plot, colored by Leiden cluster"
  filtered_data_h5ad:
    outputSource: scanpy_analysis/filtered_data_h5ad
    type: File
    label: Full data set of filtered results
    # Folded block scalar: the three lines below are joined with spaces and
    # must stay indented deeper than `doc:` to remain part of the value.
    doc: >-
      Full data set of filtered results: expression matrix, coordinates in
      dimensionality-reduced space (PCA and UMAP), cluster assignments via
      the Leiden algorithm, and marker genes for one cluster vs. rest
  marker_gene_plot_t_test:
    outputSource: scanpy_analysis/marker_gene_plot_t_test
    type: File
    label: "Cluster marker genes, t-test"
  marker_gene_plot_logreg:
    outputSource: scanpy_analysis/marker_gene_plot_logreg
    type: File
    label: "Cluster marker genes, logreg method"

  # --- scVelo RNA velocity -------------------------------------------------
  scvelo_annotated_h5ad:
    outputSource: scvelo_analysis/annotated_h5ad_file
    type: File
    label: "scVelo-annotated h5ad file, including cell RNA velocity"
  scvelo_embedding_grid_plot:
    outputSource: scvelo_analysis/embedding_grid_plot
    type: File
    label: "scVelo velocity embedding grid plot"
  # Optional (`File?`): the workflow does not require this file to exist.
  scvelo_embedding_stream_plot:
    outputSource: scvelo_analysis/embedding_stream_plot
    type: File?
    label: "scVelo velocity embedding stream plot"
# Workflow steps. Execution order is not given by the order of the keys; it
# follows from the `source:` references between steps:
#
#   adjust_barcodes -> trim_reads -> salmon -> alevin_to_anndata
#     -> annotate_cells -> { scanpy_analysis, scvelo_analysis }
#     -> compute_qc_results
#
# fastqc depends only on workflow inputs.
steps:
  # Reads the raw FASTQ directories; its two outputs feed trim_reads
  # (adj_fastq_dir) and annotate_cells (metadata_json).
  adjust_barcodes:
    in:
      fastq_dir:
        source: fastq_dir
      assay:
        source: assay
    out: [adj_fastq_dir, metadata_json]
    run: steps/adjust-barcodes.cwl

  # Receives both the original directories and the barcode-adjusted
  # directory produced by adjust_barcodes.
  trim_reads:
    in:
      orig_fastq_dirs:
        source: fastq_dir
      adj_fastq_dir:
        source: adjust_barcodes/adj_fastq_dir
      assay:
        source: assay
      threads:
        source: threads
    out: [trimmed_fastq_dir]
    run: steps/trim-reads.cwl

  # Consumes the trimmed reads. `output_dir` is exposed as the workflow
  # output `salmon_output` and also feeds alevin_to_anndata and
  # compute_qc_results.
  salmon:
    in:
      orig_fastq_dirs:
        source: fastq_dir
      trimmed_fastq_dir:
        source: trim_reads/trimmed_fastq_dir
      assay:
        source: assay
      threads:
        source: threads
      # Optional workflow input (`int?`).
      expected_cell_count:
        source: expected_cell_count
    out:
      - output_dir
    run: steps/salmon.cwl
    label: "Salmon Alevin, with index from GRCh38 transcriptome"

  # Scattered: one invocation per element of the `fastq_dir` array, with the
  # per-invocation results collected into an array (hence the workflow
  # output type Directory[]). `threads` is not scattered, so every
  # invocation receives the same value.
  fastqc:
    scatter: [fastq_dir]
    # Only one input is scattered, so the method has no pairing to decide.
    scatterMethod: dotproduct
    in:
      fastq_dir:
        source: fastq_dir
      threads:
        source: threads
    out:
      - fastqc_dir
    run: steps/fastqc.cwl
    label: "Run fastqc on all fastq files in fastq directory"

  # `expr_h5ad` goes on to annotate_cells; `raw_expr_h5ad` is exposed
  # directly as the workflow output `raw_count_matrix`.
  alevin_to_anndata:
    in:
      alevin_dir:
        source: salmon/output_dir
    out:
      - expr_h5ad
      - raw_expr_h5ad
    run: steps/alevin-to-anndata.cwl
    label: "Convert Alevin output to AnnData object in h5ad format"

  # Combines the converted matrix with the metadata emitted by
  # adjust_barcodes. Its single output is the shared input of
  # scanpy_analysis, scvelo_analysis and compute_qc_results.
  annotate_cells:
    in:
      orig_fastq_dirs:
        source: fastq_dir
      assay:
        source: assay
      h5ad_file:
        source: alevin_to_anndata/expr_h5ad
      metadata_json:
        source: adjust_barcodes/metadata_json
    out:
      - annotated_h5ad_file
    run: steps/annotate-cells.cwl

  scanpy_analysis:
    in:
      assay:
        source: assay
      h5ad_file:
        source: annotate_cells/annotated_h5ad_file
    out:
      - filtered_data_h5ad
      - umap_plot
      - marker_gene_plot_t_test
      - marker_gene_plot_logreg
      - dispersion_plot
      - umap_density_plot
      # Declared optional (`File?`) in the workflow outputs.
      - slideseq_plot
    run: steps/scanpy-analysis.cwl
    label: "Secondary analysis via ScanPy"

  # Independent of scanpy_analysis: both read the same annotated file.
  scvelo_analysis:
    in:
      spliced_h5ad_file:
        source: annotate_cells/annotated_h5ad_file
    out:
      - annotated_h5ad_file
      - embedding_grid_plot
      # Declared optional (`File?`) in the workflow outputs.
      - embedding_stream_plot
    run: steps/scvelo-analysis.cwl
    label: "RNA velocity analysis via scVelo"

  # Runs last in the scanpy branch: it needs the annotated matrix, the
  # filtered matrix from scanpy_analysis, and the Salmon output directory.
  compute_qc_results:
    in:
      assay:
        source: assay
      h5ad_primary:
        source: annotate_cells/annotated_h5ad_file
      h5ad_secondary:
        source: scanpy_analysis/filtered_data_h5ad
      salmon_dir:
        source: salmon/output_dir
    out:
      - scanpy_qc_results
      - qc_metrics
    run: steps/compute-qc-metrics.cwl
    label: "Compute QC metrics"