In [1]:
#=== Install and Imports ===#

## installs (in shell) ##
## install in a conda env:
# conda activate salmon
# conda install -c conda-forge -c bioconda salmon


## imports ##
import pipeline_utils
import os
import urllib.request
from datetime import datetime
import sys
from contextlib import redirect_stdout
import subprocess


In [2]:
#=== Step 1: Generate and document our reference genomes ===#
## NOTE: genome assemblies can be obtained at: http://ftp.ensembl.org/pub/
## NOTE: check for current builds
## NOTE: for documentation purposes, save the date/time and urls in a logfile along with genome files
## NOTE: for GRCh37 build;
# 'dna_rm' - masked genomic DNA. Interspersed repeats and low complexity regions are detected with the RepeatMasker tool and masked by replacing repeats with 'N's.
# 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions have been replaced with lowercased versions of their nucleic base.
## NOTE: hg19 (maintained by UCSC) = GRCh37 (maintained by genome reference consortium), same for hg38/GRCh38


## Define human genome sequences and annotations to download ##
# Each top-level key is a build name mapping to the URLs to fetch for that build.
# Per the output of this cell, files are saved under <output_dir>/<build name>/.
human_output_dir = "../0.local/generic-single-cell-pipeline/genomes/human/"
os.makedirs(human_output_dir, exist_ok=True)
human_genomes = {
    "GRCh38": {
        "genome_url": "http://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
        "gtf_url": "http://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz",
        "cdna_url": "http://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz",
        "cds_url": "http://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz",
        "ncrna_url": "http://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz",
        "pep_url": "http://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz"
    }
}

## Define mouse genome sequences and annotations to download ##
# NOTE(review): mouse entries use the key "fasta_url" while the human entry uses "genome_url";
# confirm which key names pipeline_utils.download_genome expects before downloading mouse genomes.
mouse_output_dir = "../0.local/generic-single-cell-pipeline/genomes/mouse/"
os.makedirs(mouse_output_dir, exist_ok=True)
mouse_genomes = {
    # The build name must match the files it points at, otherwise the saved directory and
    # logfile document the wrong assembly. Ensembl release 102 is the last release on GRCm38.
    "GRCm38": {
        "fasta_url": "http://ftp.ensembl.org/pub/release-102/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gz",
        "gtf_url": "http://ftp.ensembl.org/pub/release-102/gtf/mus_musculus/Mus_musculus.GRCm38.102.gtf.gz"
    },
    "GRCm39": {
        "fasta_url": "http://ftp.ensembl.org/pub/release-113/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.primary_assembly.fa.gz",
        "gtf_url": "http://ftp.ensembl.org/pub/release-113/gtf/mus_musculus/Mus_musculus.GRCm39.113.gtf.gz"
    }
}

#### Download
# Only the human genomes are fetched here; mouse_genomes is defined above but not downloaded in this cell.
pipeline_utils.download_genome(human_genomes, human_output_dir)




Downloading 1 genomes
Currently downloading genome GRCh38
Saving genome data at ../0.local/generic-single-cell-pipeline/genomes/human/GRCh38
Saving logfile to: ../0.local/generic-single-cell-pipeline/genomes/human/GRCh38/GRCh38_download_log.txt


In [4]:
# NOTE (from the salmon documentation; relevant to the later quantification step, not to indexing):
# Salmon, like eXpress, uses a streaming inference method to perform transcript-level quantification.
# One of the fundamental assumptions of such inference methods is that observations (i.e. reads or alignments)
# are made "at random". This means, for example, that alignments should not be sorted by target or position.
# If your reads or alignments do not appear in a random order with respect to the target transcripts,
# please randomize / shuffle them before performing quantification with Salmon.

#=== Build transcript indexes using salmon ===#
# Index the GRCh38 cDNA (transcriptome) FASTA fetched in Step 1.
# Paths are derived from `human_output_dir` (defined in the Step 1 cell) so the index is always
# built from the files this notebook downloaded, rather than from a machine-specific absolute path.
SALMON_THREADS = 16  # number of threads passed to `salmon index`

grch38_dir = os.path.abspath(os.path.join(human_output_dir, "GRCh38"))
cdna_path = os.path.join(grch38_dir, "Homo_sapiens.GRCh38.cdna.all.fa.gz")
# trailing "" keeps the trailing path separator on the index directory
index_path = os.path.join(grch38_dir, "index", "transcript", "salmon", "")

# Fail early with a clear message instead of letting salmon error out on a missing input.
if not os.path.isfile(cdna_path):
    raise FileNotFoundError(
        f"cDNA FASTA not found at {cdna_path}; run the Step 1 download cell first."
    )

# check=True raises CalledProcessError on a non-zero exit code, so a failed index build
# cannot go unnoticed; on success the CompletedProcess is still displayed as the cell output.
subprocess.run(
    [
        "salmon", "index",
        "-t", cdna_path,
        "-i", index_path,
        "--threads", str(SALMON_THREADS),
    ],
    check=True,
)

Version Info: This is the most recent version of salmon.
[2025-04-18 13:15:15.496] [jLog] [info] building index
out : /home/ubuntu1/GitHub/0.local/generic-single-cell-pipeline/genomes/human/GRCh38/index/transcript/salmon/
[2025-04-18 13:15:15.496] [puff::index::jointLog] [info] Running fixFasta

[Step 1 of 4] : counting k-mers



[2025-04-18 13:15:33.753] [puff::index::jointLog] [info] Replaced 100,005 non-ATCG nucleotides
[2025-04-18 13:15:33.753] [puff::index::jointLog] [info] Clipped poly-A tails from 1,517 transcripts
wrote 194120 cleaned references
[2025-04-18 13:15:35.100] [puff::index::jointLog] [info] Filter size not provided; estimating from number of distinct k-mers
[2025-04-18 13:15:41.062] [puff::index::jointLog] [info] ntHll estimated 115908649 distinct k-mers, setting filter size to 2^31


Threads = 16
Vertex length = 31
Hash functions = 5
Filter size = 2147483648
Capacity = 2
Files: 
/home/ubuntu1/GitHub/0.local/generic-single-cell-pipeline/genomes/human/GRCh38/index/transcript/salmon/ref_k31_fixed.fa
--------------------------------------------------------------------------------
Round 0, 0:2147483648
Pass	Filling	Filtering
1	14	22	
2	17	0
True junctions count = 796453
False junctions count = 1438281
Hash table size = 2234734
Candidate marks count = 13034038
--------------------------------------------------------------------------------
Reallocating bifurcations time: 1
True marks count: 8240193
Edges construction time: 17
--------------------------------------------------------------------------------
Distinct junctions = 796453



TwoPaCo::buildGraphMain:: allocated with scalable_malloc; freeing.
TwoPaCo::buildGraphMain:: Calling scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, 0);
allowedIn: 18
Max Junction ID: 903709
seen.size():7229681 kmerInfo.size():903710
approximateContigTotalLength: 81004901
counters for complex kmers:
(prec>1 & succ>1)=55259 | (succ>1 & isStart)=849 | (prec>1 & isEnd)=854 | (isStart & isEnd)=80
contig count: 1195650 element count: 152578102 complex nodes: 57042
# of ones in rank vector: 1195649
[2025-04-18 13:17:23.022] [puff::index::jointLog] [info] Starting the Pufferfish indexing by reading the GFA binary file.
[2025-04-18 13:17:23.022] [puff::index::jointLog] [info] Setting the index/BinaryGfa directory /home/ubuntu1/GitHub/0.local/generic-single-cell-pipeline/genomes/human/GRCh38/index/transcript/salmon
size = 152578102
-----------------------------------------
| Loading contigs | Time = 26.286 ms
-----------------------------------------
size = 152578102
------------------

for info, total work write each  : 2.331    total work inram from level 3 : 4.322  total work raw : 25.000 
Bitarray       611521472  bits (100.00 %)   (array + ranks )
final hash             0  bits (0.00 %) (nb in final hash 0)


[2025-04-18 13:17:37.666] [puff::index::jointLog] [info] finished writing dense pufferfish index
[2025-04-18 13:17:37.825] [jLog] [info] done building index


CompletedProcess(args=['salmon', 'index', '-t', '/home/ubuntu1/GitHub/0.local/generic-single-cell-pipeline/genomes/human/GRCh38/Homo_sapiens.GRCh38.cdna.all.fa.gz', '-i', '/home/ubuntu1/GitHub/0.local/generic-single-cell-pipeline/genomes/human/GRCh38/index/transcript/salmon/', '--threads', '16'], returncode=0)

In [21]:
#=== Processing single-cell data generated by 10X with cell ranger ===#
# see link for guide: https://www.10xgenomics.com/support/software/cell-ranger/downloads#download-links

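## get the compiled binary file ##
# NOTE: the download URL below is a time-limited signed link (see its Expires/Signature parameters);
# generate a fresh one from the downloads page linked above before re-running.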
#url = "https://cf.10xgenomics.com/releases/cell-exp/cellranger-9.0.1.tar.gz?Expires=1745041256&Key-Pair-Id=APKAI7S6A5RYOXBWRPDA&Signature=JIIo9YRo~c4N0qV9Jk~pf0bIijj7E4wYUbHbcyVZEhj0lDIgrVsXSzV2623rrjdlZ2EVM8M5R-1Kaej5HsAtDy2Ib~Jjr9YlSvRMO0U5mEDo9cYK0kd5jZFpRkDs2QByTEb0iE-5MFAIqq2yTLnyyjp4xGQB1UTOQ2M2romVOBLBZpZ2ulx0Myg-KcSWxTsbmzVuhM7YW~wYKJ22s~R1sQl9bJ~ybQTa-eoCL1x7LjFlfGdUzKQPjapTBvhzqU0xkPIhHWiiqjKRSZmEDDQKAcL4y~P~WauLdUP6VqTBRAtdW4NcZ6wE03t67M~nVGVffE6LWsuVFFSFdv43xUV-YQ__"
#
#subprocess.run(["wget", "-O", "cellranger-9.0.1.tar.gz", url],
#              cwd="/home/ubuntu1/programs/cellranger/")

# untar gunzip
#subprocess.run(["tar", "-xvzf", "cellranger-9.0.1.tar.gz"],
#              cwd="/home/ubuntu1/programs/cellranger/")

# append cellranger to PATH via ~/.bashrc; this only takes effect in new interactive shells
#cellranger_path = "/home/ubuntu1/programs/cellranger/cellranger-9.0.1/bin/"
#bashrc_line = f'\nexport PATH="{cellranger_path}:$PATH"\n'
#subprocess.run(
#    f'echo "{bashrc_line}" >> ~/.bashrc',
#    shell=True,
#    executable="/bin/bash"
#)

# check the path in python
#import os
#print(os.environ["PATH"])

## NOTE: it is required to explicitly specify the path of cellranger here to run the subprocess command ##
# test install
#env = os.environ.copy()
#env["PATH"] = f"/home/ubuntu1/programs/cellranger/cellranger-9.0.1/bin/:{env['PATH']}"
#subprocess.run(["cellranger", "--version"], env=env)

## download the reference genome ##
#url = "https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2024-A.tar.gz"
#subprocess.run(["mkdir", "reference"],
              #cwd="/home/ubuntu1/programs/cellranger/")
#subprocess.run(["wget", "-O", "refdata-gex-GRCh38-2024-A.tar.gz", url],
              #cwd="/home/ubuntu1/programs/cellranger/reference")

# untar gunzip
#subprocess.run(["tar", "-xvzf", "refdata-gex-GRCh38-2024-A.tar.gz"],
#              cwd="/home/ubuntu1/programs/cellranger/reference")




refdata-gex-GRCh38-2024-A/
refdata-gex-GRCh38-2024-A/reference.json
refdata-gex-GRCh38-2024-A/star/
refdata-gex-GRCh38-2024-A/star/exonGeTrInfo.tab
refdata-gex-GRCh38-2024-A/star/sjdbInfo.txt
refdata-gex-GRCh38-2024-A/star/chrLength.txt
refdata-gex-GRCh38-2024-A/star/exonInfo.tab
refdata-gex-GRCh38-2024-A/star/Genome
refdata-gex-GRCh38-2024-A/star/chrName.txt
refdata-gex-GRCh38-2024-A/star/chrStart.txt
refdata-gex-GRCh38-2024-A/star/chrNameLength.txt
refdata-gex-GRCh38-2024-A/star/geneInfo.tab
refdata-gex-GRCh38-2024-A/star/transcriptInfo.tab
refdata-gex-GRCh38-2024-A/star/sjdbList.fromGTF.out.tab
refdata-gex-GRCh38-2024-A/star/sjdbList.out.tab
refdata-gex-GRCh38-2024-A/star/genomeParameters.txt
refdata-gex-GRCh38-2024-A/star/SA
refdata-gex-GRCh38-2024-A/star/SAindex
refdata-gex-GRCh38-2024-A/fasta/
refdata-gex-GRCh38-2024-A/fasta/genome.fa.fai
refdata-gex-GRCh38-2024-A/fasta/genome.fa
refdata-gex-GRCh38-2024-A/genes/
refdata-gex-GRCh38-2024-A/genes/genes.gtf.gz


CompletedProcess(args=['tar', '-xvzf', 'refdata-gex-GRCh38-2024-A.tar.gz'], returncode=0)

In [None]:
#=== Find sequencing runs that use 3' capture technology ===#


