🎯 Goal: Build a Bash script that runs your entire RNA-seq pipeline

A fully automated script that:
Downloads .fastq.gz files using a provided runinfo.csv
Trims reads in parallel using fastp
Indexes the genome if needed
Aligns reads with hisat2 (maximizing thread efficiency)
Generates a gene count matrix using featureCounts

🧠 Assumptions

✅ Place the following files in the ~/BifrostOmics/rnaseq_project/reference/ folder (the script expects exactly these file names unless you edit its PARAMETERS section):
runinfo.csv	e.g. runinfo.csv
Genome FASTA	e.g. GCF_000484505.2_ASM48450v2_genomic.fna
GTF annotation file	e.g. GCF_000484505.2_ASM48450v2_genomic.gtf
GFF file (optional)	e.g. GCF_000484505.2_ASM48450v2_genomic.gff
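✅ Your project directory is ~/BifrostOmics/rnaseq_project/ and the script is run from there
✅ The Conda environment bioinformaatika0 is active and provides fasterq-dump, parallel, fastp, hisat2, samtools and featureCounts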

🛠️ Step-by-step Bash Pipeline Script:

In [None]:
#!/bin/bash
# File: rnaseq_pipeline.sh
# Location: ~/BifrostOmics/rnaseq_project/
# Description: Full RNA-seq pipeline from SRA download to read count matrix

# Strict mode: abort on a failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# -------------------
# PARAMETERS
# -------------------
# Working directories, relative to the directory the script is run from.
RAW_DIR="raw_data"            # downloaded FASTQ files and SRR_list.txt
TRIMMED_DIR="trimmed_data"    # fastp output: trimmed reads and reports
ALIGN_DIR="aligned_data"      # sorted, indexed BAM files (was never defined)
COUNT_DIR="counts"            # featureCounts output
REF_DIR="reference"           # runinfo.csv, genome FASTA and annotation
INDEX_DIR="$REF_DIR/hisat_index"

GENOME_FA="$REF_DIR/GCF_000484505.2_ASM48450v2_genomic.fna"
GTF_FILE="$REF_DIR/GCF_000484505.2_ASM48450v2_genomic.gtf"
# Optional input; not referenced by the pipeline steps of this script.
GFF_FILE="$REF_DIR/GCF_000484505.2_ASM48450v2_genomic.gff"
# Index prefix: hisat2-build writes <prefix>.N.ht2 files under INDEX_DIR.
HISAT_INDEX="$INDEX_DIR/genome_index"

THREADS=8           # threads for hisat2-build and featureCounts
CORES_ALIGN=6       # hisat2 threads per sample (-p)
CORES_SAMTOOLS=2    # samtools sort threads per sample (-@)

# -------------------
# CHECK REQUIRED FILES
# -------------------
# Stop with exit status 1 as soon as a mandatory input is missing.
[[ -f "$REF_DIR/runinfo.csv" ]] || {
  echo "❌ runinfo.csv not found in $REF_DIR"
  exit 1
}
# Both the genome FASTA and the GTF annotation must be present.
if ! [[ -f "$GENOME_FA" && -f "$GTF_FILE" ]]; then
  echo "❌ Reference genome (.fna) and/or annotation (.gtf) missing in $REF_DIR"
  exit 1
fi

# -------------------
# PREP FOLDERS
# -------------------
# -p creates missing parents and accepts directories that already exist.
mkdir -p \
  "$RAW_DIR" \
  "$TRIMMED_DIR" \
  "$ALIGN_DIR" \
  "$COUNT_DIR" \
  "$INDEX_DIR"

# -------------------
# STEP 1: DOWNLOAD FASTQ FILES
# -------------------
echo "🔽 Downloading FASTQ files..."

# Run accessions come from the first CSV column; the header row is skipped.
# Trailing carriage returns and blank entries are dropped so every line handed
# to parallel is a usable accession. The list is written into $RAW_DIR because
# the later steps read it from "$RAW_DIR/SRR_list.txt".
awk -F, 'NR > 1 { sub(/\r$/, "", $1); if ($1 != "") print $1 }' \
  "$REF_DIR/runinfo.csv" > "$RAW_DIR/SRR_list.txt"

# fasterq-dump has no --gzip option (that flag belongs to the older
# fastq-dump), so each run is compressed with gzip once its download succeeds;
# the trimming step expects <run>_1.fastq.gz and <run>_2.fastq.gz.
# gzip -f overwrites archives left over from a previous run.
# The subshell keeps the directory change local to the download.
(
  cd "$RAW_DIR"
  parallel --jobs 4 \
    'fasterq-dump --split-files {} && gzip -f {}_*.fastq' \
    :::: SRR_list.txt
)

# -------------------
# STEP 2: TRIM READS IN PARALLEL
# -------------------
echo "✂️ Trimming reads in parallel..."

# Print the working directory; the data paths used below are relative to it.
pwd

#######################################
# Trim one paired-end run with fastp.
# Globals:   RAW_DIR (read), TRIMMED_DIR (read)
# Arguments: $1 - run accession used as the file-name prefix
# Outputs:   trimmed mate files plus an HTML and a JSON report in TRIMMED_DIR
# Returns:   exit status of fastp
#######################################
process_trim() {
    # local: keep the accession out of the caller's scope.
    local srr="$1"
    fastp \
      -i "$RAW_DIR/${srr}_1.fastq.gz" \
      -I "$RAW_DIR/${srr}_2.fastq.gz" \
      -o "$TRIMMED_DIR/${srr}_1.trimmed.fastq.gz" \
      -O "$TRIMMED_DIR/${srr}_2.trimmed.fastq.gz" \
      --detect_adapter_for_pe \
      --thread 2 \
      --html "$TRIMMED_DIR/${srr}.html" \
      --json "$TRIMMED_DIR/${srr}.json"
}

# GNU parallel runs every job in a new shell, so the function AND the
# variables it reads must be exported to be visible there.
export RAW_DIR TRIMMED_DIR
export -f process_trim
# Up to 4 runs are trimmed concurrently, one accession per line of the list.
parallel --jobs 4 process_trim :::: "$RAW_DIR/SRR_list.txt"

# -------------------
# STEP 3: HISAT2 INDEXING
# -------------------
echo "🧬 Building genome index (if needed)..."
# The file <prefix>.1.ht2 serves as the marker for an already-built index.
if [[ -f "${HISAT_INDEX}.1.ht2" ]]; then
  echo "Index already exists. Skipping."
else
  hisat2-build -p "$THREADS" "$GENOME_FA" "$HISAT_INDEX"
fi

# -------------------
# STEP 4: ALIGN READS
# -------------------
echo "🎯 Aligning reads..."

#######################################
# Align one trimmed paired-end run with hisat2, then sort and index the BAM.
# Globals:   TRIMMED_DIR, ALIGN_DIR, HISAT_INDEX, CORES_ALIGN,
#            CORES_SAMTOOLS (all read)
# Arguments: $1 - run accession used as the file-name prefix
# Outputs:   $ALIGN_DIR/<run>.sorted.bam and its index; diagnostics on stderr
# Returns:   1 if hisat2 or samtools sort fails, otherwise the exit status
#            of samtools index
#######################################
align_sample() {
    local srr="$1"
    local r1="$TRIMMED_DIR/${srr}_1.trimmed.fastq.gz"
    local r2="$TRIMMED_DIR/${srr}_2.trimmed.fastq.gz"
    local out_bam="$ALIGN_DIR/${srr}.sorted.bam"
    local -a stage_status

    hisat2 -p "$CORES_ALIGN" -x "$HISAT_INDEX" -1 "$r1" -2 "$r2" \
        | samtools sort -@ "$CORES_SAMTOOLS" -o "$out_bam"
    # A pipeline reports only its last stage unless pipefail is set, so a
    # hisat2 failure would go unnoticed; check every stage explicitly.
    stage_status=("${PIPESTATUS[@]}")

    if (( stage_status[0] != 0 || stage_status[1] != 0 )); then
        echo "❌ Alignment failed for ${srr} (hisat2 exit ${stage_status[0]}, samtools sort exit ${stage_status[1]})" >&2
        # Remove the incomplete BAM so it cannot be picked up downstream.
        rm -f -- "$out_bam"
        return 1
    fi

    samtools index "$out_bam"
}

# GNU parallel runs every job in a new shell, so the function AND the
# variables it reads must be exported to be visible there.
export TRIMMED_DIR ALIGN_DIR HISAT_INDEX CORES_ALIGN CORES_SAMTOOLS
export -f align_sample
# One sample at a time: each job already runs hisat2 and samtools sort with
# CORES_ALIGN and CORES_SAMTOOLS threads respectively.
parallel --jobs 1 align_sample :::: "$RAW_DIR/SRR_list.txt"

# -------------------
# STEP 5: FEATURECOUNTS
# -------------------
echo "📊 Counting reads with featureCounts..."

# The reads are paired-end (two mate files per run are trimmed and aligned),
# so featureCounts must be told with -p.
fc_args=(-T "$THREADS" -p -a "$GTF_FILE" -o "$COUNT_DIR/gene_counts.txt")

# Subread >= 2.0.2 additionally needs --countReadPairs to count fragments
# rather than individual mates; older releases reject that option, so it is
# added only when the installed featureCounts lists it in its usage text.
# "|| true": printing the usage is expected to end with a non-zero status.
fc_usage="$(featureCounts 2>&1 || true)"
if [[ "$fc_usage" == *"--countReadPairs"* ]]; then
  fc_args+=(--countReadPairs)
fi

featureCounts "${fc_args[@]}" "$ALIGN_DIR"/*.bam

# -------------------
# DONE!
# -------------------
echo "✅ All steps complete!"

To run the script:

In [None]:
# Run from the project root: the script uses paths relative to the current
# directory. "&&" prevents launching it from the wrong place if cd fails.
cd ~/BifrostOmics/rnaseq_project &&
  bash rnaseq_pipeline.sh