bash script for initial fastqc on raw reads - important to specify an output directory (with -o flag)

sarah said any job under 24 hours is still considered a short job, so good to just have that be your default so things don't run out of time

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Run FastQC on every raw read file and collect the reports in one
# output directory.

# Abort on the first failing command so a broken setup is not followed by a
# half-finished analysis. (-u is left off because module/conda activation
# scripts may reference unset variables.)
set -eo pipefail

raw_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/methyl_raw"
out_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/fastqc"
env_name="myenv"

module load miniconda/4.11.0

# A batch shell is non-interactive, so the conda shell functions have to be
# loaded explicitly before `conda activate` can be used.
source "$(conda info --base)/etc/profile.d/conda.sh"

conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda config --set channel_priority strict

# Create the environment only when it does not exist yet; -y answers the
# confirmation prompt that nobody can answer inside a batch job.
env_list="$(conda env list)"
if ! grep -Eq "^${env_name}[[:space:]]" <<<"${env_list}"; then
  conda create -y --name "${env_name}" python=3.11
fi
conda activate "${env_name}"

conda install -y fastqc

# FastQC does not create the output directory itself.
mkdir -p "${out_dir}"

# -t lets FastQC process several files at once, using the cores requested above.
fastqc -t "${SLURM_CPUS_PER_TASK:-4}" -o "${out_dir}" "${raw_dir}"/*

easier to look and interpret the fastqc results with multiqc

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Summarise the individual FastQC reports into one MultiQC report.

# Abort on the first failing command. (-u is left off because module/conda
# activation scripts may reference unset variables.)
set -eo pipefail

fastqc_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/fastqc/fastqc_html"
out_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/multiqc"
env_name="myenv"

module load miniconda/4.11.0

# A batch shell is non-interactive, so the conda shell functions have to be
# loaded explicitly before `conda activate` can be used.
source "$(conda info --base)/etc/profile.d/conda.sh"

conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda config --set channel_priority strict

# Create the environment only when it does not exist yet; -y answers the
# confirmation prompt that nobody can answer inside a batch job.
env_list="$(conda env list)"
if ! grep -Eq "^${env_name}[[:space:]]" <<<"${env_list}"; then
  conda create -y --name "${env_name}" python=3.11
fi
conda activate "${env_name}"

conda install -y multiqc

mkdir -p "${out_dir}"

multiqc -o "${out_dir}" "${fastqc_dir}"/*


the result of this is an HTML report - have to download it, then open it in a browser

assuming everything looks okay (or good enough, quality won't be there yet) - trim reads - I used trim-galore

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Quality- and adapter-trim every raw paired-end sample with Trim Galore.

# Abort on the first failing command. (-u is left off because module/conda
# activation scripts may reference unset variables.)
set -eo pipefail

# Absolute paths, so the job works no matter where it is submitted from.
raw_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/methyl_raw"
out_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences"
env_name="cutadaptenv"

#-----------------modules-----------------#
module load miniconda/4.11.0

# A batch shell is non-interactive, so the conda shell functions have to be
# loaded explicitly before `conda activate` can be used.
source "$(conda info --base)/etc/profile.d/conda.sh"

# have to make sure you have cutadapt AND fastqc installed before trim_galore
# Create the environment only once; -y answers the confirmation prompt.
env_list="$(conda env list)"
if ! grep -Eq "^${env_name}[[:space:]]" <<<"${env_list}"; then
  conda create -y -n "${env_name}" -c conda-forge -c bioconda cutadapt
fi
conda activate "${env_name}"

conda install -y -c conda-forge -c bioconda trim-galore fastqc

#-----------------commands----------------#

mkdir -p "${out_dir}"

# Iterate over the R1 files only, so that every pair is trimmed exactly once.
shopt -s nullglob
r1_files=("${raw_dir}"/*CV_R1_001.fastq.gz)
if (( ${#r1_files[@]} == 0 )); then
  echo "No *CV_R1_001.fastq.gz files found in ${raw_dir}" >&2
  exit 1
fi

for r1 in "${r1_files[@]}"; do
  # The mate file differs from the R1 file only in the R1/R2 tag.
  r2="${r1%_R1_001.fastq.gz}_R2_001.fastq.gz"
  if [[ ! -f "${r2}" ]]; then
    echo "Missing R2 mate for ${r1}" >&2
    exit 1
  fi
  trim_galore -q 20 --phred33 --length 20 --max_length 40 --paired \
    --output_dir "${out_dir}" "${r1}" "${r2}"
done

#
# SLURM exports SLURM_JOB_NAME; $JOB_NAME is only set by SGE.
echo "= $(date) job ${SLURM_JOB_NAME:-} done"

-q 20 trims bases with a Phred quality below 20 (--phred33 only states that the quality scores use Phred+33 encoding), min length 20bp, max length 40bp, paired-ends, didn't specify an adapter to remove so the default is auto-detect - and need to specify output directory

then fastqc and multiqc on these filtered and trimmed samples and compared - made sure quality checks were better

for mapping, tried hisat2 but was getting low alignment rates - sticking with bowtie2 instead - get alignment rates between ~85-90%

Submitted GenBank assembly GCA_002022765.4 https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_002022765.2/

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Build a Bowtie2 index for the GenBank (GCA) assembly and align every
# trimmed paired-end sample to it, writing one SAM file per sample.

# Abort on the first failing command. (-u is left off because module
# scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #

module load miniconda/4.11.0
module load bowtie2/2.4.2

# Use the cores granted by SLURM (falls back to the 4 requested above).
threads="${SLURM_CPUS_PER_TASK:-4}"

#------------build index------------------#

# This creates the index files in the current directory
reference_fasta="GCA_002022765.4_C_virginica-3.0_genomic.fna"
reference_index="reference_index"
bowtie2-build --threads "${threads}" "${reference_fasta}" "${reference_index}"

# Directory containing your input FASTQ files
input_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files"

# Output directory for Bowtie2 results
output_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2/filtered_alignment_output"
mkdir -p "${output_dir}"

# Suffixes Trim Galore gives to the validated R1/R2 files.
r1_suffix="_R1_001_val_1.fq.gz"
r2_suffix="_R2_001_val_2.fq.gz"

# Derive the sample list from the R1 files present in the input directory
# instead of maintaining a hand-written array of sample names.
shopt -s nullglob
r1_files=("${input_dir}"/*"${r1_suffix}")
if (( ${#r1_files[@]} == 0 )); then
  echo "No *${r1_suffix} files found in ${input_dir}" >&2
  exit 1
fi

for read1 in "${r1_files[@]}"; do
  sample_name="$(basename "${read1}" "${r1_suffix}")"
  read2="${input_dir}/${sample_name}${r2_suffix}"
  if [[ ! -f "${read2}" ]]; then
    echo "Missing R2 mate for sample ${sample_name}" >&2
    exit 1
  fi

  # Output SAM file with full path to the output directory
  output_sam="${output_dir}/${sample_name}_alignment.sam"

  # Run Bowtie2 for paired-end reads; --reorder keeps the SAM records in
  # input order even though several threads are used.
  bowtie2 --very-sensitive --local -p "${threads}" --reorder \
    -x "${reference_index}" -1 "${read1}" -2 "${read2}" -S "${output_sam}"
done

need to find another way to do the sample names better - more automated - but struggling with getting that part of the code to work

In [None]:
# Lines of the Slurm log that report an overall alignment rate
alignment_rate_lines = []

# Slurm output file written by the Bowtie2 job
slurm_out_file = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2/slurm-13167043.out'

# Scan the log once and keep every line mentioning 'overall alignment rate',
# with surrounding whitespace removed
with open(slurm_out_file, 'r') as log_handle:
    alignment_rate_lines.extend(
        log_line.strip()
        for log_line in log_handle
        if 'overall alignment rate' in log_line
    )

# Show the collected lines, one per row
if alignment_rate_lines:
    print("\n".join(alignment_rate_lines))

finding the overall alignment rates for each sample from the slurm.out file

88.69% overall alignment rate
87.89% overall alignment rate
88.79% overall alignment rate
88.51% overall alignment rate
88.17% overall alignment rate
88.19% overall alignment rate
88.64% overall alignment rate
88.59% overall alignment rate
88.58% overall alignment rate
89.15% overall alignment rate
88.38% overall alignment rate
88.77% overall alignment rate
88.71% overall alignment rate
88.28% overall alignment rate
88.79% overall alignment rate
87.70% overall alignment rate
88.44% overall alignment rate
88.46% overall alignment rate
87.39% overall alignment rate
87.72% overall alignment rate
88.25% overall alignment rate
88.66% overall alignment rate
88.38% overall alignment rate
88.12% overall alignment rate
88.34% overall alignment rate
87.25% overall alignment rate
88.63% overall alignment rate
87.33% overall alignment rate
88.85% overall alignment rate
88.54% overall alignment rate

now need to convert SAM files to BAM files and sort those for downstream analysis

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Convert every Bowtie2 SAM file into a BAM file next to it.

# Abort on the first failing command. (-u is left off because module
# scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #

module load miniconda/4.11.0
module load samtools/1.9
#
# ----------------Your Commands------------------- #
#
sam_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2/SAM_files"
threads="${SLURM_CPUS_PER_TASK:-4}"

# SLURM exports SLURM_JOB_* variables; $JOB_NAME, $QUEUE and $JOB_ID are SGE names.
echo "+ $(date) job ${SLURM_JOB_NAME:-} started in ${SLURM_JOB_PARTITION:-} with jobID=${SLURM_JOB_ID:-} on ${HOSTNAME}"
#

# Glob the SAM files directly instead of parsing `ls`, so each file is
# converted exactly once and other files in the directory are ignored.
shopt -s nullglob
sam_files=("${sam_dir}"/*CV_alignment.sam)
if (( ${#sam_files[@]} == 0 )); then
  echo "No *CV_alignment.sam files found in ${sam_dir}" >&2
  exit 1
fi

for sam in "${sam_files[@]}"; do
  samtools view -b -@ "${threads}" -o "${sam%.sam}.bam" "${sam}"
done
#
echo "= $(date) job ${SLURM_JOB_NAME:-} done"

now have BAM files, and needed to move the BAM files to the right directory - now sorting BAM files

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Coordinate-sort every alignment BAM file, writing <sample>-CV_sorted.bam
# next to the input.

# Abort on the first failing command. (-u is left off because module
# scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #
#
module load miniconda/4.11.0
module load samtools/1.9
#
# ----------------Your Commands------------------- #
#
bam_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2/BAM_files"
threads="${SLURM_CPUS_PER_TASK:-4}"

# SLURM exports SLURM_JOB_* variables; $JOB_NAME, $QUEUE and $JOB_ID are SGE names.
echo "+ $(date) job ${SLURM_JOB_NAME:-} started in ${SLURM_JOB_PARTITION:-} with jobID=${SLURM_JOB_ID:-} on ${HOSTNAME}"
#
# Match only the unsorted alignment files, so re-running the job does not
# pick up the *_sorted.bam files it produced earlier.
shopt -s nullglob
bam_files=("${bam_dir}"/*CV_alignment.bam)
if (( ${#bam_files[@]} == 0 )); then
  echo "No *CV_alignment.bam files found in ${bam_dir}" >&2
  exit 1
fi

for bam in "${bam_files[@]}"; do
  samtools sort -@ "${threads}" -o "${bam%_alignment.bam}_sorted.bam" "${bam}"
done
#
echo "= $(date) job ${SLURM_JOB_NAME:-} done"

now have sorted BAM files - need to run picard tool and mark optical duplicates

In [None]:
#!/bin/bash
# ----------------Parameters---------------------- #
#
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID
#
# Mark duplicate reads in every sorted BAM file with Picard MarkDuplicates.

# Abort on the first failing command. (-u is left off because module/conda
# activation scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #
#
module load miniconda/4.11.0

# A batch shell is non-interactive, so the conda shell functions have to be
# loaded explicitly before `conda activate` can be used.
source "$(conda info --base)/etc/profile.d/conda.sh"

# Create the Java environment only once; -y answers the confirmation prompt.
env_name="my-java-environment"
env_list="$(conda env list)"
if ! grep -Eq "^${env_name}[[:space:]]" <<<"${env_list}"; then
  conda create -y -n "${env_name}" -c conda-forge openjdk=17
fi
conda activate "${env_name}"

#
# ----------------Your Commands------------------- #
#
# Define the path to the Picard JAR file
PICARD_JAR="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/picard.jar"
if [[ ! -f "${PICARD_JAR}" ]]; then
  echo "Picard JAR not found: ${PICARD_JAR}" >&2
  exit 1
fi

sorted_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2/sorted_BAM_files"

# SLURM exports SLURM_JOB_* variables; $JOB_NAME, $QUEUE and $JOB_ID are SGE names.
echo "+ $(date) job ${SLURM_JOB_NAME:-} started in ${SLURM_JOB_PARTITION:-} with jobID=${SLURM_JOB_ID:-} on ${HOSTNAME}"
#
# Match only the sorted inputs, so the marked_duplicates outputs written to
# the same directory are not processed again on a re-run.
shopt -s nullglob
sorted_bams=("${sorted_dir}"/*CV_sorted.bam)
if (( ${#sorted_bams[@]} == 0 )); then
  echo "No *CV_sorted.bam files found in ${sorted_dir}" >&2
  exit 1
fi

for bam in "${sorted_bams[@]}"; do
  # Sample prefix including the trailing '-', e.g. ".../2018--BBB-WBO-B21-"
  prefix="${bam%CV_sorted.bam}"
  # Cap the Java heap below the 8 GB requested from SLURM.
  java -Xmx6g -jar "${PICARD_JAR}" MarkDuplicates \
    I="${bam}" \
    O="${prefix}marked_duplicates.bam" \
    M="${prefix}marked_dup_metrics.txt"
done
#
echo "= $(date) job ${SLURM_JOB_NAME:-} done"

so i checked sarah's code - she made notes that she should've aligned to the RefSeq assembly since it is annotated - so I'm going to go back through the assembly and picard duplicate steps with the RefSeq file

NCBI RefSeq assembly: GCF_002022765.2 https://www.ncbi.nlm.nih.gov/genome/annotation_euk/Crassostrea_virginica/100/

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Build a Bowtie2 index for the RefSeq (GCF) assembly and align every
# trimmed paired-end sample to it, writing one SAM file per sample.

# Abort on the first failing command. (-u is left off because module
# scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #

module load miniconda/4.11.0
module load bowtie2/2.4.2

# Use the cores granted by SLURM (falls back to the 4 requested above).
threads="${SLURM_CPUS_PER_TASK:-4}"

#------------build index------------------#

# This creates the index files in the current directory
reference_fasta="GCF_002022765.2_C_virginica-3.0_genomic.fna"
reference_index="reference_index"
bowtie2-build --threads "${threads}" "${reference_fasta}" "${reference_index}"

# Directory containing your input FASTQ files
input_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files"

# Output directory for Bowtie2 results. The RefSeq alignments get their own
# directory so they do not overwrite the GenBank (GCA) alignments, and so
# the SAM-to-BAM step finds them where it looks for them.
output_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/SAM_files"
mkdir -p "${output_dir}"

# Suffixes Trim Galore gives to the validated R1/R2 files.
r1_suffix="_R1_001_val_1.fq.gz"
r2_suffix="_R2_001_val_2.fq.gz"

# Derive the sample list from the R1 files present in the input directory
# instead of maintaining a hand-written array of sample names.
shopt -s nullglob
r1_files=("${input_dir}"/*"${r1_suffix}")
if (( ${#r1_files[@]} == 0 )); then
  echo "No *${r1_suffix} files found in ${input_dir}" >&2
  exit 1
fi

for read1 in "${r1_files[@]}"; do
  sample_name="$(basename "${read1}" "${r1_suffix}")"
  read2="${input_dir}/${sample_name}${r2_suffix}"
  if [[ ! -f "${read2}" ]]; then
    echo "Missing R2 mate for sample ${sample_name}" >&2
    exit 1
  fi

  # Output SAM file with full path to the output directory
  output_sam="${output_dir}/${sample_name}_alignment.sam"

  # Run Bowtie2 for paired-end reads; --reorder keeps the SAM records in
  # input order even though several threads are used.
  bowtie2 --very-sensitive --local -p "${threads}" --reorder \
    -x "${reference_index}" -1 "${read1}" -2 "${read2}" -S "${output_sam}"
done

In [1]:
# Lines of the Slurm log that report an overall alignment rate
alignment_rate_lines = []

# Slurm output file written by the Bowtie2 job
slurm_out_file = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/slurm-13289020.out'

# Scan the log once and keep every line mentioning 'overall alignment rate',
# with surrounding whitespace removed
with open(slurm_out_file, 'r') as log_handle:
    alignment_rate_lines.extend(
        log_line.strip()
        for log_line in log_handle
        if 'overall alignment rate' in log_line
    )

# Show the collected lines, one per row
if alignment_rate_lines:
    print("\n".join(alignment_rate_lines))

88.69% overall alignment rate
87.89% overall alignment rate
88.79% overall alignment rate
88.51% overall alignment rate
88.17% overall alignment rate
88.19% overall alignment rate
88.64% overall alignment rate
88.59% overall alignment rate
88.58% overall alignment rate
89.15% overall alignment rate
88.38% overall alignment rate
88.77% overall alignment rate
88.71% overall alignment rate
88.29% overall alignment rate
88.79% overall alignment rate
87.70% overall alignment rate
88.44% overall alignment rate
88.46% overall alignment rate
87.39% overall alignment rate
87.72% overall alignment rate
88.25% overall alignment rate
88.66% overall alignment rate
88.38% overall alignment rate
88.12% overall alignment rate
88.34% overall alignment rate
87.25% overall alignment rate
88.63% overall alignment rate
87.33% overall alignment rate
88.85% overall alignment rate
88.54% overall alignment rate


overall alignment rates:
88.69% overall alignment rate 87.89% overall alignment rate 88.79% overall alignment rate 88.51% overall alignment rate 88.17% overall alignment rate 88.19% overall alignment rate 88.64% overall alignment rate 88.59% overall alignment rate 88.58% overall alignment rate 89.15% overall alignment rate 88.38% overall alignment rate 88.77% overall alignment rate 88.71% overall alignment rate 88.29% overall alignment rate 88.79% overall alignment rate 87.70% overall alignment rate 88.44% overall alignment rate 88.46% overall alignment rate 87.39% overall alignment rate 87.72% overall alignment rate 88.25% overall alignment rate 88.66% overall alignment rate 88.38% overall alignment rate 88.12% overall alignment rate 88.34% overall alignment rate 87.25% overall alignment rate 88.63% overall alignment rate 87.33% overall alignment rate 88.85% overall alignment rate 88.54% overall alignment rate

converting SAM to BAM files

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Convert every RefSeq-aligned SAM file into a BAM file next to it.

# Abort on the first failing command. (-u is left off because module
# scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #

module load miniconda/4.11.0
module load samtools/1.9
#
# ----------------Your Commands------------------- #
#
sam_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/SAM_files"
threads="${SLURM_CPUS_PER_TASK:-4}"

# SLURM exports SLURM_JOB_* variables; $JOB_NAME, $QUEUE and $JOB_ID are SGE names.
echo "+ $(date) job ${SLURM_JOB_NAME:-} started in ${SLURM_JOB_PARTITION:-} with jobID=${SLURM_JOB_ID:-} on ${HOSTNAME}"
#

# Glob the SAM files directly instead of parsing `ls`, so each file is
# converted exactly once and other files in the directory are ignored.
shopt -s nullglob
sam_files=("${sam_dir}"/*CV_alignment.sam)
if (( ${#sam_files[@]} == 0 )); then
  echo "No *CV_alignment.sam files found in ${sam_dir}" >&2
  exit 1
fi

for sam in "${sam_files[@]}"; do
  samtools view -b -@ "${threads}" -o "${sam%.sam}.bam" "${sam}"
done
#
echo "= $(date) job ${SLURM_JOB_NAME:-} done"

sorting BAM files

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Coordinate-sort every RefSeq alignment BAM file, writing
# <sample>-CV_sorted.bam next to the input.

# Abort on the first failing command. (-u is left off because module
# scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #
#
module load miniconda/4.11.0
module load samtools/1.9
#
# ----------------Your Commands------------------- #
#
bam_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/BAM_files"
threads="${SLURM_CPUS_PER_TASK:-4}"

# SLURM exports SLURM_JOB_* variables; $JOB_NAME, $QUEUE and $JOB_ID are SGE names.
echo "+ $(date) job ${SLURM_JOB_NAME:-} started in ${SLURM_JOB_PARTITION:-} with jobID=${SLURM_JOB_ID:-} on ${HOSTNAME}"
#
# Match only the unsorted alignment files, so re-running the job does not
# pick up the *_sorted.bam files it produced earlier.
shopt -s nullglob
bam_files=("${bam_dir}"/*CV_alignment.bam)
if (( ${#bam_files[@]} == 0 )); then
  echo "No *CV_alignment.bam files found in ${bam_dir}" >&2
  exit 1
fi

for bam in "${bam_files[@]}"; do
  samtools sort -@ "${threads}" -o "${bam%_alignment.bam}_sorted.bam" "${bam}"
done
#
echo "= $(date) job ${SLURM_JOB_NAME:-} done"

In [None]:
now using picard to mark duplicates

In [None]:
#!/bin/bash
# ----------------Parameters---------------------- #
#
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID
#
# Mark duplicate reads in every sorted RefSeq BAM file with Picard
# MarkDuplicates.

# Abort on the first failing command. (-u is left off because module/conda
# activation scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #
#
module load miniconda/4.11.0

# A batch shell is non-interactive, so the conda shell functions have to be
# loaded explicitly before `conda activate` can be used.
source "$(conda info --base)/etc/profile.d/conda.sh"
#
# Create the Java environment only once; -y answers the confirmation prompt.
env_name="my-java-environment"
env_list="$(conda env list)"
if ! grep -Eq "^${env_name}[[:space:]]" <<<"${env_list}"; then
  conda create -y -n "${env_name}" -c conda-forge openjdk=17
fi
conda activate "${env_name}"
#
# ----------------Your Commands------------------- #
#
# Define the path to the Picard JAR file
PICARD_JAR="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_MethylRAD_analysis_2018/picard.jar"
if [[ ! -f "${PICARD_JAR}" ]]; then
  echo "Picard JAR not found: ${PICARD_JAR}" >&2
  exit 1
fi

sorted_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/sorted_BAM_files"

# SLURM exports SLURM_JOB_* variables; $JOB_NAME, $QUEUE and $JOB_ID are SGE names.
echo "+ $(date) job ${SLURM_JOB_NAME:-} started in ${SLURM_JOB_PARTITION:-} with jobID=${SLURM_JOB_ID:-} on ${HOSTNAME}"
#
# Match only the sorted inputs, so the marked_duplicates outputs written to
# the same directory are not processed again on a re-run.
shopt -s nullglob
sorted_bams=("${sorted_dir}"/*CV_sorted.bam)
if (( ${#sorted_bams[@]} == 0 )); then
  echo "No *CV_sorted.bam files found in ${sorted_dir}" >&2
  exit 1
fi

for bam in "${sorted_bams[@]}"; do
  # Sample prefix including the trailing '-', e.g. ".../2018--BBB-WBO-B21-"
  prefix="${bam%CV_sorted.bam}"
  # Cap the Java heap below the 8 GB requested from SLURM.
  java -Xmx6g -jar "${PICARD_JAR}" MarkDuplicates \
    I="${bam}" \
    O="${prefix}marked_duplicates.bam" \
    M="${prefix}marked_dup_metrics.txt"
done
#
echo "= $(date) job ${SLURM_JOB_NAME:-} done"

to use bedtools multicov (reports the count of alignments from multiple position-sorted and indexed BAM files that overlap intervals in a BED file)

first need to index BAM files

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 1:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID
#
# Create a .bai index next to every sorted BAM file.

# Abort on the first failing command. (-u is left off because module
# scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #
#
module load miniconda/4.11.0
module load samtools/1.9
#
# ----------------Your Commands------------------- #
#
sorted_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/sorted_BAM_files"
threads="${SLURM_CPUS_PER_TASK:-4}"
#
# Let the shell expand the glob instead of parsing `ls` output.
shopt -s nullglob
sorted_bams=("${sorted_dir}"/*_sorted.bam)
if (( ${#sorted_bams[@]} == 0 )); then
  echo "No *_sorted.bam files found in ${sorted_dir}" >&2
  exit 1
fi

for bam in "${sorted_bams[@]}"; do
  samtools index -b -@ "${threads}" "${bam}" "${bam}.bai"
done
#
# SLURM exports SLURM_JOB_NAME; $JOB_NAME is only set by SGE.
echo "= $(date) job ${SLURM_JOB_NAME:-} done"

htseq-count to find the number of reads that align to features according to the GTF file from NCBI - output goes into a txt file

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID
#
# Count the reads of the position-sorted BAM files against the features of
# the GTF annotation; the table is written to counts.txt in the directory
# the job runs in.
#
# ----------------Modules------------------------- #
pip install HTseq
# ----------------Your Commands------------------- #

# Inputs: directory with the sorted BAM files and the annotation file
bam_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/sorted_BAM_files"
gtf_file="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/reference_genomes/genomic.gtf"

# -r pos: alignments are sorted by position; -f bam: input files are BAM
htseq-count -r pos -f bam "${bam_dir}"/*_sorted.bam "${gtf_file}" > counts.txt

So - in CV_CE18_pipeline_counts.ipynb, went through and checked how many reads actually have a methyl group somewhere in the sequence - found about 50-70% of reads have that pattern for each sample. So now want to go through and filter for only the reads that have that specific pattern - then realign and go through the above pipeline before doing analysis

In [1]:
from Bio import SeqIO
import gzip
import os

patterns = ["CCAGG", "CCCGG", "CCTGG", "CCGG", "GGCC", "GGACC", "GGGCC", "GGTCC"]

def write_matching_reads(file_path, output_dir):
    """Copy the reads of one gzipped FASTQ file that contain a pattern.

    Every record of ``file_path`` whose sequence contains at least one entry
    of the module-level ``patterns`` list is written, in input order, to a
    gzipped FASTQ file in ``output_dir``. The output name is the input file
    name with its last extension removed, followed by
    ``_matching_reads.fq.gz``.

    Args:
        file_path: Path of the gzip-compressed FASTQ file to read.
        output_dir: Directory the filtered file is written to.
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_dir, stem + "_matching_reads.fq.gz")
    # Text mode ("rt"/"wt") because SeqIO reads and writes FASTQ as text
    with gzip.open(file_path, "rt") as handle, gzip.open(output_file, "wt") as out_handle:
        for record in SeqIO.parse(handle, "fastq"):
            sequence = str(record.seq)
            for pattern in patterns:
                if pattern in sequence:
                    SeqIO.write(record, out_handle, "fastq")
                    break  # one hit is enough; write each record only once

# Input directory with the trimmed reads and directory for the filtered copies
directory_path = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files"
output_directory = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq"

# Filter every file of the input directory whose name ends in "fq.gz"
for filename in os.listdir(directory_path):
    if not filename.endswith("fq.gz"):
        continue
    file_path = os.path.join(directory_path, filename)
    write_matching_reads(file_path, output_directory)
    written_name = f"{os.path.splitext(filename)[0]}_matching_reads.fq.gz"
    print(f"Sample {filename}: Reads matching any pattern written to {written_name} in {output_directory}")

Sample 2018--WBO-BBR-W03-CV_R1_001_val_1.fq.gz: Reads matching any pattern written to 2018--WBO-BBR-W03-CV_R1_001_val_1.fq_matching_reads.fq.gz in /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq
Sample 2018--BBR-BBG-B38-CV_R2_001_val_2.fq.gz: Reads matching any pattern written to 2018--BBR-BBG-B38-CV_R2_001_val_2.fq_matching_reads.fq.gz in /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq
Sample 2018--WBO-BBR-W03-CV_R2_001_val_2.fq.gz: Reads matching any pattern written to 2018--WBO-BBR-W03-CV_R2_001_val_2.fq_matching_reads.fq.gz in /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq
Sample 2018--BBR-BBG-B38-CV_R1_001_val_1.fq.gz: Reads matching any pattern written to 2018--BBR-BBG-B38-CV_R1_001_val_1.fq_matching_reads.fq.gz in /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq
Sample 2018--BBR-BBB-B50-CV_R1_001_val_1.fq.gz: Reads matching any pattern written to 2018--BBR-BBB-B50-CV_R1_001_val_1.fq_matching_read

so the above code worked but didn't account for the sequences being paired-end reads, so bowtie2 alignment gives errors because the paired files end up with uneven lengths (different numbers of reads) - trying to fix the code so that a sequence is kept only if both reads have the sequence pattern.

In [None]:
from Bio import SeqIO
import gzip
import os

patterns = ["CCAGG", "CCCGG", "CCTGG", "CCGG", "GGCC", "GGACC", "GGGCC", "GGTCC"]

def write_matching_reads(file_path1, file_path2, output_dir):
    """Filter a pair of gzipped FASTQ files, keeping the mates in sync.

    The two files are read in parallel, record by record. A pair is written
    only when BOTH mates contain at least one entry of the module-level
    ``patterns`` list, so the two output files always hold the same number
    of reads in the same order. Each output file is named after its input
    file (last extension removed) followed by ``_matching_reads.fq.gz``.

    Args:
        file_path1: Path of the gzip-compressed forward (R1) FASTQ file.
        file_path2: Path of the gzip-compressed reverse (R2) FASTQ file.
        output_dir: Directory the two filtered files are written to.

    Raises:
        ValueError: If the two input files contain different numbers of
            reads, i.e. they are not a properly paired set.
    """
    # Local import so the function is self-contained in its notebook cell
    from itertools import zip_longest

    def has_pattern(record):
        # Convert the sequence once instead of once per pattern
        sequence = str(record.seq)
        return any(pattern in sequence for pattern in patterns)

    output_file1 = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(file_path1))[0]}_matching_reads.fq.gz")
    output_file2 = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(file_path2))[0]}_matching_reads.fq.gz")

    with gzip.open(file_path1, "rt") as handle1, gzip.open(file_path2, "rt") as handle2:
        with gzip.open(output_file1, "wt") as out_handle1, gzip.open(output_file2, "wt") as out_handle2:
            # zip_longest (unlike zip) exposes a length mismatch instead of
            # silently dropping the surplus reads of the longer file
            paired_records = zip_longest(SeqIO.parse(handle1, "fastq"), SeqIO.parse(handle2, "fastq"))
            for record1, record2 in paired_records:
                if record1 is None or record2 is None:
                    raise ValueError(
                        f"Paired files have different numbers of reads: {file_path1} vs {file_path2}"
                    )
                if has_pattern(record1) and has_pattern(record2):
                    SeqIO.write(record1, out_handle1, "fastq")
                    SeqIO.write(record2, out_handle2, "fastq")

directory_path = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files"
output_directory = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq"

# File name endings of the forward and reverse reads
forward_suffix = "_R1_001_val_1.fq.gz"
reverse_suffix = "_R2_001_val_2.fq.gz"

# Make sure the output directory exists before any file is written to it
os.makedirs(output_directory, exist_ok=True)

# All forward-read files of the directory, in a reproducible order. The
# filter uses the full R1 suffix so that the R2 name derived below can never
# be identical to the R1 name.
file_list = sorted(filename for filename in os.listdir(directory_path) if filename.endswith(forward_suffix))

# Process paired reads
for forward_filename in file_list:
    base_name = os.path.splitext(forward_filename)[0]  # Remove the file extension
    # Swap only the trailing suffix to obtain the mate's file name
    reverse_filename = forward_filename[:-len(forward_suffix)] + reverse_suffix

    forward_file_path = os.path.join(directory_path, forward_filename)
    reverse_file_path = os.path.join(directory_path, reverse_filename)

    if os.path.exists(reverse_file_path):
        write_matching_reads(forward_file_path, reverse_file_path, output_directory)
        print(f"Sample {base_name}: Paired-end reads matching any pattern written to {base_name}_matching_reads.fq.gz in {output_directory}")
    else:
        print(f"Error: Reverse file not found for {forward_filename}")


Sample 2018--WBO-BBR-W03-CV_R1_001_val_1.fq: Paired-end reads matching any pattern written to 2018--WBO-BBR-W03-CV_R1_001_val_1.fq_matching_reads.fq.gz in /project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq


bowtie2 alignment with methyl reads

In [None]:
#!/bin/bash
#SBATCH -c 4  # Number of Cores per Task
#SBATCH --mem=8192  # Requested Memory
#SBATCH -p cpu-long  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o slurm-%j.out  # %j = job ID

# Build a Bowtie2 index for the RefSeq assembly and align the pattern-filtered
# paired-end reads to it, writing one SAM file per sample.

# Abort on the first failing command. (-u is left off because module
# scripts may reference unset variables.)
set -eo pipefail

# ----------------Modules------------------------- #

module load miniconda/4.11.0
# A module is addressed as <name>/<version>; the bare version string cannot
# be loaded. NOTE(review): confirm the exact name with `module avail bowtie2`.
module load bowtie2/2.4.2+py3.8.12

# Use the cores granted by SLURM (falls back to the 4 requested above).
threads="${SLURM_CPUS_PER_TASK:-4}"

#------------build index------------------#

# This creates the index files in the current directory
reference_fasta="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/reference_genomes/GCF_002022765.2_C_virginica-3.0_genomic.fna"
reference_index="reference_index"
bowtie2-build --threads "${threads}" "${reference_fasta}" "${reference_index}"

# Directory containing your input FASTQ files
input_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq"

# Output directory for Bowtie2 results
output_dir="/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/pipeline_work_seq/bowtie2"
mkdir -p "${output_dir}"

# File name endings written by the pattern-filtering step. The R2 ending
# must contain ".fq_matching_reads" (not ".q_matching_reads").
r1_suffix="_R1_001_val_1.fq_matching_reads.fq.gz"
r2_suffix="_R2_001_val_2.fq_matching_reads.fq.gz"

# Derive the sample list from the R1 files present in the input directory
# instead of maintaining a hand-written array of sample names.
shopt -s nullglob
r1_files=("${input_dir}"/*"${r1_suffix}")
if (( ${#r1_files[@]} == 0 )); then
  echo "No *${r1_suffix} files found in ${input_dir}" >&2
  exit 1
fi

for read1 in "${r1_files[@]}"; do
  sample_name="$(basename "${read1}" "${r1_suffix}")"
  read2="${input_dir}/${sample_name}${r2_suffix}"
  if [[ ! -f "${read2}" ]]; then
    echo "Missing R2 mate for sample ${sample_name}" >&2
    exit 1
  fi

  # Output SAM file with full path to the output directory
  output_sam="${output_dir}/${sample_name}_alignment.sam"

  # Run Bowtie2 for paired-end reads; --reorder keeps the SAM records in
  # input order even though several threads are used.
  bowtie2 --very-sensitive --local -p "${threads}" --reorder \
    -x "${reference_index}" -1 "${read1}" -2 "${read2}" -S "${output_sam}"
done