## Pipeline Counts for methylRAD data

#### counts of raw reads and trimmed reads

In [1]:
# importing package to move through directories
import os
# to change directory
os.chdir("/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files/")

In [None]:
import gzip
from Bio import SeqIO

# read directories
raw_reads_dir = "/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/methyl_raw/"
trimmed_reads_dir = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files/"

# Collect unique sample names from raw reads directory
sample_names = set()
for filename in os.listdir(raw_reads_dir):
    if filename.endswith(".fastq.gz"):
        sample_name = filename.split("_")[0]
        sample_names.add(sample_name)

counts_df = {}
# Count reads for each unique sample
for sample in sample_names:
    trim_filename = f"{trimmed_reads_dir}{sample}_R1_001_val_1.fq.gz"
    raw_filename = f"{raw_reads_dir}{sample}_R1_001.fastq.gz"
    
    try:
        with gzip.open(raw_filename, 'rt') as raw:
            raw_count = sum(1 for _ in raw)
        
        with gzip.open(trim_filename, 'rt') as trim:
            trim_count = sum(1 for _ in trim)
        
        counts_df[sample] = trim_count / raw_count
        print(f"Sample: {sample}, Raw Reads: {raw_count}, Trimmed Reads: {trim_count}, Ratio: {trim_count/raw_count}")
    except FileNotFoundError:
        print(f"File not found for sample: {sample}")
        
counts_df

Sample: 2018--BBO-WBO-B16-CV, Raw Reads: 59769484, Trimmed Reads: 9603844, Ratio: 0.1606813938698216
Sample: 2018--BPO-BPO-O16-CV, Raw Reads: 17002472, Trimmed Reads: 936764, Ratio: 0.05509575313519117
Sample: 2018--WBV-WBO-W23-CV, Raw Reads: 20244836, Trimmed Reads: 2744600, Ratio: 0.13557037458836416
Sample: 2018--BPR-BPG-O38-CV, Raw Reads: 38746892, Trimmed Reads: 4390176, Ratio: 0.11330395222409065
Sample: 2018--BBO-WBV-B64-CV, Raw Reads: 49958272, Trimmed Reads: 8379816, Ratio: 0.167736306011545
Sample: 2018--WBB-WBV-W69-CV, Raw Reads: 41913360, Trimmed Reads: 8547732, Ratio: 0.20393812378678303
Sample: 2018--WPO-BPY-G28-CV, Raw Reads: 26339844, Trimmed Reads: 1346688, Ratio: 0.05112740986620878
Sample: 2018--WPB-BPG-G45-CV, Raw Reads: 38780388, Trimmed Reads: 4035068, Ratio: 0.10404919104986779
Sample: 2018--BBB-WBV-B70-CV, Raw Reads: 47021020, Trimmed Reads: 7920020, Ratio: 0.16843573363572292
Sample: 2018--BBB-WBO-B21-CV, Raw Reads: 60015908, Trimmed Reads: 9142520, Ratio: 0.15

#### finding number of paired reads that aligned concordantly with bowtie2 alignment and the refseq 

In [12]:
import os

# Directory containing Bowtie2 output files for your samples
input_dir = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/SAM_files/'

# List of sample names (corresponding to your input files)
sample_names=( "2018--BBB-WBO-B21-CV",
"2018--BBB-WBV-B70-CV",
"2018--BBO-BBO-B16-CV",
"2018--BBO-BBY-B27-CV",
"2018--BBO-WBO-B16-CV",
"2018--BBO-WBV-B64-CV",
"2018--BBR-BBB-B50-CV",
"2018--BBR-BBG-B38-CV",
"2018--BBR-BBY-B26-CV",
"2018--BBY-WBG-B42-CV",
"2018--BPO-BPO-O16-CV",
"2018--BPR-BPG-O38-CV",
"2018--BPR-BPR-O02-CV",
"2018--BPY-BPG-O42-CV",
"2018--BPY-BPY-O29-CV",
"2018--WBB-WBV-W69-CV",
"2018--WBG-BBB-W56-CV",
"2018--WBG-WBG-W44-CV",
"2018--WBO-BBR-W03-CV",
"2018--WBO-WBV-W64-CV",
"2018--WBR-BBY-W25-CV",
"2018--WBV-WBO-W23-CV",
"2018--WBV-WBR-W12-CV",
"2018--WBY-BBV-W65-CV",
"2018--WBY-BBY-W30-CV",
"2018--WPB-BPG-G45-CV",
"2018--WPO-BPO-G16-CV",
"2018--WPO-BPY-G28-CV",
"2018--WPR-BPY-G25-CV",
"2018--WPV-BPR-G11-CV" )

# Dictionary to store concordant alignment counts for each sample
concordant_counts = {}

# Iterate through the list of samples
for sample in sample_names:
    # Construct the Bowtie2 output file path for the sample
    bowtie2_output_file = os.path.join(input_dir, f'{sample}_alignment.sam')  # Assuming you have SAM output files

    # Initialize the count for this sample
    concordant_counts[sample] = 0

    # Open and read the Bowtie2 output file
    with open(bowtie2_output_file, 'r') as file:
        for line in file:
            if line.startswith('@'):
                continue  # Skip header lines
            fields = line.split('\t')
            if fields[1] == '99' or fields[1] == '147':
                # Check for the YT flag (99 or 147 indicates concordant alignments)
                if 'YT:Z:CP' in line:
                    concordant_counts[sample] += 1

# Print the concordant alignment counts for each sample
for sample, count in concordant_counts.items():
    print(f'Sample: {sample}, Concordant Alignments: {count}')


Sample: 2018--BBB-WBO-B21-CV, Concordant Alignments: 1987316
Sample: 2018--BBB-WBV-B70-CV, Concordant Alignments: 1702318
Sample: 2018--BBO-BBO-B16-CV, Concordant Alignments: 1504682
Sample: 2018--BBO-BBY-B27-CV, Concordant Alignments: 963810
Sample: 2018--BBO-WBO-B16-CV, Concordant Alignments: 2076074
Sample: 2018--BBO-WBV-B64-CV, Concordant Alignments: 1812804
Sample: 2018--BBR-BBB-B50-CV, Concordant Alignments: 859402
Sample: 2018--BBR-BBG-B38-CV, Concordant Alignments: 1538688
Sample: 2018--BBR-BBY-B26-CV, Concordant Alignments: 1451844
Sample: 2018--BBY-WBG-B42-CV, Concordant Alignments: 492580
Sample: 2018--BPO-BPO-O16-CV, Concordant Alignments: 202704
Sample: 2018--BPR-BPG-O38-CV, Concordant Alignments: 958966
Sample: 2018--BPR-BPR-O02-CV, Concordant Alignments: 231214
Sample: 2018--BPY-BPG-O42-CV, Concordant Alignments: 123478
Sample: 2018--BPY-BPY-O29-CV, Concordant Alignments: 358770
Sample: 2018--WBB-WBV-W69-CV, Concordant Alignments: 1841576
Sample: 2018--WBG-BBB-W56-CV, Co

#### marked duplicates counts from picard tools
returns paired reads, unpaired reads, and read pair optical duplicates

In [87]:
# base code for one file
metrics_file = pd.read_csv("/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/mark_dups/mark_dups_metrics/2018--BBB-WBO-B21-marked_dup_metrics.txt")
metrics_file.head(10)

metrics_file.index == 5
metrics_file[metrics_file.index == 5]

header_row_values = metrics_file['## htsjdk.samtools.metrics.StringHeader'].str.split('\t', expand=True)
header_row_values.head(6)


header_row_values.columns = header_row_values.iloc[4]
df = pd.DataFrame(header_row_values.iloc[5, :])
df = df.transpose().reset_index()
rpe = df['READ_PAIRS_EXAMINED'][0]
ure = df['UNPAIRED_READS_EXAMINED'][0]
rpod = df['READ_PAIR_OPTICAL_DUPLICATES'][0]

print('rpe:', rpe, 'ure:', ure, 'rpod:', rpod)

rpe: 2020021 ure: 14421 rpod: 9850


In [89]:
# same code but adapted to loop through all files
import os
import re

# Set the path to the directory containing your Picard metrics files
input_directory = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/mark_dups/mark_dups_metrics"

# List of sample names (corresponding to your input files)
sample_names=['2018--BBB-WBO-B21', '2018--BBB-WBV-B70', 
              '2018--BBO-BBO-B16', '2018--BBO-BBY-B27', 
              '2018--BBO-WBO-B16', '2018--BBO-WBV-B64', 
              '2018--BBR-BBB-B50', '2018--BBR-BBG-B38', 
              '2018--BBR-BBY-B26', '2018--BBY-WBG-B42', 
              '2018--BPO-BPO-O16', '2018--BPR-BPG-O38', 
              '2018--BPR-BPR-O02', '2018--BPY-BPG-O42', 
              '2018--BPY-BPY-O29', '2018--WBB-WBV-W69', 
              '2018--WBG-BBB-W56', '2018--WBG-WBG-W44', 
              '2018--WBO-BBR-W03', '2018--WBO-WBV-W64', 
              '2018--WBR-BBY-W25', '2018--WBV-WBO-W23', 
              '2018--WBV-WBR-W12', '2018--WBY-BBV-W65', 
              '2018--WBY-BBY-W30', '2018--WPB-BPG-G45', 
              '2018--WPO-BPO-G16', '2018--WPO-BPY-G28', 
              '2018--WPR-BPY-G25', '2018--WPV-BPR-G11']


# Initialize variables to store metrics values
upe = 0
rpe = 0
rpod = 0

# Loop through each sample name
for sample in sample_names:
    # Construct the metrics file path for the current sample
    file = f"{sample}-marked_dup_metrics.txt"
    metrics_file_path = os.path.join(input_directory, file)

    # Read the metrics file for the current sample
    metrics_file = pd.read_csv(metrics_file_path, sep='\t', skiprows=6, nrows=1)

    # Extract relevant metrics
    rpe = metrics_file['READ_PAIRS_EXAMINED'].values[0]
    ure = metrics_file['UNPAIRED_READS_EXAMINED'].values[0]
    rpod = metrics_file['READ_PAIR_OPTICAL_DUPLICATES'].values[0]

    # Print or use the information as needed for each sample
    print(f"Sample: {sample}, Paired Reads: {rpe}, Unpaired Reads: {ure}, Optical Duplicates: {rpod}")


Sample: 2018--BBB-WBO-B21, Paired Reads: 2020021, Unpaired Reads: 14421, Optical Duplicates: 9850
Sample: 2018--BBB-WBV-B70, Paired Reads: 1733903, Unpaired Reads: 12637, Optical Duplicates: 9107
Sample: 2018--BBO-BBO-B16, Paired Reads: 1527667, Unpaired Reads: 11081, Optical Duplicates: 4699
Sample: 2018--BBO-BBY-B27, Paired Reads: 978519, Unpaired Reads: 6871, Optical Duplicates: 3552
Sample: 2018--BBO-WBO-B16, Paired Reads: 2109689, Unpaired Reads: 14556, Optical Duplicates: 10334
Sample: 2018--BBO-WBV-B64, Paired Reads: 1840935, Unpaired Reads: 13313, Optical Duplicates: 8994
Sample: 2018--BBR-BBB-B50, Paired Reads: 874456, Unpaired Reads: 6878, Optical Duplicates: 2652
Sample: 2018--BBR-BBG-B38, Paired Reads: 1562707, Unpaired Reads: 11822, Optical Duplicates: 5177
Sample: 2018--BBR-BBY-B26, Paired Reads: 1476603, Unpaired Reads: 11046, Optical Duplicates: 4909
Sample: 2018--BBY-WBG-B42, Paired Reads: 501016, Unpaired Reads: 4711, Optical Duplicates: 1523
Sample: 2018--BPO-BPO-O16

### checking if our reads actually have a methyl group in the middle
enzyme: FspEI https://www.neb.com/en-us/products/r0662-fspei#Product%20Information

At fully methylated CpG sites: 
5´. . . C mC  G G . . . 3´
3´. . . G  G mC C . . . 5´

or CHG sites: 
5´. . . C mC H  G G . . . 3´
3´. . . G  G D mC C . . . 5´

H = A or C or T (not G)
D = A or G or T (not C) 

so need to check how many reads have that fully methylated CpG site pattern in the middle of CHG site 

In [2]:
# for forward reads
from Bio import SeqIO
import gzip

patterns = ["CCAGG","CCCGG","CCTGG", "CCGG", "GGCC", "GGACC","GGGCC","GGTCC"]

def count_sequences_with_patterns(file_path):
    pattern_counts = {pattern: 0 for pattern in patterns}
    with gzip.open(file_path, "rt") as handle:  # Use gzip.open for compressed files
        for record in SeqIO.parse(handle, "fastq"):
            for pattern in patterns:
                if pattern in str(record.seq):
                    pattern_counts[pattern] += 1
    return pattern_counts

directory_path = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/working_seq"

import os

for filename in os.listdir(directory_path):
    if filename.endswith("fq.gz"):
        file_path = os.path.join(directory_path, filename)
        pattern_counts = count_sequences_with_patterns(file_path)
        print(f"Sample {filename}: {pattern_counts}")


Sample 2018--WBO-BBR-W03-CV_R1_001_val_1.fq.gz: {'CCAGG': 8142, 'CCCGG': 24991, 'CCTGG': 8316, 'CCGG': 81794, 'GGCC': 36715, 'GGACC': 12792, 'GGGCC': 10826, 'GGTCC': 13212}
Sample 2018--BBR-BBG-B38-CV_R2_001_val_2.fq.gz: {'CCAGG': 92926, 'CCCGG': 297457, 'CCTGG': 92346, 'CCGG': 987598, 'GGCC': 434084, 'GGACC': 140980, 'GGGCC': 122067, 'GGTCC': 141227}
Sample 2018--WBO-BBR-W03-CV_R2_001_val_2.fq.gz: {'CCAGG': 8373, 'CCCGG': 24417, 'CCTGG': 8166, 'CCGG': 81054, 'GGCC': 36953, 'GGACC': 13200, 'GGGCC': 10952, 'GGTCC': 12668}
Sample 2018--BBR-BBG-B38-CV_R1_001_val_1.fq.gz: {'CCAGG': 91994, 'CCCGG': 294874, 'CCTGG': 92598, 'CCGG': 993448, 'GGCC': 432353, 'GGACC': 142229, 'GGGCC': 124418, 'GGTCC': 141133}
Sample 2018--BBR-BBB-B50-CV_R1_001_val_1.fq.gz: {'CCAGG': 51027, 'CCCGG': 147899, 'CCTGG': 51680, 'CCGG': 494602, 'GGCC': 232072, 'GGACC': 75130, 'GGGCC': 64545, 'GGTCC': 75747}
Sample 2018--WBV-WBO-W23-CV_R1_001_val_1.fq.gz: {'CCAGG': 33243, 'CCCGG': 98897, 'CCTGG': 33960, 'CCGG': 333108, '