## Pipeline Counts for methylRAD data

#### counts of raw reads and trimmed reads

In [1]:
# importing package to move through directories
import os
# to change directory
os.chdir("/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files/")

In [None]:
import gzip
from Bio import SeqIO

# read directories
raw_reads_dir = "/project/pi_sarah_gignouxwolfsohn_uml_edu/Raw_sequences/methyl_raw/"
trimmed_reads_dir = "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files/"

# Collect unique sample names from raw reads directory
sample_names = set()
for filename in os.listdir(raw_reads_dir):
    if filename.endswith(".fastq.gz"):
        sample_name = filename.split("_")[0]
        sample_names.add(sample_name)

counts_df = {}
# Count reads for each unique sample
for sample in sample_names:
    trim_filename = f"{trimmed_reads_dir}{sample}_R1_001_val_1.fq.gz"
    raw_filename = f"{raw_reads_dir}{sample}_R1_001.fastq.gz"
    
    try:
        with gzip.open(raw_filename, 'rt') as raw:
            raw_count = sum(1 for _ in raw)
        
        with gzip.open(trim_filename, 'rt') as trim:
            trim_count = sum(1 for _ in trim)
        
        counts_df[sample] = trim_count / raw_count
        print(f"Sample: {sample}, Raw Reads: {raw_count}, Trimmed Reads: {trim_count}, Ratio: {trim_count/raw_count}")
    except FileNotFoundError:
        print(f"File not found for sample: {sample}")
        
counts_df

Sample: 2018--BBO-WBO-B16-CV, Raw Reads: 59769484, Trimmed Reads: 9603844, Ratio: 0.1606813938698216
Sample: 2018--BPO-BPO-O16-CV, Raw Reads: 17002472, Trimmed Reads: 936764, Ratio: 0.05509575313519117
Sample: 2018--WBV-WBO-W23-CV, Raw Reads: 20244836, Trimmed Reads: 2744600, Ratio: 0.13557037458836416
Sample: 2018--BPR-BPG-O38-CV, Raw Reads: 38746892, Trimmed Reads: 4390176, Ratio: 0.11330395222409065
Sample: 2018--BBO-WBV-B64-CV, Raw Reads: 49958272, Trimmed Reads: 8379816, Ratio: 0.167736306011545
Sample: 2018--WBB-WBV-W69-CV, Raw Reads: 41913360, Trimmed Reads: 8547732, Ratio: 0.20393812378678303
Sample: 2018--WPO-BPY-G28-CV, Raw Reads: 26339844, Trimmed Reads: 1346688, Ratio: 0.05112740986620878
Sample: 2018--WPB-BPG-G45-CV, Raw Reads: 38780388, Trimmed Reads: 4035068, Ratio: 0.10404919104986779
Sample: 2018--BBB-WBV-B70-CV, Raw Reads: 47021020, Trimmed Reads: 7920020, Ratio: 0.16843573363572292
Sample: 2018--BBB-WBO-B21-CV, Raw Reads: 60015908, Trimmed Reads: 9142520, Ratio: 0.15

#### finding number of paired reads that aligned concordantly with bowtie2 alignment and the refseq 

In [12]:
import os

# Directory containing Bowtie2 output files for your samples
input_dir = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/SAM_files/'

# List of sample names (corresponding to your input files)
sample_names=( "2018--BBB-WBO-B21-CV",
"2018--BBB-WBV-B70-CV",
"2018--BBO-BBO-B16-CV",
"2018--BBO-BBY-B27-CV",
"2018--BBO-WBO-B16-CV",
"2018--BBO-WBV-B64-CV",
"2018--BBR-BBB-B50-CV",
"2018--BBR-BBG-B38-CV",
"2018--BBR-BBY-B26-CV",
"2018--BBY-WBG-B42-CV",
"2018--BPO-BPO-O16-CV",
"2018--BPR-BPG-O38-CV",
"2018--BPR-BPR-O02-CV",
"2018--BPY-BPG-O42-CV",
"2018--BPY-BPY-O29-CV",
"2018--WBB-WBV-W69-CV",
"2018--WBG-BBB-W56-CV",
"2018--WBG-WBG-W44-CV",
"2018--WBO-BBR-W03-CV",
"2018--WBO-WBV-W64-CV",
"2018--WBR-BBY-W25-CV",
"2018--WBV-WBO-W23-CV",
"2018--WBV-WBR-W12-CV",
"2018--WBY-BBV-W65-CV",
"2018--WBY-BBY-W30-CV",
"2018--WPB-BPG-G45-CV",
"2018--WPO-BPO-G16-CV",
"2018--WPO-BPY-G28-CV",
"2018--WPR-BPY-G25-CV",
"2018--WPV-BPR-G11-CV" )

# Dictionary to store concordant alignment counts for each sample
concordant_counts = {}

# Iterate through the list of samples
for sample in sample_names:
    # Construct the Bowtie2 output file path for the sample
    bowtie2_output_file = os.path.join(input_dir, f'{sample}_alignment.sam')  # Assuming you have SAM output files

    # Initialize the count for this sample
    concordant_counts[sample] = 0

    # Open and read the Bowtie2 output file
    with open(bowtie2_output_file, 'r') as file:
        for line in file:
            if line.startswith('@'):
                continue  # Skip header lines
            fields = line.split('\t')
            if fields[1] == '99' or fields[1] == '147':
                # Check for the YT flag (99 or 147 indicates concordant alignments)
                if 'YT:Z:CP' in line:
                    concordant_counts[sample] += 1

# Print the concordant alignment counts for each sample
for sample, count in concordant_counts.items():
    print(f'Sample: {sample}, Concordant Alignments: {count}')


Sample: 2018--BBB-WBO-B21-CV, Concordant Alignments: 1987316
Sample: 2018--BBB-WBV-B70-CV, Concordant Alignments: 1702318
Sample: 2018--BBO-BBO-B16-CV, Concordant Alignments: 1504682
Sample: 2018--BBO-BBY-B27-CV, Concordant Alignments: 963810
Sample: 2018--BBO-WBO-B16-CV, Concordant Alignments: 2076074
Sample: 2018--BBO-WBV-B64-CV, Concordant Alignments: 1812804
Sample: 2018--BBR-BBB-B50-CV, Concordant Alignments: 859402
Sample: 2018--BBR-BBG-B38-CV, Concordant Alignments: 1538688
Sample: 2018--BBR-BBY-B26-CV, Concordant Alignments: 1451844
Sample: 2018--BBY-WBG-B42-CV, Concordant Alignments: 492580
Sample: 2018--BPO-BPO-O16-CV, Concordant Alignments: 202704
Sample: 2018--BPR-BPG-O38-CV, Concordant Alignments: 958966
Sample: 2018--BPR-BPR-O02-CV, Concordant Alignments: 231214
Sample: 2018--BPY-BPG-O42-CV, Concordant Alignments: 123478
Sample: 2018--BPY-BPY-O29-CV, Concordant Alignments: 358770
Sample: 2018--WBB-WBV-W69-CV, Concordant Alignments: 1841576
Sample: 2018--WBG-BBB-W56-CV, Co

#### marked duplicates counts from picard tools
still not working - returning values of 0 even though the files have numbers

In [20]:
import os

# Directory containing Picard Metrics files for your samples
input_dir = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/assembly/bowtie2_refseq/mark_dups/mark_dups_metrics/'

# List of sample names (corresponding to your input files)
sample_names=( "2018--BBB-WBO-B21",
"2018--BBB-WBV-B70",
"2018--BBO-BBO-B16",
"2018--BBO-BBY-B27",
"2018--BBO-WBO-B16",
"2018--BBO-WBV-B64",
"2018--BBR-BBB-B50",
"2018--BBR-BBG-B38",
"2018--BBR-BBY-B26",
"2018--BBY-WBG-B42",
"2018--BPO-BPO-O16",
"2018--BPR-BPG-O38",
"2018--BPR-BPR-O02",
"2018--BPY-BPG-O42",
"2018--BPY-BPY-O29",
"2018--WBB-WBV-W69",
"2018--WBG-BBB-W56",
"2018--WBG-WBG-W44",
"2018--WBO-BBR-W03",
"2018--WBO-WBV-W64",
"2018--WBR-BBY-W25",
"2018--WBV-WBO-W23",
"2018--WBV-WBR-W12",
"2018--WBY-BBV-W65",
"2018--WBY-BBY-W30",
"2018--WPB-BPG-G45",
"2018--WPO-BPO-G16",
"2018--WPO-BPY-G28",
"2018--WPR-BPY-G25",
"2018--WPV-BPR-G11" )

# Dictionary to store READ_PAIR_OPTICAL_DUPLICATES counts for each sample
optical_duplicate_counts = {}

# Iterate through the list of samples
for sample in sample_names:
    # Construct the Picard Metrics file path for the sample
    picard_metrics_file = os.path.join(input_dir, f'{sample}-marked_dup_metrics.txt')

    # Initialize the count for this sample
    optical_duplicate_counts[sample] = 0

    # Open and read the Picard Metrics file
    with open(picard_metrics_file, 'r') as file:
        for line in file:
            if line.startswith('READ_PAIR_OPTICAL_DUPLICATES'):
                try:
                    # Extract the value after the last tab and convert to integer
                    optical_duplicate_counts[sample] = int(line.split('\t')[-3].strip())
                except ValueError:
                    print(f"Skipping non-numeric value for {sample}")
                    break  # Stop reading further lines for this sample

# Print the optical duplicate counts for each sample
for sample, count in optical_duplicate_counts.items():
    print(f'Sample: {sample}, READ_PAIR_OPTICAL_DUPLICATES Counts: {count}')

Sample: 2018--BBB-WBO-B21, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBB-WBV-B70, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBO-BBO-B16, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBO-BBY-B27, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBO-WBO-B16, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBO-WBV-B64, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBR-BBB-B50, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBR-BBG-B38, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBR-BBY-B26, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BBY-WBG-B42, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BPO-BPO-O16, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BPR-BPG-O38, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BPR-BPR-O02, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BPY-BPG-O42, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 2018--BPY-BPY-O29, READ_PAIR_OPTICAL_DUPLICATES Counts: 0
Sample: 20

### checking if our reads actually have a methyl group in the middle
enzyme: FspEI https://www.neb.com/en-us/products/r0662-fspei#Product%20Information

At fully methylated CpG sites: 
5´. . . C mC  G G . . . 3´
3´. . . G  G mC C . . . 5´

or CHG sites: 
5´. . . C mC H  G G . . . 3´
3´. . . G  G D mC C . . . 5´

H = A or C or T (not G)
D = A or G or T (not C) 

so need to check how many reads have that fully methylated CpG site pattern in the middle of CHG site 

In [10]:
import zipfile
import os

# Directory containing your zipped sample files
zipped_samples_dir = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/trimmed/filtered_auto_trim_sequences/trim_files'

# Specific sequence you want to search for
target_sequence = ['CCAGG','CCCGG','CCTGG']

# Dictionary to store counts for each sample
sample_counts = {}

# Loop through the zipped sample files
for zip_file_name in os.listdir(zipped_samples_dir):
    if zip_file_name.endswith('R1_001_val_1.fq.gz'):
        sample_name = os.path.splitext(zip_file_name)[0]
        sample_counts[sample_name] = 0  # Initialize count to 0

        with zipfile.ZipFile(os.path.join(zipped_samples_dir, zip_file_name), 'r') as zip_file:
            for file_name in zip_file.namelist():
                with zip_file.open(file_name) as sample_file:
                    for line in sample_file:
                        line = line.decode('utf-8')  # Decode the line if needed
                        if target_sequence in line:
                            sample_counts[sample_name] += 1

# Print the counts for each sample
for sample_name, count in sample_counts.items():
    print(f"Sample: {sample_name}, Count: {count}")



BadZipFile: File is not a zip file

In [None]:
import re


list_yes = []

s = "ACGTATCGAAGACGT"
forward = ['CCAGG','CCCGG','CCTGG']
reverse = ['GGACC','GGGCC','GGTCC']

result = re.findall(r'ATCGAAG', forward)

if result:
    list_yes.append(i)
else:
    print("The pattern 'ATCGAAG' is not in the middle of the reads.")