# In [2]:
from Bio import SeqIO
import argparse
import sys
import os

def remove_redundant_sequences(input_fastq, output_fastq):
    """Copy a FASTQ file, keeping only the first record of each distinct sequence.

    Records are read from ``input_fastq`` in file order. A record is written
    to ``output_fastq`` the first time its sequence string is seen; later
    records with the same sequence are dropped, whatever their identifier or
    quality values.

    Args:
        input_fastq: Path of the FASTQ file to read.
        output_fastq: Path of the FASTQ file to create (overwritten if it
            already exists).

    Exits the process with status 1 (after printing an error message) when
    the input file does not exist, or when the input and output paths refer
    to the same file.
    """
    if not os.path.isfile(input_fastq):
        print(f"Error: The input file '{input_fastq}' does not exist.")
        sys.exit(1)

    # Opening the output in "w" mode truncates it immediately, so writing onto
    # the input would wipe the reads before a single record is parsed.
    # samefile() requires both paths to exist, hence the exists() check first.
    if os.path.exists(output_fastq) and os.path.samefile(input_fastq, output_fastq):
        print("Error: The input and output files must be different.")
        sys.exit(1)

    seen_sequences = set()
    with open(output_fastq, "w") as output_handle:
        for record in SeqIO.parse(input_fastq, "fastq"):
            sequence = str(record.seq)
            if sequence in seen_sequences:
                continue
            seen_sequences.add(sequence)
            SeqIO.write(record, output_handle, "fastq")

    print(f"Redundant sequences removed. Output saved to {output_fastq}")

def count_sequences(fastq_file):
    """Return the number of records parsed from the FASTQ file ``fastq_file``."""
    total = 0
    for _record in SeqIO.parse(fastq_file, "fastq"):
        total += 1
    return total

def main(argv=None):
    """Command-line entry point: de-duplicate a FASTQ file and print statistics.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``. Passing
            the list explicitly, e.g.
            ``main(["-i", "reads.fastq", "-o", "unique.fastq"])``, lets the
            tool be called from a notebook or another script, where
            ``sys.argv`` carries the host's own options (such as the Jupyter
            kernel's ``-f`` argument) that argparse would reject.

    Exits the process with status 1 (after printing an error message) when
    the input or output option is missing, or when the input file name does
    not end in ``.fastq`` or ``.fq``.
    """
    parser = argparse.ArgumentParser(description="Remove redundant sequences from a FASTQ file.")
    parser.add_argument("-i", "--input", help="Input FASTQ file")
    parser.add_argument("-o", "--output", help="Output FASTQ file with unique sequences")
    args = parser.parse_args(argv)

    # Both paths are mandatory; they are checked by hand so the tool can
    # print its own messages and exit with status 1.
    if not args.input:
        print("Error: No input file specified. Use the -i option to specify the input FASTQ file.")
        sys.exit(1)
    if not args.output:
        print("Error: No output file specified. Use the -o option to specify the output FASTQ file.")
        sys.exit(1)

    # Only files named like FASTQ are accepted as input.
    if not args.input.endswith((".fastq", ".fq")):
        print("Error: The input file must have a .fastq or .fq extension.")
        sys.exit(1)

    remove_redundant_sequences(args.input, args.output)

    # Re-read both files to obtain the record counts for the report.
    count_input = count_sequences(args.input)
    count_output = count_sequences(args.output)

    # An empty input has no reads, so the ratio is undefined; report 0 rather
    # than raising ZeroDivisionError.
    percent_retained = (count_output / count_input) * 100 if count_input else 0.0

    print("Unique sequences stats:")
    print(f"Total number of input sequences: {count_input}")
    print(f"Total number of output sequences: {count_output}")
    print(f"Percent of unique sequences: {percent_retained:.2f}%")

# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Output captured when this code was run inside a Jupyter notebook:
#
# usage: ipykernel_launcher.py [-h] [-i INPUT] [-o OUTPUT]
# ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\iwin\AppData\Roaming\jupyter\runtime\kernel-25f6e5cd-59fb-4c39-8899-0ec598dcc971.json
#
#
# SystemExit: 2