In [5]:
import os

def process_sequence_file(input_file, output_file, length=100):
    """Write the leading ``length`` characters of each sufficiently long record.

    ``input_file`` is read as FASTA-style text: a line starting with ``>``
    begins a new record, and every other line (stripped of surrounding
    whitespace) is appended to the current record's sequence. For each record
    whose sequence has at least ``length`` characters, the first ``length``
    characters are written to ``output_file`` on a line of their own. Shorter
    records are dropped. Identifier lines are not copied to the output.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to create (overwritten if it exists).
        length: Number of leading characters to keep per record, and also the
            minimum sequence length a record needs to be kept. Defaults to
            100, which reproduces the original fixed behaviour.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError(f"length must be a positive integer, got {length!r}")

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        parts = []      # stripped sequence lines of the record being read
        collected = 0   # total number of characters currently held in `parts`

        for line in infile:
            if line.startswith('>'):
                # A new identifier closes the previous record.
                if collected >= length:
                    outfile.write(''.join(parts)[:length] + '\n')
                parts, collected = [], 0
            elif collected < length:
                # Only the first `length` characters are ever written, so
                # stop buffering once enough have been gathered.
                piece = line.strip()
                parts.append(piece)
                collected += len(piece)

        # The last record has no following identifier line to trigger a write.
        if collected >= length:
            outfile.write(''.join(parts)[:length] + '\n')

def process_all_txt_files(directory):
    """Run ``process_sequence_file`` on every original ``.txt`` file in ``directory``.

    Files whose name already ends in ``_100nt.txt`` are treated as previous
    outputs and skipped, as are entries that are not regular files. For an
    input ``NAME.txt`` the result is written next to it as ``NAME_100nt.txt``,
    and one progress line is printed per processed file.

    Args:
        directory: Path of the directory to scan (not recursive).
    """
    source_suffix = '.txt'
    output_suffix = '_100nt.txt'

    # Sorted so the processing order and printed log are reproducible;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(source_suffix) or filename.endswith(output_suffix):
            continue

        input_path = os.path.join(directory, filename)
        if not os.path.isfile(input_path):
            # e.g. a directory whose name happens to end in ".txt"
            continue

        # Swap only the trailing extension; str.replace would also rewrite
        # any earlier ".txt" occurring inside the file name.
        output_name = filename[:-len(source_suffix)] + output_suffix
        output_path = os.path.join(directory, output_name)

        process_sequence_file(input_path, output_path)
        print(f"Processed {filename} and saved to {output_path}")

# Directory containing the sequence .txt files; './' is the current working
# directory of the notebook kernel.
sequence_directory = './'

# Process every .txt file in that directory (files already ending in
# '_100nt.txt' are skipped), writing a '<name>_100nt.txt' file beside each input.
process_all_txt_files(sequence_directory)


Processed ENCFF139HDN_negative.txt and saved to ./ENCFF139HDN_negative_100nt.txt
Processed ENCFF825KFE_negative.txt and saved to ./ENCFF825KFE_negative_100nt.txt
Processed ENCFF615ZGF_negative.txt and saved to ./ENCFF615ZGF_negative_100nt.txt
Processed ENCFF397KNY_negative.txt and saved to ./ENCFF397KNY_negative_100nt.txt
Processed ENCFF092DWH_negative.txt and saved to ./ENCFF092DWH_negative_100nt.txt
Processed ENCFF931AKV_negative.txt and saved to ./ENCFF931AKV_negative_100nt.txt
Processed ENCFF298ANC_negative.txt and saved to ./ENCFF298ANC_negative_100nt.txt
Processed ENCFF538PLU_negative.txt and saved to ./ENCFF538PLU_negative_100nt.txt
Processed ENCFF986UZO_negative.txt and saved to ./ENCFF986UZO_negative_100nt.txt
Processed ENCFF768XXW_negative.txt and saved to ./ENCFF768XXW_negative_100nt.txt


In [4]:
import os

def create_negative_bed(input_bed_file, output_bed_file):
    """Write the gaps between the intervals of a BED file to a new BED file.

    The input is read as tab-separated records whose first three fields are
    chromosome, start and end. Records are sorted by chromosome and start.
    For each chromosome, every stretch lying strictly between the region
    covered so far and the start of the next interval is written as
    ``chrom<TAB>covered_end + 1<TAB>next_start - 1``. A gap is only emitted
    when ``covered_end < next_start - 1``.

    Coverage is tracked as the running maximum end per chromosome, so nested
    or overlapping input intervals never yield a gap that falls inside an
    earlier, longer interval. Blank lines and ``#`` / ``track`` / ``browser``
    header lines are ignored. No region is produced before the first or after
    the last interval of a chromosome.

    NOTE(review): BED coordinates are conventionally 0-based, half-open, under
    which the exact gap would be ``(covered_end, next_start)``. The +1/-1
    trimming is kept unchanged from the original implementation; confirm that
    it is intended.

    Args:
        input_bed_file: Path of the BED file to read.
        output_bed_file: Path of the BED file to create (overwritten if it
            exists).

    Raises:
        IndexError: If a data line has fewer than three tab-separated fields.
        ValueError: If a start or end field is not an integer.
    """
    intervals = []
    # Read and validate the whole input first so a malformed line does not
    # leave a truncated output file behind.
    with open(input_bed_file, 'r') as infile:
        for line in infile:
            record = line.strip()
            if not record or record.startswith('#'):
                continue
            if record.split(None, 1)[0] in ('track', 'browser'):
                continue
            fields = record.split('\t')
            intervals.append((fields[0], int(fields[1]), int(fields[2])))

    intervals.sort()

    with open(output_bed_file, 'w') as outfile:
        covered_chrom = None
        covered_end = 0
        for chrom, start, end in intervals:
            if chrom != covered_chrom:
                # First interval of a new chromosome: nothing to compare against.
                covered_chrom, covered_end = chrom, end
                continue
            if covered_end < start - 1:
                outfile.write(f"{chrom}\t{covered_end + 1}\t{start - 1}\n")
            # Keep the furthest end seen so far so contained intervals cannot
            # shrink the covered region.
            if end > covered_end:
                covered_end = end

def process_all_bed_files(directory):
    """Run ``create_negative_bed`` on every original ``.bed`` file in ``directory``.

    Files whose name already ends in ``_negative.bed`` are treated as previous
    outputs and skipped, as are entries that are not regular files. For an
    input ``NAME.bed`` the result is written next to it as
    ``NAME_negative.bed``, and one progress line is printed per created file.

    Args:
        directory: Path of the directory to scan (not recursive).
    """
    source_suffix = '.bed'
    output_suffix = '_negative.bed'

    # Sorted so the processing order and printed log are reproducible;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(source_suffix) or filename.endswith(output_suffix):
            continue

        input_path = os.path.join(directory, filename)
        if not os.path.isfile(input_path):
            # e.g. a directory whose name happens to end in ".bed"
            continue

        # Swap only the trailing extension; str.replace would also rewrite
        # any earlier ".bed" occurring inside the file name.
        output_name = filename[:-len(source_suffix)] + output_suffix
        output_path = os.path.join(directory, output_name)

        create_negative_bed(input_path, output_path)
        print(f"Created negative BED file: {output_path}")

# Directory containing the BED files; './' is the current working directory
# of the notebook kernel.
bed_files_directory = './'

# Create a '<name>_negative.bed' file for every .bed file in that directory
# (files already ending in '_negative.bed' are skipped).
process_all_bed_files(bed_files_directory)


Created negative BED file: ./ENCFF139HDN_negative.bed
Created negative BED file: ./ENCFF931AKV_negative.bed
Created negative BED file: ./ENCFF092DWH_negative.bed
Created negative BED file: ./ENCFF986UZO_negative.bed
Created negative BED file: ./ENCFF397KNY_negative.bed
Created negative BED file: ./ENCFF825KFE_negative.bed
Created negative BED file: ./ENCFF538PLU_negative.bed
Created negative BED file: ./ENCFF298ANC_negative.bed
Created negative BED file: ./ENCFF768XXW_negative.bed
Created negative BED file: ./ENCFF615ZGF_negative.bed
