In [46]:
import os

def normalize_sequences_to_txt(input_file, output_directory, desired_length):
    output_file = os.path.join(output_directory, f"normalized_{os.path.basename(input_file)}")
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            sequence = line.strip()  # Remove any whitespace characters, like newline at the end
            if len(sequence) >= desired_length:
                # Trim the sequence if it's longer than the desired length
                trimmed_sequence = sequence[:desired_length]
                outfile.write(f"{trimmed_sequence}\n")  # Write just the sequence

def process_directory(directory, output_directory, desired_length):
    for file in os.listdir(directory):
        if file.endswith("_nucleotide_sequences.txt"):
            input_file = os.path.join(directory, file)
            print(f"Normalizing {input_file}...")
            normalize_sequences_to_txt(input_file, output_directory, desired_length)

desired_length = 100  # The chosen uniform length for sequences

# Directories
data_directory = '/users/hutruon/Assignment-1-CS490/Data1'
negative_files_directory = os.path.join(data_directory, "NegativeBedFiles")

# Process positive sequence files in the Data1 directory
process_directory(data_directory, data_directory, desired_length)

# Process negative sequence files in the NegativeBedFiles directory
process_directory(negative_files_directory, negative_files_directory, desired_length)



Normalizing /users/hutruon/Assignment-1-CS490/Data1/ENCFF931AKV_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/ENCFF825KFE_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/ENCFF768XXW_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/ENCFF538PLU_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/normalized_ENCFF092DWH_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/ENCFF615ZGF_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/ENCFF139HDN_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/normalized_ENCFF931AKV_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/ENCFF397KNY_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/ENCFF986UZO_nucleotide_sequences.txt...
Normalizing /users/hutruon/Assignment-1-CS490/Data1/normal

In [47]:
import os

def verify_sequence_lengths(txt_file, expected_length):
    with open(txt_file, 'r') as file:
        lengths = [len(line.strip()) for line in file]
    return all(length == expected_length for length in lengths)

def process_directory(directory, desired_length, prefix="normalized_"):
    all_lengths_ok = True
    # Loop over all files in the directory
    for file in os.listdir(directory):
        if file.startswith(prefix) and file.endswith("_nucleotide_sequences.txt"):
            full_path = os.path.join(directory, file)
            # Verify sequence lengths for each file
            lengths_ok = verify_sequence_lengths(full_path, desired_length)
            if not lengths_ok:
                all_lengths_ok = False
                print(f"File {file} in {directory} contains sequences that do not match the desired length.")
            else:
                print(f"File {file} in {directory} OK.")
    return all_lengths_ok

desired_length = 100  # The chosen uniform length for sequences

# Directories
data_directory = '/users/hutruon/Assignment-1-CS490/Data1'
negative_files_directory = os.path.join(data_directory, "NegativeBedFiles")

# Verify sequence lengths for normalized files in both directories
positive_ok = process_directory(data_directory, desired_length)
negative_ok = process_directory(negative_files_directory, desired_length, prefix="normalized_negative_")

if positive_ok and negative_ok:
    print("All normalized positive and negative files contain sequences of the desired length.")
else:
    print("One or more files contain sequences that do not match the desired length.")


File normalized_ENCFF092DWH_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF931AKV_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF615ZGF_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF139HDN_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF538PLU_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF986UZO_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF397KNY_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF825KFE_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF768XXW_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
File normalized_ENCFF298ANC_nucleotide_sequences.txt in /users/hutruon/Assignment-1-CS490/Data1 OK.
