# File opening, reading, writing, etc

In [14]:
# Typically save a variable with a file path
my_file = "coding.txt"

# Can open and read like so
file_handle = open(my_file)
file_contents = file_handle.read()
print(file_contents)

# Make sure to close the file handle!
file_handle.close()

# Instead of having to manually close the file, I typically use the "with open"
# syntax to open files. This automatically closes the file after it's parsed
with open(my_file) as infile:
    contents = infile.read()
print(contents)

# You can write things to files as well - you need to use open(file_name, "w") 
# Note that what we did earlier, open(file), is equivelent to open(file, "r") (e.g.
# "r" is the default argument.)
message = "sos"
output_file = "send_help.txt"
with open(output_file, "w") as outfile:
    outfile.write(message)

ATCGATCGATCGATCGACTGACTAGTCATAGCTATGCATGTAGCTACTCGATCGATCGATCGATCATCGATCGATATCGATGCATCGACTACTAT
ATCGATCGATCGATCGACTGACTAGTCATAGCTATGCATGTAGCTACTCGATCGATCGATCGATCATCGATCGATATCGATGCATCGACTACTAT


# Splitting genomic DNA

From Chapter 2, last problem


In [5]:

problem4_dna_sequence = "ATCGATCGATCGATCGACTGACTAGTCATAGCTATGCATGTAGCTACTCGATCGATCGATCGATCGATCGATCGATCGATCGATCATGCTATCATCGATCGATATCGATGCATCGACTACTAT"
first_exon_end_character = 63
second_exon_start_character = 91

# First, we need to adjust the positions. The 63rd character is index 62, and the 91st
# character is index 90. Sorry.
first_exon_end = 62
second_exon_start = 90

first_exon = problem4_dna_sequence[:first_exon_end]
second_exon = problem4_dna_sequence[second_exon_start:]

coding_sequence = first_exon + second_exon
intron = problem4_dna_sequence[first_exon_end:second_exon_start]


In [6]:
intron_output_file = "intron.txt"
coding_output_file = "coding.txt"



with open(coding_output_file, "w") as outfile:
    outfile.write(coding_sequence)


with open(intron_output_file, "w") as outfile:
    outfile.write(intron)


# Write fasta

In [16]:
outfile = "sequences.fasta"

header_1 = "ABC123"
seq_1 = "ATCGTACGATCGATCGATCGCTAGACGTATCG"

header_2 = "DEF456"
seq_2 = "actgatcgacgatcgatcgatcacgact"

header_3 = "HIJ789"
seq_3 = "ACTGAC-ACTGT--ACTGTA----CATGTG"

# Format the sequences
seq_1 = seq_1.upper().replace("-", "")
seq_2 = seq_2.upper().replace("-", "")
seq_3 = seq_3.upper().replace("-", "")

# prepare entries for each
fasta1 = ">" + header_1 + "\n" + seq_1 + "\n"
fasta2 = ">" + header_2 + "\n" + seq_2 + "\n"
fasta3 = ">" + header_3 + "\n" + seq_3 + "\n"

with open(outfile, "w") as outfile:
    outfile.write(fasta1)
    outfile.write(fasta2)
    outfile.write(fasta3)


# Keep in mind - typically you'd make a function to do processing that you repeat over
# and over again.


# Multiple fastas

In [17]:
outfile_path1 = header_1 + ".fasta"
outfile_path2 = header_2 + ".fasta"
outfile_path3 = header_3 + ".fasta"

with open(outfile_path1, "w") as outfile:
    outfile.write(fasta1)
with open(outfile_path2, "w") as outfile:
    outfile.write(fasta2)
with open(outfile_path3, "w") as outfile:
    outfile.write(fasta3)

# What I'd ACTUALLY do

In [19]:
sequences = dict()

# create a dictionary to hold the sequence information
sequences = {
    header_1: seq_1,
    header_2: seq_2,
    header_3: seq_3
}

# Loop over the contents of the dictionary
for header, seq in sequences.items():
    seq = seq.upper().replace("-", "")
    out_contents = ">" + header + "\n" + seq + "\n"
    outfile = header + ".fasta"

    with open(outfile, "w") as outfile:
        outfile.write(out_contents)


