## Import all modules

In [266]:
from Bio import Seq
import urllib
from Bio import Entrez
from Bio import SeqIO
import statistics as stat

## Problem 1: Count the number of each nt in a string

In [175]:
#read in string
wd= "/mnt/c/Users/jenna/OneDrive/Desktop/Rosalind/"
filepath=wd+"rosalind_ini.txt"
seq = open(filepath, 'r').read().replace('\n', '')
    
#Print out the number of times each occurs
counts=[seq.count("A"), seq.count("C"), seq.count("G"), seq.count("T")]
print("Occurences of A/C/G/T:")
" ".join(map(str, counts))


Occurences of A/C/G/T:


'207 189 214 190'

## Problem 2: Given a protein's ID, retrieve a list of biological processes it's involved in from UniProt

In [121]:
#Read in id
wd= "/mnt/c/Users/jenna/OneDrive/Desktop/Rosalind/"
filepath=wd+"rosalind_ini.txt"

#get url for id
url="http://www.uniprot.org/uniprot/"+ id + ".txt"

#open up url
content=urllib.request.urlopen(url) 

#iterate through each line, saving lines with "GO", then split up to print just the process names
print("GO terms for given protein id:")
for line in content: 
    line=str(line)
    if "GO" in line:
        groups=line.split(";")
        group2=";".join(groups[2:3])
        print(group2.split(":")[1])

GO terms for given protein id:
cytoplasm
ATP binding
glycine-tRNA ligase activity
protein dimerization activity
glycyl-tRNA aminoacylation


## Problem 3: given a genus name and a range of dates, return the number of entries in Nucleotide GenBank

In [209]:
#read in search terms
wd= "/mnt/c/Users/jenna/OneDrive/Desktop/Rosalind/"
filepath=wd+"rosalind_gbk.txt"
searchList=open(filepath,'r').read().split("\n")

#assign to name. Assumes in format of genus \n start date \n end date
genus=searchList[0]
firstdate=searchList[1]
seconddate=searchList[2]

#get query to use
query= f'({genus}[Organism]) AND("{firstdate}"[Publication Date]: "{seconddate}"[Publication Date])'

#get the handle for the given organism
handle = Entrez.esearch(db="nucleotide", term=query)

#read record of the handle, print out the count
record = Entrez.read(handle)
print("Number of records for genus in given time frame: " + record["Count"])

Number of records for genus in given time frame: 989


## Problem 4: given list of genbank gene IDs, return the fasta file of the gene with the shortest header

In [213]:
#read in IDs
wd= "/mnt/c/Users/jenna/OneDrive/Desktop/Rosalind/"
filepath=wd+"rosalind_frmt.txt"
IDs=open(filepath).read().replace("\n", "").split(" ")

In [214]:
#get the handle for the given IDs, returning fasta data
handle = Entrez.efetch(db="nucleotide", id=IDs, rettype="fasta")

#use SeqIO to read in record as fasta format
records = list (SeqIO.parse(handle, "fasta")) 

#get index of fasta with shortest description
lengths=[]

for record in records:
    lengths.append(len(record.seq))
    
minIndex=lengths.index(min(lengths))

#Print out this fasta record
print(records[minIndex].format("fasta"))
    

>JX295575.1 Anopheles sinensis prophenoloxidase 1 (PPO-1) mRNA, partial cds
AACCTGCATCATTGGCATTGGCATCTTGTGTACCCGGGCGAAGGGCCCGATCGTGTCGTC
AACAAGGATCGTCGTGGAGAGTTGTTCTACTACATGCACCAGCAGCTGATCGCTCGCTAC
AACGTCGATCGCTTCTGCAACCGTTTGGCGCGGGTGCGTCCACTGACGAATCTGCGTGAG
CCTCTTCCGGAGGGATACTTCCCGAAACTCATCCGAAGCTTCACCAACCGTGCCTTCCCT
GCCCGACCTCAGAACCATATGTTGAGGGATTTGAATCGCATTGAGGACGATGTGGTACTC
TCGATCAGTGATCTGGAACGCTGGGGAAGCCGCATTGCCGAGAGCATTGATGGTGGATAC
GTGGTGGGCCCCGGTGGTGCACGTACTCCTCTGGATGAACAAACGGGTATCGACGTGCTG
GGCAACATCATGGAACCGTCGGCACTGTCGGTGAACCCGCAATTCTATGGAAACTACCAT
GGCCATATGCACAATCTCATCGCGTTCAGTCACGATCCTGAGAACCGCTTCCTGGAGGGG
TACGGTGTGGTGGGCGAGTTCCAGACGGCCATGCGTGACCCTACGTTCTACCGCTGG



## Problem 5: given a file with fastqs, convert to fasta

In [236]:
#get path to string
wd= "/mnt/c/Users/jenna/OneDrive/Desktop/Rosalind/"
filepath=wd+"rosalind_tfsq.txt"

#write out to fasta
SeqIO.convert(filepath,'fastq','test.fasta.txt','fasta')


## Problem 6: given a file with fastqs + quality threshold, print the number of fastqs that pass that threshold

In [300]:
#read in fastq as normal text, extract threshold
file="rosalind_phre.txt"
fileSplit=open(wd+file).read().split("\n")
thresh=int(fileSplit[0])

#remove first line (threshold) from fastq, then write back out as same file
with open(file, 'r') as fin:
    data = fin.read().splitlines(True)
with open(file, 'w') as fout:
    fout.writelines(data[1:])

In [302]:
#Initialize count, then count the number of fastqs with a mean quality score less than the given threshold
count=0
for record in list(SeqIO.parse(file, "fastq")):
    avgQ=stat.mean(record.letter_annotations["phred_quality"])
    if avgQ<thresh:
        count=count+1
print(str(count) + " fastqs have a mean quality score less than " + str(thresh))        


18 fastqs have a mean quality score less than19
