In [27]:
import requests
from bs4 import BeautifulSoup
import re

## class `GenscanOutput`
Attributes:
- `status` - request status
- `cds_list` - the list with predicted protein sequences
- `intron_list` - the list if found introns. It could be any data type but it should contain information about its number, start and stop positions
- `exon_list` - same , but with exons.

In [90]:
class GenscanOutput:
    def __init__(self, status, cds_list, intron_list,exon_list):
        self.status = status
        self.cds_list = cds_list
        self.intron_list = intron_list
        self.exon_list = exon_list

## Function `run_genscan (sequence=None, sequence_file=None, organism="Vertebrate", exon_cutoff=1.00, sequence_name="")`
 This function get request likewise filling the form.
It takes same parameters as you should give from site (except Print options):
- `sequence` - sequence in format of string or any other that will be convenient,
- `sequence_file` - path to file with sequence that will be uploaded instead of sequence.

Returns object of class GenscanOutput

In [92]:
def run_genscan(sequence=None, sequence_file=None, organism="Vertebrate", exon_cutoff=1.00, sequence_name=""):

    #request
    url = 'http://hollywood.mit.edu/cgi-bin/genscanw_py.cgi'
    payload = {
        "-o": organism,
        "-e": exon_cutoff,
        "-n": sequence_name,
        "-p": "Predicted peptides only"
    }
    if sequence is not None:
        payload['-s'] = sequence
        resp = requests.post(url, data=payload)
    if sequence_file is not None:
        seq_file = open(sequence_file, 'rb')
        file_list = {'-u': seq_file}
        resp = requests.post(url, data=payload, files = file_list)
        seq_file.close()
    resp_status = resp.status_code
    # save_content(resp, "page.html")
    soup = BeautifulSoup(resp.content)
    out = soup.find("pre").text
    out = str(out)

    # cds
    cds_list = []
    cds_pattern = re.compile(r'^>.+\n+([A-Z\n]+)', re.M)
    cds_drop_title = re.compile(r'^[A-Z\n]*', re.M)
    cds_find = cds_pattern.finditer(out)
    for i in cds_find:
        seq = cds_drop_title.findall(i.group())
        cds_list.append(seq[1].replace('\n', ''))

    # introns
    intron_list = []
    intron_pattern = re.compile(r'\d+ [A-z]{4}.*', re.M)
    intron_find = intron_pattern.finditer(out)
    for i in intron_find:
        start_pos = int(i.group()[12:16])
        end_pos = int(i.group()[19:23])
        intron_list += [[start_pos, end_pos]]

    # exons
    exon_list = []
    seq_length_pattern = re.compile(r'(\d+) bp', re.M)
    seq_length = seq_length_pattern.findall(out)
    length = int(seq_length[0])
    if len(intron_list) >= 1:
        exon_list += [[1, intron_list[0][0]]]
        if len(intron_list) >= 2:
            exon_list += [[intron_list[-1][1], length]]
            if len(intron_list) >=3:
                for i in range(1, len(intron_list)):
                    exon_list += [[intron_list[i-1][1], intron_list[i][0]]]

    return GenscanOutput(status = resp_status, cds_list = cds_list, intron_list = intron_list ,exon_list = exon_list)

### Example for running with sequence

In [4]:
sequence = """GTATAATATCGATTTTAATTTTAACCAAGTAAACCGGATAGGAGGAGATTCGTAAATTGTAATCGATAAA
AGAGATCTGTGTCAAAAGTACGCGTGAGGTTCTCTCTTCTCTGCAAACAAAAATCCTGGCGTTGATAGCG
ATTGGTTAATTGTCGGTGAATCAACGTTTTTCTGTTACACATATCAATCAATCATCATCCTCTTCATACT
CTCTTCTTCTTCTTCATCGTAATCATTAGCTTCAATTCCGAGTTTTCTCTTCGTCGCCGTTGTGCTAGAG
TTTTGTCAGATCTCGCCGGGAAATTAGCAATGGCGTTTTAGTCTGTTTACCTGTGAAGCTTTTGCGTAAA
CGCTGGATTCTTCTCAAATCTCTCTCACCTGATCTCTTATTAGATTCCGAGTATGATTTGTTTCTCTCCT
TCATTATATTCGGGTTGCTTCCACGATTTCGTTGCTTGATTTAGTTTTTTTTTTGAATTTGATAGGGATG
GCGCTAAAGCGAGGGCTATCTGGAGTTAACCGGATTAGAGGAAGTGGTGGTGGATCTCGATCTGTGCTTG
TGCTTCTCATATTTTTCTGTGTTTTTGCACCTCTTTGCTTCTTTGTTGGCCGAGGAGTGTATATCGATTC
CTCAAATGGTATGCATATATTGCTTATCTTTCTATATACTAATTTGCAGATTCATGTATAGAGATTCAGA
AATTCTATTAATGATTGAGAACATTAGCATAGTTTTACCACTCTTTAGTTTTCAGCCTTTACTATCTCAA
TTTTAAGTTACTTTTAGCTTGATGTGACACTGATGATATTCAGACAATTGCTATTTTTCTAGACTGGCAA
AATTGGTTCTTTGCTATTTTACTATCGAGCTTTGGTTTCAAGGATTAGTGTCTCTCGATTTTGTGTTACA
TATATAATTTATCTGCGAATTGTCCATCTGATGGAGCTTACTTTCATTACTGCAGATTATTCAATTGTTT
CTGTGAAGCAGGTATGTCGGCTTAAACTCTATCAAATTATCTACTTGTATGTTTTGACAGAAGTAAGAAT
GTTGTTTGATTGGACTGTTTTTTTTTGTTTTTTTTGGCAGAATCTTGACTGGAGAGAACGTTTAGCAATG
CAATCTGTTAGATCTCTTTTCTCGAAAGAGGTAGTTTAAATCTTTGAAACTTTGCTTCGATTTATTGTCT
TTGTCAGTTTTTCTGAGACTGGATAGTTTCTTACTAAATTTGTTGTCCAGATACTAGATGTTATAGCAAC
CAGCACAGCTGATTTGGGTCCTCTTAGCCTTGATTCTTTTAAGAAAAACAATTTGTCTGCATCATGGCGG
GGAACCGGAGTAGACCCCTCCTTTAGACATTCTGAGGTGTGGTTTAAGTGTCTCATTTCTTTTCTAAGCT
TCTATTCTCAATGATGTTTTCTCATAGTTTTCATCTTTTGCTGTACTTGAGGCAGAATCCAGCAACTCCT
GATGTCAAATCTAATAACCTGAATGAAAAACGTGACAGCATTTCAAAAGGTAGTGGCTTTTAATTTAAGT
GGCATCTCTGATTTGACGTTTTCCATCTCTGATTCTTCTATATATTGCTGATGCAGATAGTATCCATCAG
AAAGTTGAGACACCTACAAAGATTCACAGAAGGGTAACTATTATTTATTAAAGTTCAATGTCATGATGCA
TCAAATACTCTTTAGGGACTTCTTGTGTCTTTAGTATTTCATTATCAACATTTTTTTTATCCATCTTGAC
ATTTCAGCAACTAAGAGAGAAAAGGCGTGAGATGCGGGCAAATGAGTTAGTTCAGCACAATGATGACACG
ATTTTGAAACTCGAAAATGCTGCCATTGAACGCTCTAAGTCTGTTGATTCTGCAGTCCTTGGTAAATACA
GTATTTGGAGAAGAGAAAATGAGAATGACAACTCTGATTCAAATATACGCTTGATGCGGGATCAAGTAAT
AATGGCTAGAGTCTATAGTGGGATTGCAAAATTGAAAAACAAGAACGATTTGTTACAAGAACTCCAGGCC
CGACTTAAGGACAGCCAACGGGTTTTGGGGGAAGCAACATCTGATGCTGATCTTCCTCGGAGGTAAATTA
CTCCTTTTGGTTTCTACTACTTGCCTTTTTTCGACTGCTTGTGTACTTCTTAAAGTAACATGCTCTCTAA
ATTACTTTTCATATAGTGCGCATGAGAAACTCAGAGCCATGGGTCAAGTCTTGGCTAAAGCTAAGATGCA
GTTATATGACTGCAAGCTGGTTACTGGAAAGCTGAGAGCAATGCTTCAGACTGCCGACGAACAAGTGAGG
AGCTTAAAGAAGCAGAGTACTTTTCTGGCTCAGTTAGCAGCAAAAACCATTCCAAATCCTATCCATTGCC
TATCAATGCGCTTGACTATCGATTACTATCTTCTGTCTCCGGAGAAAAGAAAATTCCCTCGGAGTGAAAA
CCTAGAAAACCCTAATCTTTATCATTATGCCCTCTTTTCCGACAATGTATTAGCTGCATCAGTAGTTGTT
AACTCAACCATCATGAATGCCAAGGTAAAATCATTAGCTCTTATCACTTGATTCGTCCTAATATCTGTAT
GTTTAATTTTAGATACGCATTGCAGAGCTGAAATTAAAAATGTCTTTATTCCCTATATGTGCAGGATCCT
TCTAAGCATGTTTTTCACCTTGTCACGGATAAACTCAATTTCGGAGCAATGAACATGTGGTTCCTCCTAA
ACCCACCCGGAAAGGCAACCATACATGTGGAAAACGTCGATGAGTTTAAGTGGCTCAATTCATCTTACTG
TCCTGTCCTTCGTCAGCTTGAATCTGCAGCAATGAGAGAGTACTATTTTAAAGCAGACCATCCAACTTCA
GGCTCTTCGAATCTAAAATACAGAAACCCAAAGTATCTATCCATGTTGAATCACTTGAGATTCTACCTCC
CTGAGGTTTATCCCAAGCTGAACAAAATCCTCTTCCTGGACGATGACATCATTGTTCAGAAAGACTTGAC
TCCACTCTGGGAAGTTAACCTGAACGGCAAAGTCAACGGTGCAGTCGAAACCTGTGGGGAAAGTTTCCAC
AGATTCGACAAGTATCTCAACTTTTCGAATCCTCACATTGCGAGGAACTTCAATCCAAATGCTTGTGGAT
GGGCTTATGGAATGAACATGTTCGACCTAAAGGAATGGAAGAAGAGAGACATCACTGGTATATACCACAA
GTGGCAAAACATGGTAAATAACTCTTTAATTCTTTGCAACAAATAGTCTAATGATGGTTTATTATTTTAT
TTATTGTGGTTTGCTCATTTTGGCTTGTGTTGTTTAGAATGAGAACAGGACACTATGGAAGCTAGGGACA
TTGCCACCAGGATTAATAACATTCTACGGATTAACACATCCCTTAAACAAGGCGTGGCATGTGCTGGGAC
TTGGATATAACCCGAGTATCGACAAGAAGGACATTGAGAATGCAGCAGTGGTTCACTATAACGGGAACAT
GAAACCATGGTTGGAGTTGGCAATGTCCAAATATCGGCCGTATTGGACCAAGTACATCAAGTTTGATCAC
CCATATCTTCGTCGTTGCAACCTTCATGAATAAAATCAAATCTTTGTTAGATGATTTGAGGTAATTACTT
CTAATGTTCTGTTGTTCCCATTCCATTGCTCTTTATTCCACACATTGTTAATTGTTTATTCATATCGTTT
TTTAGTCATTCTTTTCTTCTCTTCTTCTTTCGGGTTAAAAACACTTATGTTATTTTTTGAGTTTTTCTAA
TGATAGTAATATGTTTCTGGAGATTCTCTTGATGGTCTTGTGAAAAAATTGAAGAACACATACTATCTCT
CTATTTACTATATACTGTAAGTCAAAATAATAATCCAATTTTTTGTT"""

In [99]:
example = run_genscan(sequence=sequence)

In [100]:
example.cds_list

['ILDVIATSTADLGPLSLDSFKKNNLSASWRGTGVDPSFRHSEQLREKRREMRANELVQHNDDTILKLENAAIERSKSVDSAVLGKYSIWRRENENDNSDSNIRLMRDQVIMARVYSGIAKLKNKNDLLQELQARLKDSQRVLGEATSDADLPRSAHEKLRAMGQVLAKAKMQLYDCKLVTGKLRAMLQTADEQVRSLKKQSTFLAQLAAKTIPNPIHCLSMRLTIDYYLLSPEKRKFPRSENLENPNLYHYALFSDNVLAASVVVNSTIMNAKDPSKHVFHLVTDKLNFGAMNMWFLLNPPGKATIHVENVDEFKWLNSSYCPVLRQLESAAMREYYFKADHPTSGSSNLKYRNPKYLSMLNHLRFYLPEVYPKLNKILFLDDDIIVQKDLTPLWEVNLNGKVNGAVETCGESFHRFDKYLNFSNPHIARNFNPNACGWAYGMNMFDLKEWKKRDITGIYHKWQNMNENRTLWKLGTLPPGLITFYGLTHPLNKAWHVLGLGYNPSIDKKDIENAAVVHYNGNMKPWLELAMSKYRPYWTKYIKFDHPYLRRCNLHE']

In [101]:
example.exon_list

[[1, 1241],
 [3828, 3897],
 [1366, 1758],
 [2092, 2187],
 [2544, 2655],
 [3233, 3328],
 [3603, 3823]]

In [102]:
example.intron_list

[[1241, 1366],
 [1758, 2092],
 [2187, 2544],
 [2655, 3233],
 [3328, 3603],
 [3823, 3828]]

### Example for running with `./test_file`

In [105]:
example = run_genscan(sequence_file='./test_file.fasta')

In [106]:
example.cds_list

['ILDVIATSTADLGPLSLDSFKKNNLSASWRGTGVDPSFRHSEQLREKRREMRANELVQHNDDTILKLENAAIERSKSVDSAVLGKYSIWRRENENDNSDSNIRLMRDQVIMARVYSGIAKLKNKNDLLQELQARLKDSQRVLGEATSDADLPRSAHEKLRAMGQVLAKAKMQLYDCKLVTGKLRAMLQTADEQVRSLKKQSTFLAQLAAKTIPNPIHCLSMRLTIDYYLLSPEKRKFPRSENLENPNLYHYALFSDNVLAASVVVNSTIMNAKDPSKHVFHLVTDKLNFGAMNMWFLLNPPGKATIHVENVDEFKWLNSSYCPVLRQLESAAMREYYFKADHPTSGSSNLKYRNPKYLSMLNHLRFYLPEVYPKLNKILFLDDDIIVQKDLTPLWEVNLNGKVNGAVETCGESFHRFDKYLNFSNPHIARNFNPNACGWAYGMNMFDLKEWKKRDITGIYHKWQNMNENRTLWKLGTLPPGLITFYGLTHPLNKAWHVLGLGYNPSIDKKDIENAAVVHYNGNMKPWLELAMSKYRPYWTKYIKFDHPYLRRCNLHE']

In [107]:
example.exon_list

[[1, 1280],
 [3867, 3936],
 [1405, 1797],
 [2131, 2226],
 [2583, 2694],
 [3272, 3367],
 [3642, 3862]]

In [108]:
example.intron_list

[[1280, 1405],
 [1797, 2131],
 [2226, 2583],
 [2694, 3272],
 [3367, 3642],
 [3862, 3867]]