In [21]:
import os
import pickle
import urllib
import urllib.request
from time import sleep

import numpy as np

def extract_apa_sites_from_html(html) :
    """Extract the APA site rows from the first ``<tbody>`` of an HTML page.

    Every table row is expected to contain one ``<td class="...">`` cell per
    field listed in ``field_names`` below; the raw cell text is returned
    unconverted (all values are strings).

    Parameters
    ----------
    html : str
        Full HTML source of a crawled page.

    Returns
    -------
    list of dict
        One dict per table row, keyed by field name. Empty if the page has no
        ``<tbody>`` section or the table has no rows.

    Raises
    ------
    IndexError
        If a row is present but lacks one of the expected cells.
    """
    field_names = (
        'gene_symbol',
        'chromosome',
        'start',
        'end',
        'location',
        'strand',
        'read_median',
        'read_mode',
        'reads_supporting_site',
        'lost_mirnas',
    )

    #Drop double-space runs so cell contents are not padded by markup indentation
    compact_html = html.replace('  ', '')

    #Pages without a results table yield no sites instead of raising IndexError
    if '<tbody>' not in compact_html :
        return []

    table = compact_html.split('<tbody>')[1].strip().split('</tbody>')[0].strip()

    sites = []

    for site_str in table.split('</tr>') :
        #Fragments of 10 characters or fewer (e.g. the tail after the last row) carry no data
        if len(site_str) <= 10 :
            continue

        sites.append({
            field : site_str.split('<td class="' + field + '">')[1].split('</td>')[0]
            for field in field_names
        })

    return sites

In [22]:
#Tissue identifiers; each becomes a path component of the crawled URL
crawl_tissuelist = ['kidney', 'pancreas', 'monocytes', 'all', 'pdac', 'prcc', 'full_blood', 'hlf']

#Load the gene symbols to crawl, closing the file handle deterministically.
#NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('apadb_crawl_genelist.pickle', 'rb') as genelist_file :
    crawl_genelist = pickle.load(genelist_file)

In [23]:
#Sanity check: show the loaded gene list (the stored output below is an abbreviated view)
print(crawl_genelist)

['ABCB10' 'ABCD3' 'ABL2' ... 'USP9Y' 'UTY' 'ZFY']


In [49]:
#Create empty dict to store extracted sites per tissue
#(the crawl loop fills it as tissue -> gene -> list of site dicts).
#The following cell instead restores a previously saved dict from
#'parsed_apa_tissue_dict.pickle' and overwrites this one when run afterwards.
#NOTE(review): presumably this cell is for a fresh crawl and the next one for resuming - confirm.
tissue_dict = {}

In [26]:
#Load stored tissue dict (checkpoint written by the crawl loop), closing the file handle deterministically.
#NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('parsed_apa_tissue_dict.pickle', 'rb') as checkpoint_file :
    tissue_dict = pickle.load(checkpoint_file)

In [27]:

def get_html(url_str, retries=3, timeout=30.0, retry_delay=1.0) :
    """Fetch a URL and return its body decoded as UTF-8, retrying on I/O errors.

    Parameters
    ----------
    url_str : str
        URL to open.
    retries : int
        Maximum number of attempts. With ``retries <= 0`` no request is made.
    timeout : float
        Per-attempt timeout in seconds passed to ``urlopen`` so a stalled
        connection cannot block the caller indefinitely.
    retry_delay : float
        Seconds to wait between failed attempts (no wait after the last one).

    Returns
    -------
    str
        The decoded page, or ``''`` if every attempt failed (a message is
        printed in that case).
    """
    for attempt in range(retries) :
        try :
            with urllib.request.urlopen(url_str, timeout=timeout) as response :
                return response.read().decode("utf-8")
        except OSError :
            #URLError/HTTPError, socket timeouts and connection resets are all
            #OSError subclasses; wait briefly and try again if attempts remain
            if attempt < retries - 1 and retry_delay > 0 :
                sleep(retry_delay)

    print("Could not open '" + url_str + "'")
    return ''

#Folder receiving one raw HTML file per (tissue, gene) page
save_folder = 'apadb_crawled/'

#Bounds (in seconds) of the random pause between consecutive requests
min_sleep_seconds = 0.75
max_sleep_seconds = 1.25

def _save_tissue_checkpoint(tissue_dict, path='parsed_apa_tissue_dict.pickle') :
    """Pickle ``tissue_dict`` to ``path`` via a temporary file and an atomic rename.

    Writing to a sibling file first means an interruption mid-dump cannot
    leave a truncated checkpoint behind.
    """
    tmp_path = path + '.tmp'
    with open(tmp_path, 'wb') as f :
        pickle.dump(tissue_dict, f)
    os.replace(tmp_path, path)

#Make sure the output folder exists before the first page is written
os.makedirs(save_folder, exist_ok=True)

for tissue_i, tissue in enumerate(crawl_tissuelist) :
    print('Crawling tissue = ' + tissue)
    
    if tissue not in tissue_dict :
        tissue_dict[tissue] = {}
    
    for gene_i, gene in enumerate(crawl_genelist) :
        #Report progress and checkpoint every 100 genes
        if gene_i % 100 == 0 :
            print('Crawling gene ' + str(gene_i) + '...')
            _save_tissue_checkpoint(tissue_dict)
        
        #Already crawled (e.g. restored from a checkpoint): skip without a request or delay
        if gene in tissue_dict[tissue] :
            continue
        
        url_str = 'http://tools.genxpro.net/apadb/browse/human/' + tissue + '/' + gene + '/'
        
        html = get_html(url_str, retries=3)
        
        if html != '' :
            #The page was decoded as UTF-8, so store it with the same encoding
            with open(save_folder + 'apadb_crawled_' + tissue + '_' + gene + '.html', 'w', encoding='utf-8') as f :
                f.write(html)

            #Parse and extract apa sites; a page with an unexpected layout is
            #reported and left unrecorded so that a later run retries it
            try :
                tissue_dict[tissue][gene] = extract_apa_sites_from_html(html)
            except IndexError :
                print("Could not parse '" + url_str + "'")
        
        #Sleep delay
        rand_delay = np.random.rand() * (max_sleep_seconds - min_sleep_seconds) + min_sleep_seconds
        sleep(rand_delay)
    
    _save_tissue_checkpoint(tissue_dict)

print('Crawling completed.')

Crawling tissue = kidney
Crawling gene 0...
Crawling gene 100...
Crawling gene 200...
Crawling gene 300...
Crawling gene 400...
Crawling gene 500...
Crawling gene 600...
Crawling gene 700...
Crawling gene 800...
Crawling gene 900...
Crawling gene 1000...
Crawling gene 1100...
Crawling gene 1200...
Crawling gene 1300...
Crawling gene 1400...
Crawling gene 1500...
Crawling gene 1600...
Crawling gene 1700...
Crawling gene 1800...
Crawling gene 1900...
Crawling gene 2000...
Crawling gene 2100...
Crawling gene 2200...
Crawling gene 2300...
Crawling gene 2400...
Crawling gene 2500...
Crawling gene 2600...
Crawling gene 2700...
Crawling gene 2800...
Crawling gene 2900...
Crawling gene 3000...
Crawling gene 3100...
Crawling gene 3200...
Crawling gene 3300...
Crawling gene 3400...
Crawling gene 3500...
Crawling gene 3600...
Crawling gene 3700...
Crawling gene 3800...
Crawling gene 3900...
Crawling gene 4000...
Crawling gene 4100...
Crawling gene 4200...
Crawling gene 4300...
Crawling gene 4400.

Crawling gene 10800...
Crawling gene 10900...
Crawling gene 11000...
Crawling gene 11100...
Crawling gene 11200...
Crawling gene 11300...
Crawling gene 11400...
Crawling gene 11500...
Crawling gene 11600...
Crawling gene 11700...
Crawling gene 11800...
Crawling gene 11900...
Crawling gene 12000...
Crawling gene 12100...
Crawling gene 12200...
Crawling gene 12300...
Crawling gene 12400...
Crawling gene 12500...
Crawling gene 12600...
Crawling gene 12700...
Crawling gene 12800...
Crawling gene 12900...
Crawling tissue = all
Crawling gene 0...
Crawling gene 100...
Crawling gene 200...
Crawling gene 300...
Crawling gene 400...
Crawling gene 500...
Crawling gene 600...
Crawling gene 700...
Crawling gene 800...
Crawling gene 900...
Crawling gene 1000...
Crawling gene 1100...
Crawling gene 1200...
Crawling gene 1300...
Crawling gene 1400...
Crawling gene 1500...
Crawling gene 1600...
Crawling gene 1700...
Crawling gene 1800...
Crawling gene 1900...
Crawling gene 2000...
Crawling gene 2100...


Crawling gene 8600...
Crawling gene 8700...
Crawling gene 8800...
Crawling gene 8900...
Crawling gene 9000...
Crawling gene 9100...
Crawling gene 9200...
Crawling gene 9300...
Crawling gene 9400...
Crawling gene 9500...
Crawling gene 9600...
Crawling gene 9700...
Crawling gene 9800...
Crawling gene 9900...
Crawling gene 10000...
Crawling gene 10100...
Crawling gene 10200...
Crawling gene 10300...
Crawling gene 10400...
Crawling gene 10500...
Crawling gene 10600...
Crawling gene 10700...
Crawling gene 10800...
Crawling gene 10900...
Crawling gene 11000...
Crawling gene 11100...
Crawling gene 11200...
Crawling gene 11300...
Crawling gene 11400...
Crawling gene 11500...
Crawling gene 11600...
Crawling gene 11700...
Crawling gene 11800...
Crawling gene 11900...
Crawling gene 12000...
Crawling gene 12100...
Crawling gene 12200...
Crawling gene 12300...
Crawling gene 12400...
Crawling gene 12500...
Crawling gene 12600...
Crawling gene 12700...
Crawling gene 12800...
Crawling gene 12900...
Cr