# --- Improved Luis's Script ---

## Get IDs

In [12]:
# Imports
from Bio import Entrez
import csv
import time
import pandas as pd
from urllib.error import HTTPError  # for Python 2 use: from urllib2 import HTTPError  # for Python 2

# Define max_samples: upper bound on the number of IDs requested from
# esearch (passed as retmax); also used later to build the CSV file name.
max_samples = 1000

# Define email
Entrez.email = "vela.vela.luis@gmail.com" #"karsten.leonhardt@posteo.de"

# Perform search - get handle
handle = Entrez.esearch(
    db="gds",
    term="GSE[ETYP] AND Homo[Organism]",
    usehistory="y",
    retmax=max_samples,
)

# Read results; close the handle even if parsing the response fails,
# so a failed run does not leave the connection open in the kernel.
try:
    record = Entrez.read(handle)
finally:
    handle.close()

# Get idlist (at most max_samples entries)
idlist = record['IdList']

# Count entries: 'Count' is the total number of matches reported by the
# search, while read_count is how many IDs were actually returned.
found_count = int(record['Count'])
read_count = len(idlist)

# Echo results
print('Total number of FOUND entries: ' + str(found_count))
print('Total number of READ  entries: ' + str(read_count))

Total number of FOUND entries: 46584
Total number of READ  entries: 1000


## Get Summaries

In [13]:
# Download one esummary record per ID in `idlist` and write the selected
# fields to a CSV file, one row per record.
filename = 'test_'+str(max_samples)+'.csv'

# Set fieldnames: columns written to the CSV. Each name is used as a key
# into the first element of the parsed esummary result.
# Full list of keys considered (kept for reference):
#     ['Item', 'Id', 'Accession', 'GDS', 'title',
#      'summary', 'GPL', 'GSE', 'taxon', 'entryType', 'gdsType',
#      'ptechType', 'valType', 'SSInfo', 'subsetInfo', 'PDAT',
#      'suppFile', 'Samples', 'Relations', 'ExtRelations',
#      'n_samples', 'SeriesTitle', 'PlatformTitle', 'PlatformTaxa',
#      'SamplesTaxa', 'PubMedIds', 'Projects', 'FTPLink', 'GEO2R']
fieldnames = ['Id', 'Accession', 'title', 'summary', 'taxon']

# Retry policy for transient HTTP failures (a previous run of this cell
# aborted with "HTTP Error 500: Internal Server Error").
max_attempts = 5      # total tries per record before giving up
retry_wait = 2        # seconds before the first retry; doubled on each retry

# Open csv-target file.
# newline='' is required by the csv module so that no extra blank rows are
# written on platforms that translate line endings; utf-8 makes the output
# independent of the platform's default encoding.
with open(filename, 'w', newline='', encoding='utf-8') as opened_file:

    # write with writer
    csvwriter = csv.writer(opened_file)

    # Print fieldnames (header row)
    csvwriter.writerow(fieldnames)

    # Begin retrieval
    for i, e_id in enumerate(idlist):

        # Echo info
        print("Going to download record: {:10.0f} ({:5.1f}%)".format(int(e_id), (i+1)/read_count*100))

        # Avoid HTTP error (max 3 queries per second ?)
        if (i%3==0): time.sleep(1)

        # Get Summary, retrying on transient server-side errors
        for attempt in range(1, max_attempts + 1):
            try:
                handle = Entrez.esummary(db="gds", id=e_id)
                try:
                    # Read handle
                    data = Entrez.read(handle)
                finally:
                    # Close handle even if parsing fails
                    handle.close()
                break
            except HTTPError as err:
                # Only 429 (too many requests) and 5xx are treated as
                # transient; anything else, or the last attempt, is re-raised.
                transient = err.code == 429 or 500 <= err.code < 600
                if attempt == max_attempts or not transient:
                    raise
                time.sleep(retry_wait * 2 ** (attempt - 1))

        # Collect the selected fields of the first (only requested) record
        list_to_print = [data[0][name] for name in fieldnames]

        # Print the line
        csvwriter.writerow(list_to_print)

Going to download record:  200112120 (  0.1%)
Going to download record:  200117735 (  0.2%)
Going to download record:  200117734 (  0.3%)
Going to download record:  200128119 (  0.4%)
Going to download record:  200126367 (  0.5%)
Going to download record:  200128077 (  0.6%)
Going to download record:  200126109 (  0.7%)
Going to download record:  200106774 (  0.8%)
Going to download record:  200125561 (  0.9%)
Going to download record:  200117301 (  1.0%)
Going to download record:  200117311 (  1.1%)
Going to download record:  200112164 (  1.2%)
Going to download record:  200120993 (  1.3%)
Going to download record:  200124453 (  1.4%)
Going to download record:  200106592 (  1.5%)
Going to download record:  200115197 (  1.6%)
Going to download record:  200115196 (  1.7%)
Going to download record:  200116984 (  1.8%)
Going to download record:  200107387 (  1.9%)
Going to download record:  200111073 (  2.0%)
Going to download record:  200117145 (  2.1%)
Going to download record:  2001280

HTTPError: HTTP Error 500: Internal Server Error

## Check that the CSV loads as a DataFrame

In [8]:
# Read to DataFrame to verify the CSV written above parses correctly.
# Malformed rows are skipped rather than raising. `on_bad_lines='skip'`
# replaces `error_bad_lines=False`, which was deprecated in pandas 1.3 and
# removed in pandas 2.0 (requires pandas >= 1.3).
pd.read_csv(filename, sep=',', header=[0], on_bad_lines='skip')

Unnamed: 0,Id,Accession,title,summary,taxon
0,200112120,GSE112120,Risk SNPs mediated promoter-enhancer switching...,To determine the binding of H3K4me1 and H3K4me...,Homo sapiens
1,200117735,GSE117735,The ATPase module of mammalian SWI/SNF family ...,This SuperSeries is composed of the SubSeries ...,Homo sapiens
2,200117734,GSE117734,The mSWI/SNF ATPase module mediates subcomplex...,Perturbations to mammalian SWI/SNF (mSWI/SNF) ...,Homo sapiens
3,200128119,GSE128119,COX-2 mediates tumor-stromal Prolactin signali...,Tumor-stromal communication within the microen...,Homo sapiens
4,200126367,GSE126367,Copy number analysis of selumetinib-resistant ...,Copy number analysis to compare parental color...,Homo sapiens
5,200128077,GSE128077,Whole genome-derived tiled peptide arrays dete...,Investigation of whole genome-derived tiled pe...,Homo sapiens
6,200126109,GSE126109,RNA sequencing analysis of selumetinib-resista...,RNA sequencing analysis to compare parental co...,Homo sapiens
7,200106774,GSE106774,Risk SNPs mediated promoter-enhancer switching...,To determine the functional mechanisms of PCAT...,Homo sapiens
8,200125561,GSE125561,Transcription factor SPIB binding sites identi...,SPIB overexpressed in lung cancer and promoted...,Homo sapiens
9,200117301,GSE117301,The mSWI/SNF ATPase module mediates subcomplex...,Perturbations to mammalian SWI/SNF (mSWI/SNF) ...,Homo sapiens
