# Script to Get IDs and Summaries

## Get IDs

In [8]:
# Imports
from Bio import Entrez
import csv
import time

# Search the "gds" database and collect up to `max_samples` record IDs.
# Results are left in module-level names (`idlist`, `found_count`,
# `read_count`) for use further down in the script.

# Upper bound on the number of IDs requested (passed to esearch as retmax)
max_samples = 100

# Define email
Entrez.email = "vela.vela.luis@gmail.com" #"karsten.leonhardt@posteo.de"

# Perform search - get handle
handle = Entrez.esearch(
    db="gds",
    term="GSE[ETYP] AND Homo[Organism]",
    usehistory="y",
    retmax=max_samples,
)

# Read results; the handle is closed even if parsing raises, so a failed
# read does not leak the underlying connection
try:
    record = Entrez.read(handle)
finally:
    handle.close()

# Get idlist
idlist = record['IdList']

# 'Count' is the total number of matches; `idlist` holds only the IDs
# actually returned, so the two numbers can differ
found_count = int(record['Count'])
read_count = len(idlist)

# Echo results
print('Total number of FOUND entries: ' + str(found_count))
print('Total number of READ  entries: ' + str(read_count))

Total number of FOUND entries: 46584
Total number of READ  entries: 100


## Get Summaries

In [9]:
# Download the summary of every ID in `idlist` and write the selected
# fields of each one as a row of a CSV file.

# Set name of file
filename = 'summaries.csv'

# Open csv-target file.
# newline='' hands line-ending control to the csv module (without it, blank
# rows appear between records on Windows); an explicit UTF-8 encoding keeps
# non-ASCII text in the written fields from failing under a non-UTF-8 locale.
with open(filename, 'w', newline='', encoding='utf-8') as opened_file:

    # write with writer
    csvwriter = csv.writer(opened_file)

    # Larger candidate field list, kept for reference:
#     fieldnames = ['Item', 'Id', 'Accession', 'GDS', 'title', 
#           'summary', 'GPL', 'GSE', 'taxon', 'entryType', 'gdsType', 
#           'ptechType', 'valType', 'SSInfo', 'subsetInfo', 'PDAT', 
#           'suppFile', 'Samples', 'Relations', 'ExtRelations', 
#           'n_samples', 'SeriesTitle', 'PlatformTitle', 'PlatformTaxa', 
#           'SamplesTaxa', 'PubMedIds', 'Projects', 'FTPLink', 'GEO2R']
    # Fields actually written, in column order
    fieldnames = ['Id', 'Accession', 'title', 'summary', 'taxon']

    # Write the header row
    csvwriter.writerow(fieldnames)

    # Begin retrieval
    for i, e_id in enumerate(idlist):

        # Echo info
        print("Going to download record: {:10.0f} ({:5.1f}%)".format(int(e_id), (i+1)/read_count*100))

        # Avoid HTTP error (max 3 queries per second ?):
        # pause for one second before every third request
        if i % 3 == 0:
            time.sleep(1)

        # Get Summary
        handle = Entrez.esummary(db="gds", id=e_id)

        try:
            # Read handle
            data = Entrez.read(handle)

            # Pick the selected fields from the first (index 0) entry of the
            # parsed result, in header order
            list_to_print = [data[0][name] for name in fieldnames]
        finally:
            # Close handle, also when reading or field lookup fails
            handle.close()

        # Write the line
        csvwriter.writerow(list_to_print)

Going to download record:  200112120 (  1.0%)
Going to download record:  200117735 (  2.0%)
Going to download record:  200117734 (  3.0%)
Going to download record:  200128119 (  4.0%)
Going to download record:  200126367 (  5.0%)
Going to download record:  200128077 (  6.0%)
Going to download record:  200126109 (  7.0%)
Going to download record:  200106774 (  8.0%)
Going to download record:  200125561 (  9.0%)
Going to download record:  200117301 ( 10.0%)
Going to download record:  200117311 ( 11.0%)
Going to download record:  200112164 ( 12.0%)
Going to download record:  200120993 ( 13.0%)
Going to download record:  200124453 ( 14.0%)
Going to download record:  200106592 ( 15.0%)
Going to download record:  200115197 ( 16.0%)
Going to download record:  200115196 ( 17.0%)
Going to download record:  200116984 ( 18.0%)
Going to download record:  200107387 ( 19.0%)
Going to download record:  200111073 ( 20.0%)
Going to download record:  200117145 ( 21.0%)
Going to download record:  2001280