# gsc-scrape: example usage

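**NOTE**: `target_url` is intentionally left as a placeholder (`[URL_STRING]`) in this example. Replace it with the URL of the target webpage before running; you should already know what it is.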

In [1]:
from gscscr.odor_data_coll import OdorDataCollector
from gscscr import utils

# Settings shared by every OdorDataCollector created in this notebook
# (passed to the constructor as **config).
config = {
    'target_url': '[URL_STRING]',  # target webpage (placeholder: fill in before running)
    'file_prefix': 'gsc_', # prefix for the list / summary CSV files (per-molecule files are not prefixed)
    'out_dir': '_test/', # local output path
    'talkative': True  # presumably enables progress messages -- confirm in OdorDataCollector
    }

## demo for a specific list

NOTE: `test_run` is a safety pin here. Test first with the option turned on (`test_run=True`, as provided); as the sample output below shows, the run then stops after the first molecule. Once that works, turn it off (`test_run=False`) for a full run.

In [2]:
list_num = 1

# create a data collector for this list
test_set = OdorDataCollector(list_num=list_num, **config)

In [3]:
# you can just retrieve the list of molecules from the database
_ = test_set.get_list_db()

CAS Number Listing : Starting with 50-00-0 to 123-99-9
file saved to: _test/gsc_list1.csv


In [4]:
# and/or go through all molecules in the list
# (turn off test_run to run through all molecules)
test_set.get_odor_info_from_list(test_run=True)

file already exists: _test/gsc_list1.csv
running through all 1728 molecules in the list ...
formaldehyde
time for webpage retrieval: 0.6137678623199463
Saved to: _test/list1/list1_mol0_rw1247381.txt
stopped by test_run option


In [5]:
# optionally, extract a fuller version of the list for further reference
_ = test_set.get_list_db(full_list=True)

CAS Number Listing : Starting with 50-00-0 to 123-99-9
file saved to: _test/gsc_list1_full.csv


## loop over all lists

Now the actual work. A 5x wait time is already implemented (after each page is scraped, the scraper waits for 5x the time that page took to retrieve), but always try to be gentle to the target server...

Again `test_run` is our safety pin. Turn it off when you are ready.

In [7]:
# --- loop over all lists
# NOTE: list_num_iter is reused by the summary and merge cells further down.
list_num_iter = range(1,13) # from list 1 to 12

for list_num in list_num_iter:
    # One collector per list, all sharing the same config.
    test_set = OdorDataCollector(list_num=list_num, **config)
    # With test_run=True the sample output shows the run stopping after the
    # first molecule of each list; set test_run=False for a full scrape.
    test_set.get_odor_info_from_list(test_run=True)

file already exists: _test/gsc_list1.csv
running through all 1728 molecules in the list ...
formaldehyde
list file already exists:_test/list1/list1_mol0_rw1247381.txt
stopped by test_run option
CAS Number Listing : Starting with 124-04-9 to 764-38-5
file saved to: _test/gsc_list2.csv
running through all 2353 molecules in the list ...
adipic acid
time for webpage retrieval: 0.7122130393981934
Saved to: _test/list2/list2_mol0_rw1001801.txt
stopped by test_run option
CAS Number Listing : Starting with 764-39-6 to 3623-52-7
file saved to: _test/gsc_list3.csv
running through all 2867 molecules in the list ...
pentenal
time for webpage retrieval: 0.7893209457397461
Saved to: _test/list3/list3_mol0_rw1457931.txt
stopped by test_run option
CAS Number Listing : Starting with 3632-91-5 to 7784-26-1
file saved to: _test/gsc_list4.csv
running through all 2289 molecules in the list ...
magnesium gluconate anhydrous
time for webpage retrieval: 0.4950840473175049
Saved to: _test/list4/list4_mol0_rw13

## once scraping is done, merge

In [8]:
def generate_dataset_summary_files(config, list_num_iter, talkative=True):
    """Build the per-list summaries and write one table of counts for all lists.

    For each list number an ``OdorDataCollector`` is created from ``config``
    and its ``get_list_summary_count()`` is called; the returned counts are
    collected and written to ``all_counts.csv`` inside
    ``config['summary_dir']``.

    Args:
        config: Keyword arguments for ``OdorDataCollector``; must also
            contain ``'summary_dir'``.
        list_num_iter: Iterable of list numbers to process.
        talkative: If true, print the path of the file that was written.

    Returns:
        A tuple ``(allcounts, header)``: the collected per-list counts and
        the header returned for the last list processed.

    Raises:
        ValueError: If ``list_num_iter`` yields no list numbers (there would
            be no header to write).
        KeyError: If ``config`` has no ``'summary_dir'`` entry.
    """
    import os  # local import: keeps this notebook cell self-contained

    allcounts = []
    header = None
    for list_num in list_num_iter:
        odc = OdorDataCollector(list_num=list_num, **config)
        count_mylist, header = odc.get_list_summary_count() # writes out_list file
        allcounts.append(count_mylist)

    # Without at least one list there is no header; fail with a clear message
    # instead of an UnboundLocalError further down.
    if not allcounts:
        raise ValueError('list_num_iter is empty: no lists to summarize')

    # also write a small table with molecule all_counts
    # (os.path.join tolerates a summary_dir with or without trailing separator)
    acfilename = os.path.join(config['summary_dir'], 'all_counts.csv')
    utils.write_csv(acfilename, allcounts, header=header)
    if talkative:
        print('all-counts summary saved to: ' + acfilename)
    return allcounts, header

def merge_summary_files(config, list_num_iter, talkative=True):
    """Merge the per-list summaries into a single CSV file.

    For each list number an ``OdorDataCollector`` is created from ``config``
    and its ``roll_summary(cnt)`` is called. The returned rows are
    concatenated and written to ``<file_prefix>out_merged.csv`` inside
    ``config['summary_dir']``.

    Args:
        config: Keyword arguments for ``OdorDataCollector``; must also
            contain ``'summary_dir'``. ``'file_prefix'`` is optional and
            defaults to an empty prefix for the merged file name.
        list_num_iter: Iterable of list numbers to merge, in output order.
        talkative: If true, print the path of the file that was written.

    Returns:
        None. The merged table is only written to disk.

    Raises:
        ValueError: If ``list_num_iter`` yields no list numbers (there would
            be no header to write).
        KeyError: If ``config`` has no ``'summary_dir'`` entry.
    """
    import os  # local import: keeps this notebook cell self-contained

    merged_list = []
    header = None
    processed_any = False
    # cnt is threaded through roll_summary from one list to the next
    # (presumably a running row counter -- confirm in OdorDataCollector).
    cnt = 0
    for list_num in list_num_iter:
        odc = OdorDataCollector(list_num=list_num, **config)
        merged_list_sub, header, cnt = odc.roll_summary(cnt)
        merged_list.extend(merged_list_sub)
        processed_any = True

    # Without at least one list there is no header; fail with a clear message
    # instead of an UnboundLocalError further down.
    if not processed_any:
        raise ValueError('list_num_iter is empty: no lists to merge')

    # save to a separate file
    # (os.path.join tolerates a summary_dir with or without trailing separator)
    mrgfilename = os.path.join(
        config['summary_dir'],
        config.get('file_prefix', '') + 'out_merged.csv',
    )
    utils.write_csv(mrgfilename, merged_list, header=header)
    if talkative:
        print('file saved to: ' + mrgfilename)

In [9]:
# add summary_dir to config
config['summary_dir'] = '_test/summary/'

# generate a summary file for each list
allcounts, header = generate_dataset_summary_files(config, list_num_iter)

file already exists: _test/gsc_list1_full.csv
file saved to: _test/summary/gsc_out_list1.csv
file already exists: _test/gsc_list2_full.csv
file saved to: _test/summary/gsc_out_list2.csv
file already exists: _test/gsc_list3_full.csv
file saved to: _test/summary/gsc_out_list3.csv
file already exists: _test/gsc_list4_full.csv
file saved to: _test/summary/gsc_out_list4.csv
file already exists: _test/gsc_list5_full.csv
file saved to: _test/summary/gsc_out_list5.csv
file already exists: _test/gsc_list6_full.csv
file saved to: _test/summary/gsc_out_list6.csv
file already exists: _test/gsc_list7_full.csv
file saved to: _test/summary/gsc_out_list7.csv
file already exists: _test/gsc_list8_full.csv
file saved to: _test/summary/gsc_out_list8.csv
file already exists: _test/gsc_list9_full.csv
file saved to: _test/summary/gsc_out_list9.csv
file already exists: _test/gsc_list10_full.csv
file saved to: _test/summary/gsc_out_list10.csv
file already exists: _test/gsc_list11_full.csv
file saved to: _test/

In [10]:
# finally, generate an all-odor summary for this database
merge_summary_files(config, list_num_iter)

file saved to: _test/summary/gsc_out_merged.csv
