In [1]:
import ee
ee.Authenticate()
import pandas as pd
import os
from gee_utils_alt import export_images, wait_on_tasks
import configparser
import math
import time

# Load notebook settings from config.ini (relative to the working directory).
# Later cells read the GCS bucket and export folder from its [GCS] section.
config = configparser.ConfigParser()
# read() returns the list of files it parsed successfully; as the last
# expression of the cell, that list is what gets displayed below.
config.read('config.ini')



['config.ini']

In [2]:
# Initialise the Earth Engine client against the high-volume endpoint, using the given Cloud project.
ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com', project='animated-way-451621-i3')

In [5]:
# Load the cluster table (one row per cluster, with country, year, lat/lon and uid columns).
data_dir = './data'
dhs_cluster_file_path = os.path.join(data_dir, 'senegal_shuffled_coords_imputed_2.csv')
df = pd.read_csv(dhs_cluster_file_path)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,country,survey_start_year,year,lat,lon,GID_1,GID_2,households,rural,iwi,uid,origin_lat,origin_lon
0,senegal,1992,1992,16.043602,-16.487102,SEN.10,SEN.10.3,10,0,29.504925,senegal_1992_00000,16.0333,-16.5
1,senegal,1992,1992,13.782215,-13.678632,SEN.12,SEN.12.4,6,0,31.092883,senegal_1992_00001,13.7833,-13.6667
2,senegal,1992,1992,14.741053,-17.399721,SEN.1,SEN.1.3,16,0,31.332407,senegal_1992_00002,14.75,-17.4
3,senegal,1992,1992,12.4333,-12.0167,SEN.6,SEN.6.1,14,1,11.46875,senegal_1992_00003,12.4333,-12.0167
4,senegal,1992,1992,13.95,-16.1667,SEN.5,SEN.5.2,11,1,15.449591,senegal_1992_00004,13.95,-16.1667


In [9]:
# One (country, year) tuple per survey present in the cluster table.
surveys = list(df.groupby(['country', 'year']).groups)

In [7]:
# Preview the first ten clusters of the Senegal 1992 survey.
df.loc[(df['country'] == 'senegal') & (df['year'] == 1992)].head(10)

Unnamed: 0,country,survey_start_year,year,lat,lon,GID_1,GID_2,households,rural,iwi,uid,origin_lat,origin_lon
0,senegal,1992,1992,16.043602,-16.487102,SEN.10,SEN.10.3,10,0,29.504925,senegal_1992_00000,16.0333,-16.5
1,senegal,1992,1992,13.782215,-13.678632,SEN.12,SEN.12.4,6,0,31.092883,senegal_1992_00001,13.7833,-13.6667
2,senegal,1992,1992,14.741053,-17.399721,SEN.1,SEN.1.3,16,0,31.332407,senegal_1992_00002,14.75,-17.4
3,senegal,1992,1992,12.4333,-12.0167,SEN.6,SEN.6.1,14,1,11.46875,senegal_1992_00003,12.4333,-12.0167
4,senegal,1992,1992,13.95,-16.1667,SEN.5,SEN.5.2,11,1,15.449591,senegal_1992_00004,13.95,-16.1667
5,senegal,1992,1992,16.241217,-16.135265,SEN.10,SEN.10.1,17,1,9.775438,senegal_1992_00005,16.2667,-16.1333
6,senegal,1992,1992,13.5333,-13.5167,SEN.12,SEN.12.4,18,1,28.241736,senegal_1992_00006,13.5333,-13.5167
7,senegal,1992,1992,13.95,-16.1667,SEN.5,SEN.5.2,3,1,27.162133,senegal_1992_00007,13.95,-16.1667
8,senegal,1992,1992,14.33,-16.1667,SEN.5,SEN.5.2,1,1,43.935841,senegal_1992_00008,14.33,-16.1667
9,senegal,1992,1992,14.749187,-17.403344,SEN.1,SEN.1.3,13,0,39.599156,senegal_1992_00009,14.75,-17.4


In [None]:
def test_export(df, country, year):
    """Smoke-test the export pipeline on the first three clusters of one survey.

    Filters ``df`` to the rows matching ``country`` and ``year``, submits
    exports for at most three of them through ``export_images`` (bucket and
    folder come from the ``[GCS]`` section of ``config``) and then hands the
    resulting tasks to ``wait_on_tasks``.

    Args:
        df: Cluster table with at least ``country`` and ``year`` columns.
        country: Value to match in the ``country`` column.
        year: Value to match in the ``year`` column.
    """
    is_survey_row = (df['country'] == country) & (df['year'] == year)
    sample_df = df[is_survey_row].head(3)

    export_options = dict(
        country=country,
        year=year,
        export_folder=config['GCS']['EXPORT_FOLDER'],  # e.g. 'data/dhs_tfrecords_raw'
        export='gcs',
        bucket=config['GCS']['BUCKET'],
        ms_bands=['BLUE', 'GREEN', 'RED', 'NIR', 'SWIR1', 'SWIR2', 'TEMP1'],
        include_nl=True,
        start_year=1990,
        end_year=2020,
        span_length=3,
        chunk_size=5,
    )
    sample_tasks = export_images(sample_df, **export_options)
    wait_on_tasks(sample_tasks, poll_interval=60)


# Try the pipeline on the first survey in the list.
first_country, first_year = surveys[0][0], surveys[0][1]
test_export(df, first_country, first_year)

In [8]:
# Highest task number seen so far for each survey; -1 means no task found yet.
latest_tasks = dict.fromkeys(surveys, -1)

In [None]:
for survey in surveys:
    files_path = f'gs://{config['GCS']['BUCKET']}/{config['GCS']['EXPORT_FOLDER']}/{survey[0]}_{survey[1]}'
    files_in_bucket = !gsutil ls {files_path}*
    if files_in_bucket[-1].startswith(files_path):
        latest_file = files_in_bucket[-1]
        latest_file_nr = int(latest_file[len(files_path)+1:len(files_path)+5])
        latest_tasks[survey] = latest_file_nr

In [None]:
# Show the task number recorded per survey from the bucket listing (-1 = none found).
print('Latest tasks already in bucket:\n', latest_tasks)

In [None]:
# Get task list from GEE
gee_tasks = !earthengine task list

# Loop over these tasks. Save the latest in "last_tasks", if it's higher than what is already in the GCS bucket.
for line in gee_tasks:
    if 'Export.table' in line:
        task = line.split()[2]
        survey_string = task.split('_')[:2]
        survey = (survey_string[0], int(survey_string[1]))
        if survey not in surveys:
            continue
        task_nr = int(task.split('_')[2][:4])
        if task_nr > latest_tasks[survey]:
            latest_tasks[survey] = task_nr

In [None]:
# After the GEE scan, latest_tasks holds per survey the higher of the
# bucket-derived and GEE-derived task numbers.
print('Latest tasks already started in GEE:\n', latest_tasks)

In [10]:
# Submit the exports that have not been started yet, survey by survey,
# and collect the returned tasks in all_tasks.
chunk_size = 50
all_tasks = {}

for survey in surveys:
    last_started = latest_tasks[survey]
    survey_df = df.loc[(df['country'] == survey[0]) & (df['year'] == survey[1])]

    # NOTE(review): this counts chunks of chunk_size locations, while the export
    # log reports one task per location and treats already_in_bucket as location
    # indices - confirm that both use the same unit.
    expected_nr_of_tasks = math.ceil(len(survey_df) / chunk_size)
    if last_started >= expected_nr_of_tasks - 1:
        # Everything expected for this survey has already been started.
        continue

    # Some tasks have not been started yet; start them here.
    country, year = survey
    already_in_bucket = list(range(last_started + 1))
    survey_tasks = export_images(
        survey_df,
        country=country,
        year=year,
        export_folder=config['GCS']['EXPORT_FOLDER'],
        export='gcs',
        bucket=config['GCS']['BUCKET'],
        ms_bands=['BLUE', 'GREEN', 'RED', 'NIR', 'SWIR1', 'SWIR2', 'TEMP1'],
        include_nl=True,
        start_year=1990,
        end_year=2020,
        span_length=3,
        chunk_size=chunk_size,
        already_in_bucket=already_in_bucket,
    )
    all_tasks.update(survey_tasks)

Processing senegal 1992: 50 locations from input DataFrame.
Exporting ONE TFRecord per location.
Skipping location indices based on already_in_bucket: []
Time period: 1990-2020, Composites: 3-year spans.
Including Nighttime Lights...
Creating NL composites: DMSP (pre-2012), VIIRS (2012-onwards)
Combined NL collection size: 138
NL composites created: 11 images.
NL Composites successfully created.
Created Landsat stacked image with 77 bands.
Created NL stacked image with 11 bands.
Created final stacked image with 88 total bands.
--- Processing Location Index: 0 (UID: senegal_1992_00000) ---
Starting export task: Export_senegal_1992_00000 for region: [[[-16.53123151910879, 16.000857912883387], [-16.442829268544248, 16.000857912883387], [-16.442829268544248, 16.086362342713848], [-16.53123151910879, 16.086362342713848], [-16.53123151910879, 16.000857912883387]]]
Task MJ7TRPJ5F4S4G2E6QBQ4DVAB started for location index 0 (UID: senegal_1992_00000).
--- Processing Location Index: 1 (UID: sene

In [11]:
# Wait for every task submitted above, reporting status along the way.
# poll_interval=60 is presumably in seconds - confirm in gee_utils_alt.
wait_on_tasks(all_tasks, poll_interval=60)

Waiting on 600 tasks...
Task Status (2025-05-04 12:59:18): RUNNING: 11, COMPLETED: 60, READY: 529. Remaining: 540
Task Status (2025-05-04 13:01:32): RUNNING: 8, COMPLETED: 25, READY: 507. Remaining: 515
Task Status (2025-05-04 13:03:44): RUNNING: 11, COMPLETED: 21, READY: 483. Remaining: 494
Task Status (2025-05-04 13:05:53): RUNNING: 8, COMPLETED: 25, READY: 461. Remaining: 469
Task Status (2025-05-04 13:08:00): RUNNING: 11, COMPLETED: 20, READY: 438. Remaining: 449
Task Status (2025-05-04 13:10:04): RUNNING: 11, COMPLETED: 18, READY: 420. Remaining: 431
Task Status (2025-05-04 13:12:05): RUNNING: 12, COMPLETED: 17, READY: 402. Remaining: 414
Task Status (2025-05-04 13:14:05): RUNNING: 10, COMPLETED: 21, READY: 383. Remaining: 393
Task Status (2025-05-04 13:16:01): RUNNING: 12, COMPLETED: 19, READY: 362. Remaining: 374
Task Status (2025-05-04 13:17:52): RUNNING: 11, COMPLETED: 23, READY: 340. Remaining: 351
Task Status (2025-05-04 13:19:45): RUNNING: 12, COMPLETED: 15, READY: 324. Rem

## Nighttime lights (NL) only

In [6]:
# Modified loop in get_data.ipynb

# Nighttime-lights-only export: for each survey, export a single composite
# span (the one containing the survey year) instead of the full time range.
chunk_size = 50
all_tasks = {}
span_length = 3         # Number of years covered by one composite span
base_start_year = 1990  # Spans are counted in span_length steps from this year

for survey in surveys:
    country, survey_year = survey

    last_started = latest_tasks[survey]
    survey_df = df.loc[(df['country'] == country) & (df['year'] == survey_year)]

    if survey_df.empty:
        print(f"Skipping survey {survey}: DataFrame is empty.")
        continue

    expected_nr_of_tasks = math.ceil(len(survey_df) / chunk_size)

    if last_started >= expected_nr_of_tasks - 1:
        print(f"Skipping survey {survey}: All expected tasks already started/completed (last_started={last_started}, expected={expected_nr_of_tasks}).")
        continue

    print(f"\nProcessing survey {country} {survey_year}")

    # Snap the survey year down to the first year of its span on the
    # base_start_year grid, e.g. base 1990 / span 3: 1992 -> 1990-1992.
    # To target the span *ending* in the survey year instead, use
    # composite_start_year = survey_year - span_length + 1.
    years_since_base = survey_year - base_start_year
    composite_start_year = survey_year - years_since_base % span_length
    composite_end_year = composite_start_year + span_length - 1
    print(f"  Targeting composite span: {composite_start_year}-{composite_end_year} for survey year {survey_year}")

    already_in_bucket = list(range(last_started + 1))
    print(f"  Starting exports for task indices {last_started + 1} onwards...")

    survey_tasks = export_images(
        survey_df,
        country=country,
        year=survey_year,  # The survey year is passed unchanged; only the composite window is narrowed
        export_folder=config['GCS']['EXPORT_FOLDER'],
        export='gcs',
        bucket=config['GCS']['BUCKET'],
        ms_bands=[],  # No multispectral bands: nighttime lights only
        include_nl=True,
        start_year=composite_start_year,  # Restrict the time range to the single span
        end_year=composite_end_year,
        span_length=span_length,
        chunk_size=chunk_size,
        already_in_bucket=already_in_bucket,
    )
    all_tasks.update(survey_tasks)

# Wait for tasks if any were submitted
# if all_tasks:
#     print(f"\nWaiting for {len(all_tasks)} newly submitted tasks...")
#     wait_on_tasks(all_tasks, poll_interval=60)
# else:
#     print("\nNo new tasks were submitted in this run.")


Processing survey senegal 1992
  Targeting composite span: 1990-1992 for survey year 1992
  Starting exports for task indices 0 onwards...
Processing senegal 1992: 50 locations from input DataFrame.
Exporting ONE TFRecord per location.
Skipping location indices based on already_in_bucket: []
Time period: 1990-1992, Composites: 3-year spans.
Including Nighttime Lights...
Creating NL composites: DMSP (pre-2012), VIIRS (2012-onwards)
Combined NL collection size: 1
NL composites created: 1 images.
NL Composites successfully created.
Created Landsat stacked image with 0 bands.
Created NL stacked image with 1 bands.
Created final stacked image with 1 total bands.
--- Processing Location Index: 0 (UID: senegal_1992_00000) ---
Starting export task: Export_senegal_1992_00000 for region: [[[-16.575646049136378, 15.960024566397163], [-16.42410748563058, 15.960024566397163], [-16.42410748563058, 16.106603593261504], [-16.575646049136378, 16.106603593261504], [-16.575646049136378, 15.9600245663971

In [7]:
# Wait for the NL-only tasks collected in all_tasks, reporting status along the way.
# poll_interval=60 is presumably in seconds - confirm in gee_utils_alt.
wait_on_tasks(all_tasks, poll_interval=60)

Waiting on 600 tasks...
Task Status (2025-05-04 04:46:19): RUNNING: 10, COMPLETED: 227, READY: 363. Remaining: 373
Task Status (2025-05-04 04:48:13): RUNNING: 10, COMPLETED: 74, READY: 289. Remaining: 299
Task Status (2025-05-04 04:49:58): RUNNING: 4, COMPLETED: 68, READY: 227. Remaining: 231
Task Status (2025-05-04 04:51:32): RUNNING: 6, COMPLETED: 60, READY: 165. Remaining: 171
Task Status (2025-05-04 04:52:58): RUNNING: 8, COMPLETED: 51, READY: 112. Remaining: 120
Task Status (2025-05-04 04:54:16): RUNNING: 6, COMPLETED: 48, READY: 66. Remaining: 72
Task Status (2025-05-04 04:55:29): RUNNING: 9, COMPLETED: 43, READY: 20. Remaining: 29
Task Status (2025-05-04 04:56:33): COMPLETED: 29. Remaining: 0
All monitored tasks have finished or encountered errors.
Final Summary (600 tasks submitted in this run):  COMPLETED: 600
