In [1]:
import os
import sys
import math
import torch
import numpy as np
import matplotlib as mtp
import matplotlib.pyplot as plt
import multiprocessing as mp
import pandas as pd

In [2]:
# Root folder holding every dataset; the PASTIS files live in a sub-folder of it.
datasets_dir = "/homeLocal/jpulzdeoliveira/datasets"
pastis_dir = os.path.join(datasets_dir, "PASTIS")

In [3]:
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    backend_device = "cuda"
else:
    backend_device = "cpu"
print(f"Using {backend_device} as backend device")

Using cuda as backend device


In [4]:
import geopandas as gpd
import json
from torch.utils.data import Dataset, random_split    

def pool_processing(args):
    """Worker: copy the selected timestamps of each patch into the output folder.

    Parameters
    ----------
    args : tuple
        ``(params, selection)`` packed into one object so the function can be
        used with ``Pool.map``.
        ``params`` is a dict with the keys ``'root'`` (folder containing the
        ``DATA_S2`` directory) and ``'out_folder'`` (destination folder, which
        must already exist).
        ``selection`` is a two-column DataFrame whose rows are
        ``(patch id, list of indices to keep)``.

    For every row, ``<root>/DATA_S2/S2_<id>.npy`` is loaded, indexed along its
    first axis with the listed indices, converted to float32 and saved as
    ``<out_folder>/S2_<id>.npy``. Nothing is returned.
    """
    params, selection = args
    source_dir = os.path.join(params['root'], "DATA_S2")

    # Plain tuples avoid building an intermediate Series per row.
    for patch_id, timestamps in selection.itertuples(index=False, name=None):
        series = np.load(os.path.join(source_dir, f"S2_{patch_id}.npy"))
        # Select first, cast afterwards: the result is identical, but frames
        # that are about to be discarded are never converted to float32.
        kept = series[timestamps].astype(np.float32)
        np.save(os.path.join(params['out_folder'], f"S2_{patch_id}"), kept)

def drop_clouds(root, cloud_analisys, out_folder, cloud_treshold=0.25, num_threads=-1):
    """Write a copy of every patch series that keeps only its low-cloud timestamps.

    Parameters
    ----------
    root : str
        Dataset folder; the patch arrays are read from ``<root>/DATA_S2``.
    cloud_analisys : str
        Path of the cloud-analysis table, read with ``gpd.read_file``. Its
        first column holds names of the form ``<prefix>_<patch id>`` and it
        has ``timestamp`` and ``cloud_percentage`` columns.
    out_folder : str
        Destination folder for the filtered ``S2_<id>.npy`` files. It is
        created if it does not exist.
    cloud_treshold : float, default 0.25
        Rows whose ``cloud_percentage`` is greater than or equal to this value
        are discarded.
    num_threads : int, default -1
        Number of worker processes; ``-1`` uses ``mp.cpu_count()``. The value
        is clamped to the range ``[1, number of selected patches]``.

    Side effects
    ------------
    Writes ``cloud_aux.csv`` (patch id and number of kept timestamps) in the
    current working directory and one ``.npy`` file per selected patch in
    ``out_folder``. Returns ``None``.
    """
    print("Reading cloud analisys...")
    cloud_ta = gpd.read_file(cloud_analisys)
    cloud_pct = cloud_ta.cloud_percentage.astype(float)
    # Same selection as dropping every row with cloud_percentage >= threshold,
    # but without mutating the frame in place.
    kept_rows = cloud_ta.loc[~(cloud_pct >= cloud_treshold)]

    # The rows below the threshold are the ones that are kept.
    print(f"Keeping timestamps with c_per < {cloud_treshold}...")
    selected_tmps = {}
    patch_names = kept_rows.iloc[:, 0]
    timestamps = kept_rows.timestamp.astype(int).tolist()
    for patch_name, timestamp in zip(patch_names, timestamps):
        patch_id = patch_name.split('_')[1]
        selected_tmps.setdefault(patch_id, []).append(timestamp)

    patch_ids = list(selected_tmps)
    kept_lists = [selected_tmps[patch_id] for patch_id in patch_ids]

    pd.DataFrame({
        'id': patch_ids,
        'tmps_count': [len(kept) for kept in kept_lists],
    }).to_csv("cloud_aux.csv", index=False)
    selected_df = pd.DataFrame({
        'id': patch_ids,
        'timestamps': kept_lists,
    })

    num_processes = mp.cpu_count() if num_threads == -1 else num_threads
    # Pool(0) raises ValueError, and workers beyond the number of patches
    # would only receive empty chunks.
    num_processes = max(1, min(num_processes, len(selected_df)))

    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes path in
    # recent pandas releases.
    positions = np.array_split(np.arange(len(selected_df)), num_processes)
    chuncks = [selected_df.iloc[rows] for rows in positions]

    # np.save does not create missing directories.
    os.makedirs(out_folder, exist_ok=True)

    print("Pool processing...")
    params = {
        'root': root,
        'out_folder': out_folder,
    }
    with mp.Pool(num_processes) as pool:
        pool.map(pool_processing, [(params, chunck) for chunck in chuncks])

    print("Done")

In [5]:
# Filter the PASTIS series using roughly 80% of the available cores.
drop_clouds(
    root=pastis_dir,
    cloud_analisys="./pastis_cloud_analysis.csv",
    out_folder="pastis_cloud_refined",
    num_threads=(4 * mp.cpu_count()) // 5,
)

Reading cloud analisys...


ERROR 1: PROJ: proj_create_from_database: Open of /opt/conda/share/proj failed


Dropping timestamps with c_per < 0.25...
Pool processing...


  return bound(*args, **kwds)


Done
