## In this notebook

- Check metadata.
- Unify metadata and downloaded files: add the paths of the feature and label files to the metadata and save the result.

In [1]:
import os

# analytics
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np

# plot
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
# Root folder with all data files. Every other path is derived from it, so the
# notebook can be pointed at another location by changing this single line.
DATA_FOLDER = "../data"

# Features metadata: the raw input file and the processed file written at the end.
FEATURES_METADATA_FILEPATH = f"{DATA_FOLDER}/features_metadata.csv"
FEATURES_METADATA_PROCESSED_FILEPATH = f"{DATA_FOLDER}/features_metadata_processed.csv"

# Train split: feature files, labels metadata, and label files.
TRAIN_FEATURES_FOLDER = f"{DATA_FOLDER}/train_features/"
TRAIN_LABELS_METADATA_FILEPATH = f"{DATA_FOLDER}/train_agbm_metadata.csv"
TRAIN_LABELS_FOLDER = f"{DATA_FOLDER}/train_agbm/"

# Test split: feature files.
TEST_FEATURES_FOLDER = f"{DATA_FOLDER}/test_features/"

In [3]:
# list the contents of the data folder ($DATA_FOLDER is expanded from the Python variable)
! ls -l $DATA_FOLDER

total 124176
-rw-r--r--     1 nobody nogroup 83745777 Oct 25 17:59 features_metadata.csv
-rw-r--r--     1 nobody nogroup 40456802 Dec  4 20:07 features_metadata_processed.csv
drwxr-xr-x     2 nobody nogroup       64 Dec  3 10:05 test_agbm
drwxr-xr-x 63350 nobody nogroup  2027200 Dec  2 21:22 test_features
drwxr-xr-x  8691 nobody nogroup   278112 Dec  2 20:10 train_agbm
-rw-r--r--     1 nobody nogroup  2448108 Oct 24 20:44 train_agbm_metadata.csv
drwxr-xr-x 65535 nobody nogroup  6050560 Dec  3 00:56 train_features


## Load metadata

In [4]:
%%time

# metadata columns that are dropped right after loading
unused_columns = ["size", "cksum", "s3path_us", "s3path_eu", "s3path_as"]

# read the features metadata and give the label column a shorter name
df = (
    pd.read_csv(FEATURES_METADATA_FILEPATH)
    .drop(columns=unused_columns)
    .rename(columns={"corresponding_agbm": "label_filename"})
)

df.tail()

CPU times: user 716 ms, sys: 77.3 ms, total: 794 ms
Wall time: 793 ms


Unnamed: 0,filename,chip_id,satellite,split,month,label_filename
252421,fff812c0_S2_07.tif,fff812c0,S2,test,April,fff812c0_agbm.tif
252422,fff812c0_S2_08.tif,fff812c0,S2,test,May,fff812c0_agbm.tif
252423,fff812c0_S2_09.tif,fff812c0,S2,test,June,fff812c0_agbm.tif
252424,fff812c0_S2_10.tif,fff812c0,S2,test,July,fff812c0_agbm.tif
252425,fff812c0_S2_11.tif,fff812c0,S2,test,August,fff812c0_agbm.tif


## Check metadata

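- Everything seems to be correct: the number of metadata rows per split and the number of unique train labels match the number of downloaded files.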

In [5]:
# number of metadata rows in each split

df["split"].value_counts()

train    189078
test      63348
Name: split, dtype: int64

In [6]:
# number of distinct label files referenced in each split

df.groupby(["split"])["label_filename"].nunique()

split
test     2773
train    8689
Name: label_filename, dtype: int64

In [7]:
# number of actually downloaded data is the same as expected

downloaded_train_features = os.listdir(TRAIN_FEATURES_FOLDER)
downloaded_test_features = os.listdir(TEST_FEATURES_FOLDER)
downloaded_train_labels = os.listdir(TRAIN_LABELS_FOLDER)

print("Number of downloaded data files:")
print(f"- train features: {len(downloaded_train_features):,.0f}")
print(f"- test features: {len(downloaded_test_features):,.0f}")
print(f"- train lables: {len(downloaded_train_labels):,.0f}")

# expected number of files according to the metadata
metadata_is_train = df["split"] == "train"
expected_train_features = df.loc[metadata_is_train, "filename"].nunique()
expected_test_features = df.loc[~metadata_is_train, "filename"].nunique()
expected_train_labels = df.loc[metadata_is_train, "label_filename"].nunique()

# fail loudly instead of relying on a visual comparison of the printed numbers:
# the processed metadata would otherwise reference files that are missing on disk
assert len(downloaded_train_features) == expected_train_features, (
    f"train features: found {len(downloaded_train_features)} files, "
    f"metadata lists {expected_train_features}"
)
assert len(downloaded_test_features) == expected_test_features, (
    f"test features: found {len(downloaded_test_features)} files, "
    f"metadata lists {expected_test_features}"
)
assert len(downloaded_train_labels) == expected_train_labels, (
    f"train labels: found {len(downloaded_train_labels)} files, "
    f"metadata lists {expected_train_labels}"
)

Number of downloaded data files:
- train features: 189,078
- test features: 63,348
- train lables: 8,689


## Process metadata

In [8]:
# add the feature and label file paths (under /usr/src/app/data) to every metadata row

train_mask = df["split"].eq("train")

output_columns = [
    "filename", "filepath", "label_filename", "label_filepath", "chip_id", "satellite", "split", "month",
]

df = (
    df
    .assign(
        # train rows point to the train folders, all other rows to the test folders
        filepath=np.where(
            train_mask,
            "/usr/src/app/data/train_features/" + df["filename"],
            "/usr/src/app/data/test_features/" + df["filename"],
        ),
        label_filepath=np.where(
            train_mask,
            "/usr/src/app/data/train_agbm/" + df["label_filename"],
            "/usr/src/app/data/test_agbm/" + df["label_filename"],
        ),
    )
    .loc[:, output_columns]
)

df.tail()

Unnamed: 0,filename,filepath,label_filename,label_filepath,chip_id,satellite,split,month
252421,fff812c0_S2_07.tif,/usr/src/app/data/test_features/fff812c0_S2_07...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,April
252422,fff812c0_S2_08.tif,/usr/src/app/data/test_features/fff812c0_S2_08...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,May
252423,fff812c0_S2_09.tif,/usr/src/app/data/test_features/fff812c0_S2_09...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,June
252424,fff812c0_S2_10.tif,/usr/src/app/data/test_features/fff812c0_S2_10...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,July
252425,fff812c0_S2_11.tif,/usr/src/app/data/test_features/fff812c0_S2_11...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,August


## Save metadata

In [9]:
# write the processed metadata without the index column
df.to_csv(path_or_buf=FEATURES_METADATA_PROCESSED_FILEPATH, index=False)