# Accessing and Downloading our MUSA650 Final Project Data from Radiant MLHub
**Kristin Chang & Jenna Epstein**

Note: this notebook requires the Radiant Earth MLHub Python client (install it with `pip install radiant-mlhub`).

In [10]:
import os
import tarfile
from getpass import getpass
from glob import glob
from io import BytesIO
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL as pil
import tifffile as tiff
from radiant_mlhub import Dataset, Collection, client, get_session
from tqdm.notebook import tqdm

In [11]:
# MLHub API key.
# The key is a credential, so it is never written into the notebook: it is
# taken from the MLHUB_API_KEY environment variable and, when that variable is
# unset or empty, requested through a hidden prompt.
# NOTE(review): a key was previously hardcoded in this cell; treat it as exposed
# and revoke/regenerate it - confirm with the key owner.
if not os.environ.get('MLHUB_API_KEY'):
    os.environ['MLHUB_API_KEY'] = getpass('MLHub API key: ')

In [12]:
# Fetch the dataset record from MLHub
dataset = Dataset.fetch('rti_rwanda_crop_type')

# Summarise the dataset: its id, its title, and one bullet per collection id
print(f'ID: {dataset.id}')
print(f'Title: {dataset.title}')
print('Collections:')
for member in dataset.collections:
    print(f'* {member.id}')

ID: rti_rwanda_crop_type
Title: Drone Imagery Classification Training Dataset for Crop Types in Rwanda
Collections:
* rti_rwanda_crop_type_labels
* rti_rwanda_crop_type_source
* rti_rwanda_crop_type_raw


The three collections associated with this dataset are:

* rti_rwanda_crop_type_source: includes the image inputs
* rti_rwanda_crop_type_labels: includes the labels for the images
* rti_rwanda_crop_type_raw: raw data


## Downloading Labels and Source Imagery

In [13]:
# Root folder for every download, stored as an absolute path.
# strict=False (the default) means the folder does not have to exist yet.
output_path = Path("./data/").resolve(strict=False)

In [14]:
# Fetch the labels collection and download its archive into output_path.
# Per the original note, the download raises if the archive already exists.
collection = Collection.fetch('rti_rwanda_crop_type_labels')
collection.download(str(output_path))

WindowsPath('C:/Users/jenna/Documents/MCP/Spring_2022/MUSA650_RemoteSensing/Final/MUSA650_Final_ChangEpstein/data/rti_rwanda_crop_type_labels.tar.gz')

In [15]:
# Fetch the source-imagery collection and download its archive into the
# "Data" sub-folder of output_path.
# Per the original note, the download raises if the archive already exists.
collection = Collection.fetch('rti_rwanda_crop_type_source')
source_download_dir = f"{output_path}/Data"
collection.download(source_download_dir)

WindowsPath('C:/Users/jenna/Documents/MCP/Spring_2022/MUSA650_RemoteSensing/Final/MUSA650_Final_ChangEpstein/data/Data/rti_rwanda_crop_type_source.tar.gz')

In [None]:
# Extract images and labels into <output_path>/Extracted
import shutil

# The two archives live in different folders: the source imagery was downloaded
# into the "Data" sub-folder of output_path, while the labels were downloaded
# directly into output_path. Each archive is unpacked from where it was saved.
shutil.unpack_archive(f"{output_path}/Data/rti_rwanda_crop_type_source.tar.gz", f"{output_path}/Extracted")
shutil.unpack_archive(f"{output_path}/rti_rwanda_crop_type_labels.tar.gz", f"{output_path}/Extracted")

In [8]:
import os  # file-system access for the cells below

# Folders created by unpacking the two archives
sources_path = f"{output_path}/Extracted/rti_rwanda_crop_type_source"  # source image files
labels_path = f"{output_path}/Extracted/rti_rwanda_crop_type_labels"  # label files

In [9]:
# Collect the folder of every source image (one entry per folder holding a PNG)
source_images = []
for root, dirs, files in os.walk(sources_path):
    # os.walk yields entries in an OS-dependent order. Sorting the sub-folder
    # list in place makes the traversal (and therefore the row numbers later
    # used as image file names) the same on every machine.
    dirs.sort()
    # Record the folder once, even if it contains several PNG files, so the
    # same image is never listed (and later copied) twice.
    if any(file.endswith(".png") for file in files):
        source_images.append(root)

In [10]:
import pandas as pd

# One row per source-image folder, held in a single "source_file" column
dataset_df = pd.DataFrame(source_images, columns=["source_file"])
dataset_df.head()

Unnamed: 0,source_file
0,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...
1,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...
2,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...
3,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...
4,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...


In [11]:
# Show the source_file value stored in the first row
dataset_df.loc[0, "source_file"]

'C:\\Users\\jenna\\Documents\\MCP\\Spring_2022\\MUSA650_RemoteSensing\\Final\\MUSA650_Final_ChangEpstein\\data/Extracted/rti_rwanda_crop_type_source\\rti_rwanda_crop_type_source_0'

In [12]:
import json


def get_label(source_file, labels_path):
    """Return the label stored for one source-image folder.

    The image id is the text after the last underscore of ``source_file``; the
    label is read from
    ``<labels_path>/rti_rwanda_crop_type_labels_<id>/labels.json``.

    Parameters
    ----------
    source_file : str or path-like
        Path of the source-image folder, ending in ``_<id>``. A trailing path
        separator is tolerated.
    labels_path : str or path-like
        Folder that contains the extracted label folders.

    Returns
    -------
    The value stored under the ``"label"`` key of the matching labels.json.

    Raises
    ------
    FileNotFoundError
        If no labels.json exists for the derived image id.
    KeyError
        If the JSON document has no ``"label"`` key.
    """
    # Drop trailing separators so "…_12/" and "…_12" give the same id.
    folder = str(source_file).rstrip("/\\")
    image_id = folder.rsplit("_", 1)[-1]
    label_file = f"{labels_path}/rti_rwanda_crop_type_labels_{image_id}/labels.json"
    # Read as UTF-8 explicitly rather than with the platform's default encoding.
    with open(label_file, encoding="utf-8") as file:
        return json.load(file)["label"]

In [13]:
# Look up the label of every source image and store it in a new "label" column
dataset_df["label"] = dataset_df["source_file"].apply(
    lambda source_dir: get_label(source_dir, labels_path)
)

In [14]:
# Preview the first five rows, now including the label column
dataset_df.head(5)

Unnamed: 0,source_file,label
0,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,banana
1,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,maize
2,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,legumes
3,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,legumes
4,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,structure


In [16]:
# output_path is a Path object, which cannot be joined to strings with "+".
# Keep a plain-string copy (output_path_full) for the string concatenation below.
output_path_full = str(output_path)

In [17]:
# Create a target folder for each label under <output_path_full>/Images.
# The keywords below are the label classes.
keys = (
    "banana",
    "maize",
    "other",
    "forest",
    "legumes",
    "structure"
)
# exist_ok=True creates each folder only when it is missing, so the cell can be
# re-run safely without a separate "does it exist?" check.
os.makedirs(f"{output_path_full}/Images/", exist_ok=True)
for key in keys:
    os.makedirs(output_path_full + '/Images/' + key, exist_ok=True)

In [18]:
# Copy each source image into the folder named after its label
for key in keys:
    # reset_index() keeps the row's original dataset_df index in an "index"
    # column; that number becomes the name of the copied file.
    key_df = dataset_df.loc[dataset_df["label"] == key].reset_index()
    for image_number, source_dir in zip(key_df["index"], key_df["source_file"]):
        shutil.copy(f"{source_dir}/Image.png", f"{output_path_full}/Images/{key}/{image_number}.png")

In [19]:
# key_df still holds the rows of the last label processed by the copy loop;
# its "index" column shows the numbers used as file names.
key_df.head(5)

Unnamed: 0,index,source_file,label
0,4,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,structure
1,5,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,structure
2,8,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,structure
3,9,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,structure
4,10,C:\Users\jenna\Documents\MCP\Spring_2022\MUSA6...,structure


In [20]:
# source_file holds the image's folder, not the image itself, which is why
# the copy step appends "/Image.png" to it.
key_df.loc[0, "source_file"]

'C:\\Users\\jenna\\Documents\\MCP\\Spring_2022\\MUSA650_RemoteSensing\\Final\\MUSA650_Final_ChangEpstein\\data/Extracted/rti_rwanda_crop_type_source\\rti_rwanda_crop_type_source_1000'

In [22]:
# Root folder of the per-label image folders on the local system
data_root = f"{output_path_full}/Images"

In [24]:
# List the entries of the Images folder - one folder per class
selectedClasses = os.listdir(data_root)

# Show the class names found
print(selectedClasses)

['banana', 'forest', 'legumes', 'maize', 'other', 'structure']
