In [1]:
import pandas as pd

In [2]:
import sys
sys.path.append('..')

from marsbench.data.classification.BaseClassificationDataset import BaseClassificationDataset
from marsbench.utils.transforms import get_transforms

In [None]:
from hydra import compose, initialize_config_dir
import os

# Dataset whose `data` config is selected for the cells that follow.
data = 'DeepMars_Landmark'

# Resolve the config directory (one level up) to an absolute path.
config_dir = os.path.abspath('../configs')

# Compose the root `config`, overriding `data` with the lower-cased dataset name.
with initialize_config_dir(config_dir=config_dir, version_base='1.1'):
    cfg = compose(
        config_name='config',
        overrides=[f"data={data.lower()}"],
    )

In [None]:
# Create the annotation CSV for DeepMars_Landmark from the whitespace-separated
# "<image_path> <label>" listing referenced by the config.
txt_file = cfg.data.txt_file
with open(txt_file, "r", encoding="utf-8") as handle:
    # Skip blank lines: an empty line would otherwise become an all-None row
    # and break the integer cast of the label column below.
    rows = [line.split() for line in handle if line.strip()]
# Passing the column names here also keeps an empty listing well-formed.
df = pd.DataFrame(rows, columns=['image_path', 'label'])
# Labels are read as strings; remap label '6' to '5' before casting to int.
df['label'] = df['label'].replace('6', '5').astype(int)
df.to_csv(cfg.data.annot_csv, index=False)

In [5]:
from typing import List, Optional, Tuple, Union, Literal
import os
import torch

class DeepMars_Landmark(BaseClassificationDataset):
    """
    DeepMars_Landmark classification dataset.

    Source: https://zenodo.org/records/1048301

    The whole annotation CSV is read, and the rows belonging to ``split`` are
    picked by ``self.determine_data_splits`` (not defined in this class;
    presumably provided by the base class).
    """

    def __init__(
        self,
        cfg,
        data_dir,
        transform,
        annot_csv: Union[str, os.PathLike],
        split: Literal['train', 'val', 'test'] = 'train',
        generator: Optional[torch.Generator] = None,
    ):
        """
        Args:
            cfg: Config object; ``cfg.seed`` seeds the default generator.
            data_dir: Forwarded unchanged to the base initialiser.
            transform: Forwarded unchanged to the base initialiser.
            annot_csv: CSV with ``image_path`` and ``label`` columns.
            split: Which split's indices to request.
            generator: Optional generator used for the split; when omitted a
                fresh one seeded with ``cfg.seed`` is created.
        """
        self.cfg = cfg
        self.annot = pd.read_csv(annot_csv)

        if generator is None:
            generator = torch.Generator().manual_seed(cfg.seed)

        # The indices must exist before the base initialiser runs.
        self.indices = self.determine_data_splits(len(self.annot), generator, split)
        super().__init__(cfg, data_dir, transform)

    def _load_data(self) -> Tuple[List[str], List[int]]:
        """Return ``(image_paths, labels)`` for the rows selected by ``self.indices``."""
        if self.indices is None:
            rows = self.annot
        else:
            rows = self.annot.iloc[self.indices]

        paths = rows['image_path'].astype(str).tolist()
        targets = rows['label'].astype(int).tolist()
        return paths, targets


In [None]:
# Build the train and test transforms from the composed config.
train_transform, test_transform = get_transforms(cfg)

In [7]:
# Instantiate the DeepMars_Landmark training split with the training transform.
train_dataset = DeepMars_Landmark(cfg, cfg.data.data_dir, train_transform, cfg.data.annot_csv, split='train')

In [10]:
from hydra import compose, initialize_config_dir
import os

# Switch the active dataset to DeepMars_Surface for the cells that follow.
data = 'DeepMars_Surface'

# Absolute path of the config directory one level up.
config_dir = os.path.abspath('../configs')

# Re-compose the root `config` with `data` overridden by the lower-cased name.
with initialize_config_dir(config_dir=config_dir, version_base='1.1'):
    cfg = compose(
        config_name='config',
        overrides=[f"data={data.lower()}"],
    )

In [None]:
# Display the txt_files entry of the data config; the next cell iterates it per split.
cfg.data.txt_files

In [14]:
# Build a single annotation CSV for DeepMars_Surface from the per-split
# "<image_path> <label>" listings, tagging every row with its split name.
split_frames = []
for split in cfg.data.txt_files:
    with open(cfg.data.txt_files.get(split), "r", encoding="utf-8") as handle:
        # Skip blank lines so they do not turn into all-None rows that break
        # the integer cast of the label column below.
        rows = [line.split() for line in handle if line.strip()]
    df_split = pd.DataFrame(rows, columns=['image_path', 'label'])
    df_split['split'] = split
    split_frames.append(df_split)

# Concatenate once instead of growing the frame from an empty DataFrame inside
# the loop; pd.concat rejects an empty list, so fall back to an empty frame.
if split_frames:
    df = pd.concat(split_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['image_path', 'label', 'split'])

# Labels are still strings here: remap '23' -> '22' and '24' -> '23', then cast.
df['label'] = df['label'].replace({'23':'22', '24':'23'}).astype(int)
df.to_csv(cfg.data.annot_csv, index=False)

In [17]:
class DeepMars_Surface(BaseClassificationDataset):
    """
    DeepMars_Surface classification dataset.

    Source: https://zenodo.org/records/1049137

    Samples are the rows of the annotation CSV whose ``split`` column equals
    the requested split.
    """

    def __init__(
        self,
        cfg,
        data_dir,
        transform,
        annot_csv: Union[str, os.PathLike],
        split: Literal['train', 'val', 'test'] = 'train',
    ):
        """
        Args:
            cfg: Forwarded unchanged to the base initialiser.
            data_dir: Forwarded unchanged to the base initialiser.
            transform: Forwarded unchanged to the base initialiser.
            annot_csv: CSV with ``image_path``, ``label`` and ``split`` columns.
            split: Value of the ``split`` column to keep.
        """
        # Both attributes are assigned before the base initialiser runs.
        self.annot = pd.read_csv(annot_csv)
        self.split = split
        super().__init__(cfg, data_dir, transform)

    def _load_data(self) -> Tuple[List[str], List[int]]:
        """Return ``(image_paths, labels)`` for the rows of the selected split."""
        in_split = self.annot['split'] == self.split
        rows = self.annot[in_split]
        paths = rows['image_path'].astype(str).tolist()
        targets = rows['label'].astype(int).tolist()
        return paths, targets


In [None]:
# Rebuild the train and test transforms for the DeepMars_Surface config.
train_transform, test_transform = get_transforms(cfg)

In [18]:
# Instantiate the DeepMars_Surface training split with the training transform.
train_dataset = DeepMars_Surface(cfg, cfg.data.data_dir, train_transform, cfg.data.annot_csv, split='train')

In [None]:
# Pull the first item the dataset yields so it is shown as the cell output.
next(iter(train_dataset))

In [21]:
from hydra import compose, initialize_config_dir
import os

# Switch the active dataset to DoMars16k for the cells that follow.
data = 'DoMars16k'

# Absolute path of the config directory one level up.
config_dir = os.path.abspath('../configs')

# Re-compose the root `config` with `data` overridden by the lower-cased name.
with initialize_config_dir(config_dir=config_dir, version_base='1.1'):
    cfg = compose(
        config_name='config',
        overrides=[f"data={data.lower()}"],
    )

In [25]:
import glob
from itertools import chain
# Build the DoMars16k annotation CSV by scanning "<split_dir>/<class_dir>/*.<ext>"
# for every split directory listed in the config.
split_dirs = {split: cfg.data.data_dir.get(split) for split in cfg.data.data_dir}
extensions = cfg.data.valid_image_extensions

# os.listdir returns entries in arbitrary order, so enumerating it separately
# per split can give the same class a different label in each split, and stray
# files would consume a label index. Derive one sorted class list from the
# directories of all splits and share the resulting mapping.
class_names = sorted({
    entry
    for split_dir in split_dirs.values()
    for entry in os.listdir(split_dir)
    if os.path.isdir(os.path.join(split_dir, entry))
})
class_to_label = {name: label for label, name in enumerate(class_names)}

split_frames = []
for split, data_dir in split_dirs.items():
    image_paths = []
    labels = []
    for class_dir, label in class_to_label.items():
        class_dir_path = os.path.join(data_dir, class_dir)
        if not os.path.isdir(class_dir_path):
            # This split has no folder for the class; nothing to add.
            continue
        # Sorted so the row order of the CSV is reproducible across runs.
        matched_files = sorted(
            chain.from_iterable(
                glob.glob(os.path.join(class_dir_path, f"*.{ext}"))
                for ext in extensions
            )
        )
        # Paths are stored relative to the split directory.
        image_paths.extend(os.path.relpath(file, data_dir) for file in matched_files)
        labels.extend([label] * len(matched_files))
    df_split = pd.DataFrame({'image_path': image_paths, 'label': labels})
    df_split['split'] = split
    split_frames.append(df_split)

# Concatenate once; pd.concat rejects an empty list, so fall back to an empty frame.
if split_frames:
    df = pd.concat(split_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['image_path', 'label', 'split'])

df.to_csv(cfg.data.annot_csv, index=False)

In [22]:
import os
from typing import List
from typing import Tuple, Union, Literal
import pandas as pd

class DoMars16k(BaseClassificationDataset):
    """
    DoMars16k dataset https://zenodo.org/records/4291940

    Samples are the rows of the annotation CSV whose ``split`` column equals
    the requested split; the directory passed to the base class is the
    per-split image directory.
    """

    def __init__(
        self,
        cfg,
        data_dir,
        transform,
        annot_csv: Union[str, os.PathLike],
        split: Literal["train", "val", "test"] = "train",
    ):
        """
        Args:
            cfg: Forwarded unchanged to the base initialiser.
            data_dir: Either the dataset root (string or path-like), to which
                ``split`` is appended, or a mapping from split name to that
                split's directory.
            transform: Forwarded unchanged to the base initialiser.
            annot_csv: CSV with ``image_path``, ``label`` and ``split`` columns.
            split: Value of the ``split`` column to keep.

        Raises:
            KeyError: If ``data_dir`` is a mapping without an entry for ``split``.
        """
        self.split = split
        annot = pd.read_csv(annot_csv)
        self.annot = annot[annot["split"] == split]

        if isinstance(data_dir, (str, os.PathLike)):
            # Root directory: the split folder lives directly underneath it.
            # os.path.join also accepts pathlib.Path, unlike `data_dir + "/..."`.
            split_dir = os.path.join(data_dir, split)
        else:
            # Per-split mapping, the shape the annotation-building cell reads
            # from cfg.data.data_dir; string concatenation would raise here.
            split_dir = data_dir.get(split)
            if split_dir is None:
                raise KeyError(f"data_dir has no entry for split '{split}'")

        super().__init__(cfg, split_dir, transform)

    def _load_data(self) -> Tuple[List[str], List[int]]:
        """Return ``(image_paths, labels)`` for the rows of the selected split."""
        image_paths = self.annot['image_path'].astype(str).tolist()
        labels = self.annot['label'].astype(int).tolist()
        return image_paths, labels

In [23]:
# Rebuild the train and test transforms for the DoMars16k config.
train_transform, test_transform = get_transforms(cfg)

In [24]:
# Instantiate the DoMars16k training split with the training transform.
train_dataset = DoMars16k(cfg, cfg.data.data_dir, train_transform, cfg.data.annot_csv, split='train')

In [None]:
# Pull the first item the dataset yields so it is shown as the cell output.
next(iter(train_dataset))

In [30]:
from hydra import compose, initialize_config_dir
import os

# Switch the active dataset to HiRISENet for the cells that follow.
data = 'HiRISENet'

# Absolute path of the config directory one level up.
config_dir = os.path.abspath('../configs')

# Re-compose the root `config`; the dataset name is passed as-is (not lower-cased).
with initialize_config_dir(config_dir=config_dir, version_base='1.1'):
    cfg = compose(
        config_name='config',
        overrides=[f"data={data}"],
    )

In [19]:
# Build the HiRISENet annotation CSV from a listing whose first three
# whitespace-separated fields are image name, label and split.
txt_file = cfg.data.txt_file
rows = []
with open(txt_file, "r", encoding="utf-8") as text:
    for line in text:
        fields = line.split()
        if not fields:
            # Blank line: skip it instead of failing the 3-way unpack below.
            continue
        # Lines with fewer than three fields still raise, so malformed
        # entries are not silently dropped.
        image_name, class_type_str, split_style = fields[:3]
        rows.append({'image_path': image_name, 'label': class_type_str, 'split': split_style})
# Explicit columns keep the CSV header intact even when no rows were read.
df = pd.DataFrame(rows, columns=['image_path', 'label', 'split'])
df.to_csv(cfg.data.annot_csv, index=False)

In [31]:
class HiRISENet(BaseClassificationDataset):
    """
    Mars Image Content Classification - HiRISENet.

    Source: https://zenodo.org/records/4002935

    Only the annotation rows whose ``split`` column equals the requested
    split are kept.
    """

    def __init__(
        self,
        cfg,
        data_dir,
        transform,
        annot_csv: Union[str, os.PathLike],
        split: Literal["train", "val", "test"] = "train",
    ):
        """
        Args:
            cfg: Forwarded unchanged to the base initialiser.
            data_dir: Forwarded unchanged to the base initialiser.
            transform: Forwarded unchanged to the base initialiser.
            annot_csv: CSV with ``image_path``, ``label`` and ``split`` columns.
            split: Value of the ``split`` column to keep.
        """
        annot_all = pd.read_csv(annot_csv)
        in_split = annot_all['split'] == split
        # The filtered frame is stored before the base initialiser runs.
        self.annot = annot_all[in_split]
        super().__init__(cfg, data_dir, transform)

    def _load_data(self) -> Tuple[List[str], List[int]]:
        """Return ``(image_paths, labels)`` taken from the filtered annotations."""
        paths = self.annot['image_path'].astype(str).tolist()
        targets = self.annot['label'].astype(int).tolist()
        return paths, targets

In [32]:
# Rebuild the train and test transforms for the HiRISENet config.
train_transform, test_transform = get_transforms(cfg)

In [33]:
# Instantiate the HiRISENet training split with the training transform.
train_dataset = HiRISENet(cfg, cfg.data.data_dir, train_transform, cfg.data.annot_csv, split='train')

In [None]:
# Pull the first item the dataset yields so it is shown as the cell output.
next(iter(train_dataset))

In [38]:
from hydra import compose, initialize_config_dir
import os

# Switch the active dataset to MartianFrost for the cells that follow.
data = 'MartianFrost'

# Absolute path of the config directory one level up.
config_dir = os.path.abspath('../configs')

# Re-compose the root `config` with `data` overridden by the lower-cased name.
with initialize_config_dir(config_dir=config_dir, version_base='1.1'):
    cfg = compose(
        config_name='config',
        overrides=[f"data={data.lower()}"],
    )

In [None]:
# Show the MartianFrost data directory straight from the config. A bare
# `data_dir` here would display whatever an earlier cell left in that name
# (the DoMars16k loop variable on a top-to-bottom run).
cfg.data.data_dir

In [43]:
from pathlib import Path
# Build the MartianFrost annotation CSV: every tile under
# "<data_dir>/<folder>/tiles/{frost,background}/" is assigned to each split
# whose listing contains the first 15 characters of <folder>.
data_dir = Path(cfg.data.data_dir)

# (sub-folder of "tiles", label) pairs; frost tiles are 1, background tiles 0.
patterns = [("frost", 1), ("background", 0)]

# Scan the directory tree once rather than once per split; the scan result
# does not depend on the split being processed.
tiles = [
    (str(image_path.relative_to(data_dir)), label, image_path.parents[2].name[:15])
    for subfolder, label in patterns
    for image_path in data_dir.glob(f"*/tiles/{subfolder}/*")
]

records = []
for split in cfg.data.txt_files:
    txt_file = cfg.data.txt_files.get(split)
    with open(txt_file, "r", encoding="utf-8") as text:
        # One identifier per line; blank lines carry no identifier.
        valid_parents = {line.strip() for line in text if line.strip()}

    records.extend(
        {'image_path': rel_path, 'label': label, 'split': split}
        for rel_path, label, parent_directory in tiles
        if parent_directory in valid_parents
    )

# Build the frame in one go; explicit columns keep the header when nothing matched.
df = pd.DataFrame(records, columns=['image_path', 'label', 'split'])
df.to_csv(cfg.data.annot_csv, index=False)

In [44]:
class MartianFrost(BaseClassificationDataset):
    """
    Martian Frost dataset.

    Source:
    https://dataverse.jpl.nasa.gov/dataset.xhtml?persistentId=doi:10.48577/jpl.QJ9PYA

    Only the annotation rows whose ``split`` column equals the requested
    split are kept.
    """

    def __init__(
        self,
        cfg,
        data_dir,
        transform,
        annot_csv: Union[str, os.PathLike],
        split: Literal["train", "val", "test"] = "train",
    ):
        """
        Args:
            cfg: Forwarded unchanged to the base initialiser.
            data_dir: Forwarded unchanged to the base initialiser.
            transform: Forwarded unchanged to the base initialiser.
            annot_csv: CSV with ``image_path``, ``label`` and ``split`` columns.
            split: Value of the ``split`` column to keep.
        """
        full_annot = pd.read_csv(annot_csv)
        # Narrow to the requested split before the base initialiser runs.
        self.annot = full_annot[full_annot['split'] == split]
        super().__init__(cfg, data_dir, transform)

    def _load_data(self) -> Tuple[List[str], List[int]]:
        """Return ``(image_paths, labels)`` taken from the filtered annotations."""
        path_column = self.annot['image_path'].astype(str)
        label_column = self.annot['label'].astype(int)
        return path_column.tolist(), label_column.tolist()


In [45]:
# Rebuild the train and test transforms for the MartianFrost config.
train_transform, test_transform = get_transforms(cfg)

In [46]:
# Instantiate the MartianFrost training split with the training transform.
train_dataset = MartianFrost(cfg, cfg.data.data_dir, train_transform, cfg.data.annot_csv, split='train')

In [None]:
# Pull the first item the dataset yields so it is shown as the cell output.
next(iter(train_dataset))

In [48]:
from hydra import compose, initialize_config_dir
import os

# Switch the active dataset to MSLNet for the cells that follow.
data = 'MSLNet'

# Absolute path of the config directory one level up.
config_dir = os.path.abspath('../configs')

# Re-compose the root `config`; the dataset name is passed as-is (not lower-cased).
with initialize_config_dir(config_dir=config_dir, version_base='1.1'):
    cfg = compose(
        config_name='config',
        overrides=[f"data={data}"],
    )

In [50]:
# Build a single annotation CSV for MSLNet from the per-split
# "<image_path> <label>" listings, tagging every row with its split name.
split_frames = []
for split in cfg.data.txt_files:
    txt_file = cfg.data.txt_files.get(split)
    with open(txt_file, "r", encoding="utf-8") as handle:
        # Skip blank lines so they do not turn into all-None rows, which
        # would later break the integer cast of the label column.
        rows = [line.split() for line in handle if line.strip()]
    df_split = pd.DataFrame(rows, columns=['image_path', 'label'])
    df_split['split'] = split
    split_frames.append(df_split)

# Concatenate once instead of growing the frame from an empty DataFrame inside
# the loop; pd.concat rejects an empty list, so fall back to an empty frame.
if split_frames:
    df = pd.concat(split_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['image_path', 'label', 'split'])
df.to_csv(cfg.data.annot_csv, index=False)

In [52]:
class MSLNet(BaseClassificationDataset):
    """
    Mars Image Content Classification Mastcam & MAHLI dataset.

    Source: https://zenodo.org/records/4033453

    Only the annotation rows whose ``split`` column equals the requested
    split are kept.
    """

    def __init__(
        self,
        cfg,
        data_dir,
        transform,
        annot_csv: Union[str, os.PathLike],
        split: Literal["train", "val", "test"] = "train",
    ):
        """
        Args:
            cfg: Forwarded unchanged to the base initialiser.
            data_dir: Forwarded unchanged to the base initialiser.
            transform: Forwarded unchanged to the base initialiser.
            annot_csv: CSV with ``image_path``, ``label`` and ``split`` columns.
            split: Value of the ``split`` column to keep.
        """
        table = pd.read_csv(annot_csv)
        keep = table['split'] == split
        # The filtered frame is stored before the base initialiser runs.
        self.annot = table[keep]
        super().__init__(cfg, data_dir, transform)

    def _load_data(self) -> Tuple[List[str], List[int]]:
        """Return ``(image_paths, labels)`` taken from the filtered annotations."""
        annot = self.annot
        return (
            annot['image_path'].astype(str).tolist(),
            annot['label'].astype(int).tolist(),
        )

In [None]:
# Rebuild the transforms for the MSLNet config, instantiate the training
# split, and pull its first item so it is shown as the cell output.
train_transform, test_transform = get_transforms(cfg)
train_dataset = MSLNet(cfg, cfg.data.data_dir, train_transform, cfg.data.annot_csv, split='train')
next(iter(train_dataset))