# Setup MAndMs Dataset
Download the dataset from https://mega.nz/folder/FxAmhbRJ#Dwugf8isRSR9CCZ6Qnza4w and place the contents in directory `data_path`.

Directory should have the following file structure:
- Training
  - Labeled
  - Unlabeled
- Validation
- Testing
- 211230_M&Ms_Dataset_information_diagnosis_opendataset.csv

In [1]:
import pandas as pd
import glob
import shutil
from pathlib import Path

data_path = Path("/tmp/data/MAndMs-RAW")
output_path = Path("/tmp/data/MAndMs")
output_path.mkdir(parents=True, exist_ok=True)

In [2]:
df = pd.read_csv(data_path / "211230_M&Ms_Dataset_information_diagnosis_opendataset.csv", index_col=0)

shutil.move(data_path / "211230_M&Ms_Dataset_information_diagnosis_opendataset.csv", output_path);

In [3]:
# Union the Labeled and Unlabeled train examples
for file_path in glob.iglob(str(data_path / "Training" / "Labeled" / "*")):
    shutil.move(file_path, data_path / "Training")

for file_path in glob.iglob(str(data_path / "Training" / "Unlabeled" / "*")):
    shutil.move(file_path, data_path / "Training")

shutil.rmtree(data_path / "Training" / "Labeled")
shutil.rmtree(data_path / "Training" / "Unlabeled")

In [4]:
df_files = pd.DataFrame(columns=["split", "file_path", "file_name"])

train_files = [Path(file_path) for file_path in glob.iglob(str(data_path / "Training" / "*"))]
df_files = pd.concat([
    df_files,
    pd.DataFrame.from_dict({
        "split": "Training",
        "file_path": train_files,
        "file_name": [fp.name for fp in train_files]
    })
])

val_files = [Path(file_path) for file_path in glob.iglob(str(data_path / "Validation" / "*"))]
df_files = pd.concat([
    df_files,
    pd.DataFrame.from_dict({
        "split": "Training",  # We split train & validation later
        "file_path": val_files,
        "file_name": [fp.name for fp in val_files]
    })
])

test_files = [Path(file_path) for file_path in glob.iglob(str(data_path / "Testing" / "*"))]
df_files = pd.concat([
    df_files,
    pd.DataFrame.from_dict({
        "split": "Testing",
        "file_path": test_files,
        "file_name": [fp.name for fp in test_files]
    })
])

df_merged = pd.merge(df_files, df, left_on='file_name', right_on='External code')
df_merged

Unnamed: 0,split,file_path,file_name,External code,VendorName,Vendor,Centre,ED,ES,Age,Pathology,Sex,Height,Weight
0,Training,/tmp/data/MAndMs-RAW/Training/E4M2Q7,E4M2Q7,E4M2Q7,Philips,B,3,0,8,27,DCM,F,,50.0
1,Training,/tmp/data/MAndMs-RAW/Training/H1W2Y1,H1W2Y1,H1W2Y1,Philips,B,2,29,10,61,HCM,M,,79.0
2,Training,/tmp/data/MAndMs-RAW/Training/M2P1R1,M2P1R1,M2P1R1,Siemens,A,1,0,10,72,NOR,F,160.0,63.0
3,Training,/tmp/data/MAndMs-RAW/Training/G2M7W4,G2M7W4,G2M7W4,Philips,B,3,24,9,31,DCM,M,,83.0
4,Training,/tmp/data/MAndMs-RAW/Training/A1E9Q1,A1E9Q1,A1E9Q1,Siemens,A,1,0,9,16,DCM,M,175.0,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,Testing,/tmp/data/MAndMs-RAW/Testing/A4R4T0,A4R4T0,A4R4T0,Canon,D,5,21,8,63,Other,F,148.0,54.0
341,Testing,/tmp/data/MAndMs-RAW/Testing/G1K1V3,G1K1V3,G1K1V3,Philips,B,2,29,11,21,NOR,F,,60.0
342,Testing,/tmp/data/MAndMs-RAW/Testing/E6M6P2,E6M6P2,E6M6P2,GE,C,4,24,7,24,Other,M,,75.0
343,Testing,/tmp/data/MAndMs-RAW/Testing/E1L7M3,E1L7M3,E1L7M3,Canon,D,5,1,12,15,HCM,M,157.0,50.0


In [5]:
for vendor in df["VendorName"].unique():
    for split in df_merged["split"].unique():
        (output_path / vendor / split).mkdir(exist_ok=True, parents=True)

In [6]:
for _, row in df_merged.iterrows():
    # print(row["file_path"], "->", output_path / row["VendorName"] / row["split"])
    shutil.move(row["file_path"], output_path / row["VendorName"] / row["split"])

# Upload to W&B

In [7]:
import wandb
from pathlib import Path

In [8]:
run = wandb.init(project="UDA-Datasets")

artifact = wandb.Artifact("MAndMs", type="raw_data")
artifact.add_dir(output_path)
run.log_artifact(artifact)

wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33miserh[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (/tmp/data/MAndMs)... Done. 3.4s


VBox(children=(Label(value='14667.715 MB of 14667.715 MB uploaded (14667.613 MB deduped)\r'), FloatProgress(va…