In [None]:
# Imports
import zipfile
from pathlib import Path

import pandas as pd

# Pandas options: do not truncate long cell values when displaying DataFrames,
# so complete file names stay readable in the rendered output.
pd.set_option('display.max_colwidth', None)

The provided dataset (`data/raw/sunburst_images.zip`) contains images, stored in one folder per classification. <br>
We want to build our own dataset that reuses the classifications from the provided one. <br>
Since we want to work with the uncompressed intensity data, we won't be using the images themselves. <br>
Here we create a list of all the sunbursts in the provided dataset from the file names inside the archive.

In [None]:
zip_file_path = "data/raw/sunburst_images.zip"

# Only the archive's table of contents is read; no image is extracted.
with zipfile.ZipFile(zip_file_path, "r") as zip_file:
    file_names = zip_file.namelist()

# Create a DataFrame with all the file names
df = pd.DataFrame(file_names, columns=["File Name"])

# Filter out MacOS artifacts and directories in a single pass.
# The markers are literal substrings, so regex matching is switched off.
is_artifact = (
    df["File Name"].str.contains("__MACOSX", regex=False)
    | df["File Name"].str.contains("DS_Store", regex=False)
    | df["File Name"].str.endswith("/")
)
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not raise SettingWithCopyWarning.
df = df[~is_artifact].copy()

# Extract the last folder (used as the classification), the file name, and the
# file extension from each archive path.
path_parts = df["File Name"].str.split("/")
df["Classification"] = path_parts.str[-2]
# Split on the LAST dot only, so a name that itself contains dots keeps its
# full stem instead of being cut at the first dot.
stem_and_extension = path_parts.str[-1].str.rsplit(".", n=1)
df["Extension"] = stem_and_extension.str[-1]
df["File Name"] = stem_and_extension.str[0]

# Extract info from the file name. Its underscore-separated fields are read as:
# field 0 = start timestamp, field 1 = end timestamp, fields 2-4 = instrument
# (joined with "-"; names with fewer fields simply yield a shorter instrument).
name_fields = df["File Name"].str.split("_")
timestamp_format = "%Y-%m-%d %H-%M-%S"
df["Start"] = pd.to_datetime(name_fields.str[0], format=timestamp_format)
df["End"] = pd.to_datetime(name_fields.str[1], format=timestamp_format)
df["Instrument"] = (
    name_fields.str[2:5]
    .str.join("-")
    # Drop any literal "-None" fragment from the joined instrument name.
    .str.replace("-None", "", regex=False)
)

# Reorder the columns
df = df[["Classification", "Instrument", "Start", "End", "File Name", "Extension"]]

df

In [None]:
# Write the archive index to CSV (without the DataFrame index column).
# The target directory is created first so the export also works when
# data/processed/ does not exist yet.
output_csv_path = Path("data/processed/sunburst_images.csv")
output_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv_path, index=False)