In [1]:
from azure.storage.blob import BlobServiceClient
import json
import os
from PIL import Image
import matplotlib.pyplot as plt
import io  # Used to convert bytes to a file-like object
import pandas as pd
import re

In [2]:
# Set up storage access.
# The connection string is a secret and must not be hardcoded in the notebook,
# so it is read from the environment. Export AZURE_STORAGE_CONNECTION_STRING
# before starting the kernel.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    # Fail early with an actionable message instead of a NameError further down.
    raise RuntimeError(
        "Environment variable AZURE_STORAGE_CONNECTION_STRING is not set; "
        "it is required to connect to the blob storage account."
    )
container_name = "publicdata"

# Create the service client and a client scoped to the data container.
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

In [3]:
# Download every JSON blob in the container and parse it.
# json_data maps the blob name to its parsed JSON content.
json_data = {}

for blob in container_client.list_blobs():
    if not blob.name.endswith(".json"):
        continue
    # readall() replaces the deprecated content_as_text(). It returns the raw
    # bytes, which json.loads decodes on its own (UTF-8/16/32, BOM-aware).
    raw_content = container_client.get_blob_client(blob.name).download_blob().readall()
    json_data[blob.name] = json.loads(raw_content)

# Expose the individual metadata tables as variables.
# A table whose blob was not found falls back to an empty container.
category = json_data.get("v1.0-mini/category.json", [])
sensor = json_data.get("v1.0-mini/sensor.json", {})
surface_ann = json_data.get("v1.0-mini/surface_ann.json", {})
attribute = json_data.get("v1.0-mini/attribute.json", {})
log = json_data.get("v1.0-mini/log.json", {})
calibrated_sensor = json_data.get("v1.0-mini/calibrated_sensor.json", {})
sample_data = json_data.get("v1.0-mini/sample_data.json", {})
sample = json_data.get("v1.0-mini/sample.json", {})
ego_pose = json_data.get("v1.0-mini/ego_pose.json", {})
object_ann = json_data.get("v1.0-mini/object_ann.json", {})

# Category Clustering and Label Encoding

### 1. Category JSON: Group the chosen categories under their parent category

In [16]:
# Serialize the parsed tables back to JSON text.
object_ann_json = json.dumps(object_ann, indent=4)
sample_data_json = json.dumps(sample_data, indent=4)
category_json = json.dumps(category, indent=4)

# Load the tables into DataFrames for easier processing.
# pd.read_json no longer accepts a literal JSON string (deprecated since
# pandas 2.1), so each string is wrapped in a file-like StringIO buffer.
# read_json is kept (rather than pd.DataFrame) so its dtype inference,
# e.g. the timestamp conversion, stays the same.
object_ann_df = pd.read_json(io.StringIO(object_ann_json))
category_df = pd.read_json(io.StringIO(category_json))
sample_data_df = pd.read_json(io.StringIO(sample_data_json))

#clustering of categories
def transform_category(category_str):
    """Map a raw category name to its clustered parent label.

    The name is tested against an ordered list of prefix patterns and the
    label of the first matching pattern is returned.

    Args:
        category_str: Raw category name, e.g. ``"vehicle.car"``.

    Returns:
        The parent label (``'Human'``, ``'Barrier'``, ``'Cone'``, ``'Bike'``,
        ``'Motorcycle'``, ``'Truck'``, ``'Car'`` or ``'Trafficcone'``), or
        ``None`` when no pattern matches.
    """
    # Ordered (pattern, label) rules; evaluation stops at the first match.
    rules = (
        (r'^human\.', 'Human'),
        (r'^movable_object\.barrier', 'Barrier'),
        (r'^movable_object\.cone', 'Cone'),
        (r'^vehicle\.bicycle', 'Bike'),
        (r'^vehicle\.motorcycle', 'Motorcycle'),
        (r'^vehicle\.truck', 'Truck'),
        (r'^vehicle\.car', 'Car'),
        (r'^movable_object\.trafficcone', 'Trafficcone'),
    )
    for pattern, label in rules:
        if re.match(pattern, category_str) is not None:
            return label
    return None

# Label columns of the final per-image matrix, in output column order.
LABEL_COLUMNS = ['Human', 'Barrier', 'Bike', 'Motorcycle', 'Truck', 'Car', 'Trafficcone']

# --- Cluster the raw categories -------------------------------------------
# transform_category returns None for categories outside the chosen set.
category_df['name'] = category_df["name"].apply(transform_category)

# The free-text description is not needed for labeling.
category_df = category_df.drop(columns="description")

# Drop the categories that are not part of the chosen set.
category_df = category_df.dropna(subset=['name'])

# --- Connect the labels with the image data via the foreign keys ----------
# Export of the unfiltered sample data.
sample_data_df.to_excel("Sample.xlsx", index=False)

# Inner join: annotations whose category was dropped above are discarded.
obj_cat = pd.merge(object_ann_df, category_df, left_on='category_token', right_on='token', how='inner')
obj_cat = obj_cat.dropna(subset=['name'])

# Remove the columns that are not needed for the label matrix.
columns_to_remove = ["token_x", "token_y", "category_token", "bbox", "mask", "attribute_tokens"]
obj_cat = obj_cat.drop(columns=columns_to_remove)

# --- Restrict to key frames and attach the labels -------------------------
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
sample_data_df.info()
sample_data_df = sample_data_df[sample_data_df["is_key_frame"] == True]

# Left join keeps key frames without any in-scope annotation; those rows get
# the placeholder label 'empty'.
label_data_v1 = pd.merge(sample_data_df, obj_cat, left_on='token', right_on='sample_data_token', how='left')
label_data_v1['name'] = label_data_v1['name'].fillna('empty')
label_data_v1.to_excel("label_data_v1.xlsx", index=False)

# One row per (filename, label) pair; the number of occurrences ends up in
# the 'count' column.
label_data_v2 = label_data_v1.groupby("filename")["name"].value_counts().reset_index()

# --- One-hot encode the labels --------------------------------------------
dummies = pd.get_dummies(label_data_v2['name'])

# Join the dummy variables to the main dataframe as 0/1 integers.
labeled = pd.concat([label_data_v2, dummies], axis=1)
labeled[dummies.columns] = labeled[dummies.columns].astype(int)

# The original label and its count are no longer needed.
labeled = labeled.drop(columns=['name', 'count'])
# The 'empty' indicator only exists when at least one key frame has no
# in-scope annotation, so its absence must not raise.
labeled = labeled.drop(columns=['empty'], errors='ignore')
labeled.to_excel("lbl.xlsx", index=False)

# Collapse to one row per image: a label is 1 if any of the image's rows has
# it. reindex() first guarantees that every expected label column exists
# (filled with 0), so a category missing from the data cannot cause a
# KeyError during aggregation.
labeled = (
    labeled
    .reindex(columns=['filename'] + LABEL_COLUMNS, fill_value=0)
    .groupby('filename', as_index=False)
    .max()
)

labeled.to_excel("labeled.xlsx", index=False)
print(labeled.shape)

# Sanity check: left-join the key frames with the label matrix. Rows with NaN
# labels in test.xlsx would indicate images missing from `labeled`.
# NOTE(review): an earlier note here reported that images without annotations
# of the chosen categories were missing from `labeled`; with the 'empty'
# placeholder above they should be present - TODO confirm via the shapes.
test = pd.merge(sample_data_df, labeled, on="filename", how='left')
test.to_excel("test.xlsx", index=False)
print(test.shape)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   token                    650 non-null    object        
 1   sample_token             650 non-null    object        
 2   ego_pose_token           650 non-null    object        
 3   calibrated_sensor_token  650 non-null    object        
 4   filename                 650 non-null    object        
 5   fileformat               650 non-null    object        
 6   width                    650 non-null    int64         
 7   height                   650 non-null    int64         
 8   timestamp                650 non-null    datetime64[ns]
 9   is_key_frame             650 non-null    bool          
 10  prev                     650 non-null    object        
 11  next                     650 non-null    object        
dtypes: bool(1), datetime64[ns](1), int64

### 2. Removing categories that are out-of-scope

### 3. Remove unnecessary items from object_ann