In [1]:
from azure.storage.blob import BlobServiceClient
import json
import os
from PIL import Image
import matplotlib.pyplot as plt
import io  # Used to convert bytes to a file-like object
import pandas as pd
import re
from scipy.stats import chi2_contingency
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Set up storage.
# The connection string is a secret, so it is read from the environment instead
# of being written into the notebook. Without it the cell fails immediately
# with a clear message rather than a NameError further down.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    raise RuntimeError(
        "Set the AZURE_STORAGE_CONNECTION_STRING environment variable "
        "before running this notebook."
    )
container_name = "publicdata"

# Create the service client and a client scoped to the data container.
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

In [3]:
# Download every ".json" blob in the container and parse it.
# The result maps blob name -> parsed JSON content.
json_data = {
    blob.name: json.loads(
        container_client.get_blob_client(blob.name).download_blob().content_as_text()
    )
    for blob in container_client.list_blobs()
    if blob.name.endswith(".json")
}


def _mini_table(table_name, default):
    """Return the parsed "v1.0-mini/<table_name>.json" blob, or `default` if it was not found."""
    return json_data.get(f"v1.0-mini/{table_name}.json", default)


# Bind each table to its own variable; a missing blob falls back to an empty container.
category = _mini_table("category", [])
sensor = _mini_table("sensor", {})
surface_ann = _mini_table("surface_ann", {})
attribute = _mini_table("attribute", {})
log = _mini_table("log", {})
calibrated_sensor = _mini_table("calibrated_sensor", {})
sample_data = _mini_table("sample_data", {})
sample = _mini_table("sample", {})
ego_pose = _mini_table("ego_pose", {})
object_ann = _mini_table("object_ann", {})

# Category Clustering and Label Encoding

### Convert data into actual json

In [4]:
# Serialise the three tables needed below back into indented JSON text.
object_ann_json, sample_data_json, category_json = (
    json.dumps(table, indent=4)
    for table in (object_ann, sample_data, category)
)

### Convert to dataframe for better processing

In [5]:
# Parse the JSON text into DataFrames.
# pandas deprecated passing a literal JSON string to read_json (2.1+), so each
# string is wrapped in a file-like StringIO. read_json is kept (rather than
# building the frames directly) so its dtype inference is unchanged.
object_ann_df = pd.read_json(io.StringIO(object_ann_json))
category_df = pd.read_json(io.StringIO(category_json))
sample_data_df = pd.read_json(io.StringIO(sample_data_json))

### Cluster the chosen categories into their "parent" category

In [6]:
# Clustering of categories.
# Ordered (pattern, parent label) pairs; the first pattern that matches wins.
_CATEGORY_RULES = (
    (r'^human\.', 'Human'),
    (r'^movable_object\.barrier', 'Barrier'),
    (r'^movable_object\.cone', 'Cone'),
    (r'^vehicle\.bicycle', 'Bike'),
    (r'^vehicle\.motorcycle', 'Motorcycle'),
    (r'^vehicle\.truck', 'Truck'),
    (r'^vehicle\.car', 'Car'),
    (r'^movable_object\.trafficcone', 'Trafficcone'),
)


def transform_category(category_str):
    """Map a fine-grained category name to its parent label.

    Args:
        category_str: Category name to classify; matched from the start of
            the string against each rule in order.

    Returns:
        The parent label of the first matching rule, or None when the name
        belongs to none of the chosen categories.
    """
    return next(
        (
            parent
            for pattern, parent in _CATEGORY_RULES
            if re.match(pattern, category_str) is not None
        ),
        None,
    )


# Reduce the category table to the chosen parent categories in one chain:
#   1. replace each name by its parent label (None when not chosen),
#   2. remove the free-text description column,
#   3. discard the rows whose name was not mapped to a parent label.
category_df = (
    category_df
    .assign(name=category_df["name"].apply(transform_category))
    .drop(columns="description")
    .dropna(subset=['name'])
)

### Merging Dataframes based on Data model and FK-PK dependencies

In [7]:
# Connect the labels with the image data by following the foreign keys.

# Annotation -> category. The inner join keeps only annotations whose category
# is still present in category_df.
obj_cat = pd.merge(object_ann_df, category_df, left_on='category_token', right_on='token', how='inner')
obj_cat = obj_cat.dropna(subset=['name'])

# Drop the key and geometry columns that are not needed for labelling.
columns_to_remove = ["token_x", "token_y", "category_token", "bbox", "mask", "attribute_tokens"]
obj_cat = obj_cat.drop(columns=columns_to_remove)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
sample_data_df.info()

# Keep key frames only. is_key_frame is a boolean column (see the info()
# summary), so it can be used as the row mask as-is.
sample_data_df = sample_data_df[sample_data_df["is_key_frame"]]

# Image -> labels. The left join keeps images without any chosen annotation;
# those rows get the placeholder label 'empty'.
label_data_v1 = pd.merge(sample_data_df, obj_cat, left_on='token', right_on='sample_data_token', how='left')
label_data_v1['name'] = label_data_v1['name'].fillna('empty')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   token                    650 non-null    object        
 1   sample_token             650 non-null    object        
 2   ego_pose_token           650 non-null    object        
 3   calibrated_sensor_token  650 non-null    object        
 4   filename                 650 non-null    object        
 5   fileformat               650 non-null    object        
 6   width                    650 non-null    int64         
 7   height                   650 non-null    int64         
 8   timestamp                650 non-null    datetime64[ns]
 9   is_key_frame             650 non-null    bool          
 10  prev                     650 non-null    object        
 11  next                     650 non-null    object        
dtypes: bool(1), datetime64[ns](1), int64

### Processing and transformation of results

In [8]:
# Classes that become one 0/1 indicator column each in the final table.
label_columns = ["Human", "Barrier", "Bike", "Motorcycle", "Truck", "Car", "Trafficcone"]

# One row per (image, label) pair with the number of rows carrying that label.
# groupby(...).size() with an explicit column name does not depend on how the
# installed pandas version names the result of value_counts().
label_data_v2 = (
    label_data_v1.groupby(["filename", "name"])
    .size()
    .reset_index(name="count")
)

# Turn the label names into 0/1 dummy variables.
dummies = pd.get_dummies(label_data_v2["name"]).astype(int)

# Collapse to one row per image: a class is 1 if any row of that image has it.
# reindex() keeps exactly the chosen classes, so the 'empty' placeholder is
# dropped and a class that never occurs becomes an all-zero column instead of
# raising a KeyError.
labeled = (
    pd.concat([label_data_v2[["filename"]], dummies], axis=1)
    .groupby("filename")
    .max()
    .reindex(columns=label_columns, fill_value=0)
    .reset_index()
)

# Show one example filename as a sanity check.
labeled["filename"].iloc[1]

'samples/CAM_BACK/n003-2018-01-08-11-30-34+0800__CAM_BACK__1515382745757583.jpg'

### Analysis of the data set

In [9]:
# Indicator columns the two summary columns are derived from.
label_columns = ["Human", "Barrier", "Bike", "Motorcycle", "Truck", "Car", "Trafficcone"]

# Total: number of distinct classes present in the image.
labeled['Total'] = labeled[label_columns].sum(axis=1)

# NoDetec: 1 when none of the classes is present, else 0 (vectorised instead
# of a row-wise Python lambda).
labeled['NoDetec'] = labeled[label_columns].eq(0).all(axis=1).astype(int)
labeled


labeled

Unnamed: 0,filename,Human,Barrier,Bike,Motorcycle,Truck,Car,Trafficcone,Total,NoDetec
0,samples/CAM_BACK/n003-2018-01-03-12-03-23+0800...,1,0,0,1,1,1,0,4,0
1,samples/CAM_BACK/n003-2018-01-08-11-30-34+0800...,1,1,0,1,1,1,0,5,0
2,samples/CAM_BACK/n003-2018-07-12-15-40-35+0800...,1,0,0,1,0,1,0,3,0
3,samples/CAM_BACK/n005-2018-06-14-20-11-03+0800...,0,0,0,0,0,1,0,1,0
4,samples/CAM_BACK/n008-2018-06-04-16-30-00-0400...,0,0,0,0,1,1,0,2,0
5,samples/CAM_BACK/n009-2018-05-08-15-52-41-0400...,0,0,1,0,1,1,0,3,0
6,samples/CAM_BACK/n010-2018-08-27-12-00-23+0800...,1,1,0,0,1,1,0,4,0
7,samples/CAM_BACK/n013-2018-08-20-14-38-24+0800...,0,0,1,0,1,1,0,3,0
8,samples/CAM_BACK/n013-2018-08-21-11-46-25+0800...,1,0,0,0,0,1,0,2,0
9,samples/CAM_BACK/n013-2018-08-28-16-04-27+0800...,1,0,0,0,0,1,0,2,0


In [10]:
# Column-wise mean over all images: for each indicator column this is the
# fraction of images in which the class occurs; for 'Total' it is the average
# number of classes per image.
relative_occurrence = labeled.loc[
    :,
    ["Human", "Barrier", "Bike", "Motorcycle", "Truck", "Car", "Trafficcone", "Total"],
].mean(axis=0)

relative_occurrence


Human          0.60
Barrier        0.26
Bike           0.20
Motorcycle     0.20
Truck          0.38
Car            0.72
Trafficcone    0.28
Total          2.64
dtype: float64

## Check for potential bias: Barrier & Bike

In [11]:
df = labeled
columns = ['Human', 'Barrier', 'Bike', 'Motorcycle', 'Truck', 'Car', 'Trafficcone']


def _chi2_for_pair(frame, first, second):
    """Run a chi-squared test of independence between two columns of `frame`.

    Builds the contingency table of `first` against `second` and returns one
    result record holding both column names, the statistic and the p-value.
    """
    table = pd.crosstab(frame[first], frame[second])
    statistic, p, _, _ = chi2_contingency(table)
    return {'Variable 1': first, 'Variable 2': second, 'Chi-squared': statistic, 'p-value': p}


# Test every ordered pair of distinct variables, so each combination appears
# in both directions.
results = [
    _chi2_for_pair(df, first, second)
    for first in columns
    for second in columns
    if first != second
]

# Collect the records into a DataFrame and display it.
result_df = pd.DataFrame(results)
result_df

Unnamed: 0,Variable 1,Variable 2,Chi-squared,p-value
0,Human,Barrier,1.251733,0.263222
1,Human,Bike,0.130208,0.718216
2,Human,Motorcycle,1.171875,0.279016
3,Human,Truck,1.276882,0.25848
4,Human,Car,0.004134,0.948737
5,Human,Trafficcone,0.334821,0.562834
6,Barrier,Human,1.251733,0.263222
7,Barrier,Bike,0.786123,0.375275
8,Barrier,Motorcycle,0.0,1.0
9,Barrier,Truck,2.891542,0.089046


A p-value above 0.05 suggests no significant association, and thus no apparent bias or dependency, between the two variables tested.
By this criterion, only the combination of Car & Truck (and vice versa) shows a dependency, which might result in a bias in the models that will be trained on this data.

This fact should be kept in mind when proceeding with the evaluation of the models.


In [12]:
# Remove the helper column before exporting. errors="ignore" keeps the cell
# re-runnable: once 'Total' has been dropped, a second run no longer raises.
labeled = labeled.drop(columns='Total', errors='ignore')

# Write the label table; the DataFrame index is written as an unnamed first column.
labeled.to_csv("labeled_data.csv")
