In [4]:
import pandas as pd
import numpy as np
from scipy.io import loadmat

# Load the .mat file
peta_mat = loadmat('data/PETA/PETA.mat')

# Extract data and attributes
data = peta_mat['peta']['data'][0, 0]  # Shape (19000, 109)
attributes = peta_mat['peta']['attribute'][0, 0]  # Shape (105,)
selected_attributes_idx = peta_mat['peta']['selected_attribute'][0, 0]  # Indices of selected attributes

# Inspect attributes structure
print(type(attributes))
print(len(attributes))  # Should be 105
print(attributes)       # Inspect the contents

# Convert attributes to a list of strings
# Flatten the attributes array if necessary
attribute_names = [attr[0] for attr in attributes.flatten()]  # Ensure we get 105 names
print(f"Number of attributes: {len(attribute_names)}")


# Split data into metadata and annotations
metadata_columns = ['Image_Index', 'Global_Person_ID', 'Dataset_Index', 'Original_Person_ID']
metadata = data[:, :4]  # Columns 1-4 are metadata
annotations = data[:, 4:]  # Columns 5-109 are attributes

# Create a DataFrame for the full dataset
df = pd.DataFrame(annotations, columns=attribute_names)  # Add attribute columns
df.insert(0, metadata_columns[0], metadata[:, 0])  # Add Image_Index
df.insert(1, metadata_columns[1], metadata[:, 1])  # Add Global_Person_ID
df.insert(2, metadata_columns[2], metadata[:, 2])  # Add Dataset_Index
df.insert(3, metadata_columns[3], metadata[:, 3])  # Add Original_Person_ID

# Create a DataFrame for selected attributes
# Convert 1-based indices to 0-based indices for Python
selected_attribute_names = [attribute_names[idx - 1] for idx in selected_attributes_idx.flatten()]  # Adjust for 1-based index
selected_df = df[['Image_Index'] + selected_attribute_names]  # Filter only selected attributes

# Save the DataFrames to CSV files
df.to_csv('peta_full_dataset.csv', index=False)
selected_df.to_csv('peta_selected_attributes.csv', index=False)

# Output for confirmation
print("Full Dataset:")
print(df.head())

print("\nSelected Attributes Dataset:")
print(selected_df.head())


<class 'numpy.ndarray'>
105
[[array(['personalLess30'], dtype='<U14')]
 [array(['personalLess45'], dtype='<U14')]
 [array(['personalLess60'], dtype='<U14')]
 [array(['personalLarger60'], dtype='<U16')]
 [array(['carryingBackpack'], dtype='<U16')]
 [array(['carryingOther'], dtype='<U13')]
 [array(['lowerBodyCasual'], dtype='<U15')]
 [array(['upperBodyCasual'], dtype='<U15')]
 [array(['lowerBodyFormal'], dtype='<U15')]
 [array(['upperBodyFormal'], dtype='<U15')]
 [array(['accessoryHat'], dtype='<U12')]
 [array(['upperBodyJacket'], dtype='<U15')]
 [array(['lowerBodyJeans'], dtype='<U14')]
 [array(['footwearLeatherShoes'], dtype='<U20')]
 [array(['upperBodyLogo'], dtype='<U13')]
 [array(['hairLong'], dtype='<U8')]
 [array(['personalMale'], dtype='<U12')]
 [array(['carryingMessengerBag'], dtype='<U20')]
 [array(['accessoryMuffler'], dtype='<U16')]
 [array(['accessoryNothing'], dtype='<U16')]
 [array(['carryingNothing'], dtype='<U15')]
 [array(['upperBodyPlaid'], dtype='<U14')]
 [array(['car