In [None]:
import os
import pandas as pd

"""
Download ShapeNetPart dataset from https://shapenet.cs.stanford.edu/media/shapenetcore_partanno_segmentation_benchmark_v0_normal.zip

In the Data folder, create a folder called ShapeNet and unzip the downloaded file in it.

Rename the folder in ShapeNet to shape_data (this is because the train test split json files use "shape_data" as the directory name).

Note: We are no longer using the train test split json files. We just did an 80/20 split on the data ourselves.

The folder structure should be as follows:

Data
    ShapeNet
        shape_data
"""

## Rename Folders For ShapeNetPart Dataset

In [None]:
def get_category_names():
    """
    Read the category table from the file synsetoffset2category.txt

    Each non-blank line is split on whitespace; the first field is taken as the
    category name and the second field as its synset id.

    :return: a tuple (category_names, category_synset_ids) of two index-aligned
             lists, in the order the lines appear in the file
    :raises FileNotFoundError: if the file cannot be found (raised by open)
    :raises IndexError: if a non-blank line has fewer than two fields
    """
    category_names = []
    category_synset_ids = []
    with open('../Data/ShapeNet/shape_data/synsetoffset2category.txt', 'r') as f:
        for line in f:
            fields = line.split()

            # Skip blank lines (e.g. trailing empty lines) instead of crashing on them
            if not fields:
                continue

            category_names.append(fields[0])
            category_synset_ids.append(fields[1])

    return category_names, category_synset_ids

def get_category_synset_id(category_name):
    """
    Look up the synset id that belongs to a category name

    :param category_name: category name as listed by get_category_names()
    :return: the synset id stored at the same position as the category name
    :raises ValueError: if the category name is not in the list of category names
    """
    names, synset_ids = get_category_names()

    # list.index raises ValueError for an unknown name; re-raise with our own message
    try:
        position = names.index(category_name)
    except ValueError:
        raise ValueError('Category name not in list of category names') from None

    return synset_ids[position]

# Rename folders to category names
def rename_folders_to_category_names():
    """
    Rename the synset-id folders in shape_data to their category names

    The operation is best-effort and safe to re-run: a folder that has already
    been renamed is skipped silently, a folder that is missing altogether or
    cannot be renamed is reported with print() and the loop continues.
    Only OSError from os.rename is handled; any other error propagates.
    :return: None
    """
    base_dir = '../Data/ShapeNet/shape_data'

    category_names, category_synset_ids = get_category_names()
    for category_name, category_synset_id in zip(category_names, category_synset_ids):
        source_dir = os.path.join(base_dir, category_synset_id)
        target_dir = os.path.join(base_dir, category_name)

        if not os.path.isdir(source_dir):
            # Source gone and target present means a previous run already renamed it
            if not os.path.isdir(target_dir):
                print('Category folder not found: ' + source_dir)
            continue

        try:
            os.rename(source_dir, target_dir)
        except OSError as error:
            # Keep going with the remaining categories, but do not hide the failure
            print('Could not rename ' + source_dir + ' to ' + target_dir + ': ' + str(error))

In [None]:
# Rename each synset-id folder in shape_data to its category name.
rename_folders_to_category_names()

## Get ShapeNetV2 model files (.obj / .json) for ShapeNetPart dataset

In [None]:
"""
Create a directory called ShapeNetV2 in the Data folder. In this directory store each zip file from the huggingface dataset that matches the 16 category names from ShapeNetPart. (see ShapeNet/shape_data/synsetoffset2category.txt for the category names and synset ids)
Within the directory create an excludefiles.txt file and add the following lines to it:

*.binvox
*.mtl
*.png
*.jpg
images
screenshots
untitled

This will save considerable space as the zip files contain a lot of unnecessary files for our purposes.

Make sure that 7zip is installed on your system and that the 7z executable is in your PATH variable (see https://www.7-zip.org/download.html).
"""

import subprocess

# Directory that holds the downloaded ShapeNetV2 zip archives and excludefiles.txt
shapenet_v2_dir = '../Data/ShapeNetV2/'

# Only the zip archives in that directory need to be extracted
zips = [file_name for file_name in os.listdir(shapenet_v2_dir) if file_name.endswith('.zip')]

# Unzip the zip files
# 7z is run with cwd= instead of os.chdir(): the notebook's working directory must
# stay unchanged because every other cell uses paths relative to it ('../Data/...').
# The loop variable must not be called `zip`: at cell level that would overwrite the
# builtin zip() for the whole notebook and break the functions in later cells.
for zip_name in zips:
    result = subprocess.run(['7z', 'x', zip_name, '-xr@excludefiles.txt'], cwd=shapenet_v2_dir)
    if result.returncode != 0:
        print('7z failed for ' + zip_name + ' (exit code ' + str(result.returncode) + ')')

In [None]:
"""
The directory structure should be as follows:

ShapeNetV2
    {category synset id}
        {model hash}
            models
                model_normalized.obj
                model_normalized.json

The files we need are named model_normalized.obj and model_normalized.json (the .binvox files are excluded during extraction). We want to rename them to {model hash}.obj and {model hash}.json and move them to the ShapeNet folder under the corresponding category name.
"""

def move_binvox_files():
    """
    Move the extracted ShapeNetV2 model files into the ShapeNet category folders

    For every category, the entries of ../Data/ShapeNetV2/{category synset id}/ are
    intersected with the names of the .txt files (extension removed) found in
    ../Data/ShapeNet/shape_data/{category name}/. For each shared model hash,
    models/model_normalized.obj and models/model_normalized.json are moved to
    {category name}/{model hash}.obj and {category name}/{model hash}.json.
    :return: None
    """
    v2_root = '../Data/ShapeNetV2/'
    part_root = '../Data/ShapeNet/shape_data/'

    names, synset_ids = get_category_names()
    for category_name, category_synset_id in zip(names, synset_ids):
        source_dir = v2_root + category_synset_id + '/'
        target_dir = part_root + category_name + '/'

        # Model hashes available in ShapeNetV2 for this category
        v2_hashes = set(os.listdir(source_dir))

        # Model hashes that have a .txt file in the ShapeNet category folder
        txt_hashes = {
            entry.replace('.txt', '')
            for entry in os.listdir(target_dir)
            if entry.endswith('.txt')
        }

        # Only models present in both folders are moved
        for model_hash in list(v2_hashes & txt_hashes):
            models_dir = source_dir + model_hash + '/models/'
            try:
                os.rename(models_dir + 'model_normalized.obj', target_dir + model_hash + '.obj')
                os.rename(models_dir + 'model_normalized.json', target_dir + model_hash + '.json')
            except FileNotFoundError:
                # A missing .obj skips the .json as well; an already moved .obj stays moved
                print('File not found: ' + model_hash)

In [None]:
# Move the extracted model_normalized .obj/.json files into the matching category folders.
move_binvox_files()

## Create Train and Test Splits for ShapeNetPart Dataset

In [None]:
# Create an 80-20 train-test split
def create_test_train(train_fraction=0.8):
    """
    Create a per-category train/test split of the models that have an .obj file

    For every category folder in ../Data/ShapeNet/shape_data/ the model hashes are
    collected from the names of the non-.obj files (extension removed). A hash is
    kept only if {hash}.obj exists in the same folder. The remaining hashes are
    de-duplicated and sorted, so the split is reproducible and a model can never
    end up in both splits, then cut at train_fraction.

    :param train_fraction: fraction of the usable models of each category that goes
                           into the train split (default 0.8, i.e. an 80-20 split)
    :return: a tuple (train, test) of dictionaries that map each category name to a
             list of model hashes (no file extension)
    """
    # Get the category names
    category_names, _ = get_category_names()

    train = {}
    test = {}

    for category_name in category_names:
        entries = os.listdir('../Data/ShapeNet/shape_data/' + category_name)

        # Hashes of the models whose .obj file is present
        obj_hashes = {os.path.splitext(entry)[0] for entry in entries if entry.endswith('.obj')}

        # Hashes named by the remaining files; splitext removes the extension whatever
        # its length, and the set removes duplicates when a model has several such files
        candidate_hashes = {os.path.splitext(entry)[0] for entry in entries if not entry.endswith('.obj')}

        # Filter before splitting (never remove from a list while iterating over it)
        # and sort because os.listdir returns entries in arbitrary order
        usable_hashes = sorted(candidate_hashes & obj_hashes)

        split_index = int(len(usable_hashes) * train_fraction)
        train[category_name] = usable_hashes[:split_index]
        test[category_name] = usable_hashes[split_index:]

    return train, test

In [None]:
# Build the per-category train and test dictionaries.
train, test = create_test_train()

## Create Metadata File For ShapeNetPart Dataset

In [None]:
# Create a dataframe with the file paths and the corresponding category names
def create_metadata_file(train, test):
    """
    Write the metadata file ../Data/ShapeNet/metadata_shapenet.csv

    One row is written per entry of the train and test splits with the columns
    object_id, class, split and object_path. object_id restarts at 0 for every
    category and counts the train entries first, then the test entries.

    :param train: dictionary mapping each category name to its train entries
    :param test: dictionary mapping each category name to its test entries
    :return: None
    """
    names, synset_ids = get_category_names()
    rows = []

    for category_name, _ in zip(names, synset_ids):
        object_id = 0
        for split_name, split_files in (('train', train), ('test', test)):
            for file in split_files[category_name]:
                object_path = category_name + '/' + file + '.obj'
                rows.append([object_id, category_name, split_name, object_path])
                object_id += 1

    columns = ['object_id', 'class', 'split', 'object_path']
    pd.DataFrame(rows, columns=columns).to_csv('../Data/ShapeNet/metadata_shapenet.csv', index=False)

In [None]:
# Write ../Data/ShapeNet/metadata_shapenet.csv from the train and test dictionaries.
create_metadata_file(train, test)