In [1]:
import pandas as pd
import os
from pathlib import Path

def create_single_label_csv(target_pressure):
    """Build a table of the feature columns plus one label column.

    Reads ``co2_data_all_labels.csv`` from the current working directory, keeps
    ``id``, the four feature columns and the column named after
    ``target_pressure``, and renames that last column to ``target``.

    Args:
        target_pressure: Identifies the label column. The value is formatted
            with ``f'{target_pressure}'``, so ``0.1`` selects the column whose
            header is ``'0.1'``.

    Returns:
        pandas.DataFrame: Indexed by ``id`` with the columns
        ``surface_area_m2cm3``, ``void_fraction``, ``lcd``, ``pld`` and
        ``target``, in that order.

    Raises:
        FileNotFoundError: If ``co2_data_all_labels.csv`` is not in the current
            working directory.
        KeyError: If ``id``, a feature column or the requested label column is
            missing from the file.
    """
    feature_columns = ['surface_area_m2cm3', 'void_fraction', 'lcd', 'pld']
    label_column = f'{target_pressure}'

    all_labels = pd.read_csv('co2_data_all_labels.csv')

    # Selecting the wanted columns already discards everything else (including
    # the unused 'surface_area_m2g') and fixes the column order, so no separate
    # drop / copy / reindex step is needed and files lacking that unused column
    # still load.
    single_label = all_labels[['id', *feature_columns, label_column]]

    # rename() and set_index() return new frames; the frame read from disk is
    # never mutated.
    return single_label.rename(columns={label_column: 'target'}).set_index('id')

def find_directory():
    """Find the directory of the Python script or Jupyter notebook.

    Inside a Jupyter kernel (``ZMQInteractiveShell``) the current working
    directory is used. Otherwise the parent directory of ``__file__`` is used.
    When ``__file__`` is not defined (for example in an IPython terminal
    session or another interactive shell), the current working directory is
    used as a fallback instead of raising ``NameError``.

    Returns:
        directory (str or pathlib.Path): ``os.getcwd()`` (a ``str``) when the
        working directory is used, ``Path(__file__).parent`` (a ``Path``)
        otherwise. Both forms work with f-string formatting.
    """
    try:
        shell = get_ipython().__class__.__name__
    except NameError:
        # get_ipython only exists under IPython; a plain interpreter lands here.
        shell = None

    if shell == 'ZMQInteractiveShell':  # Running in a Jupyter notebook
        return os.getcwd()

    try:
        return Path(__file__).parent
    except NameError:
        # Interactive sessions define no __file__; the working directory is the
        # best available answer there.
        return os.getcwd()

def get_cif_IDs():
    """List the IDs of the files stored in the ``cif`` folder.

    The folder is looked up as ``cif`` inside the directory returned by
    ``find_directory()``. An ID is a file name with its last extension removed.

    Returns:
        list: One ID per regular file in the folder, in the order reported by
        ``os.listdir``. Sub-directories are skipped.
    """
    cif_directory = f'{find_directory()}/cif'

    cif_ids = []
    for entry in os.listdir(cif_directory):
        # Only regular files count; anything else in the folder is ignored.
        if not os.path.isfile(os.path.join(cif_directory, entry)):
            continue
        # Path.stem strips only the final suffix ('a.b.cif' -> 'a.b').
        cif_ids.append(Path(entry).stem)
    return cif_ids

def build_training_val_set(target_pressure):
    """Write ``training_val.csv`` for the MOFs that have a cif file.

    The single-label table for ``target_pressure`` is built with
    ``create_single_label_csv`` and restricted to the rows whose index appears
    in ``get_cif_IDs()``. The result is written to ``training_val.csv`` in the
    current working directory, with the index but without a header row.

    Args:
        target_pressure: Passed through to ``create_single_label_csv`` to pick
            the label column.
    """
    # Single-label table for the requested pressure.
    labelled = create_single_label_csv(target_pressure)

    # Boolean mask marking the rows that have a matching cif file.
    available_ids = get_cif_IDs()
    has_cif = labelled.index.isin(available_ids)

    # Persist the filtered rows as the training+validation set.
    labelled.loc[has_cif].to_csv('training_val.csv', header=False)

In [4]:
# Write training_val.csv using the '0.1' column of co2_data_all_labels.csv as the target.
build_training_val_set(target_pressure=0.1)