## Data processing for PUC

- keep function defs at the top
- keep variable assignments separate from function calls
- outputs go to outputs folder

In [15]:
# LIBRARIES AND FUNCTIONS

import numpy as np
import os
import pandas as pd

def print_npy_file_contents(file_path):
    """
    Load a single .npy file and print what it contains.

    Nothing is returned; every outcome (wrong extension, missing file,
    load failure, or the loaded data itself) is reported via print().

    Parameters:
    file_path (str): The path to the .npy file.
    """
    has_npy_extension = file_path.endswith('.npy')

    if not has_npy_extension:
        print(f"The file {file_path} is not a .npy file.")
    elif not os.path.isfile(file_path):
        print(f"The file {file_path} does not exist.")
    else:
        try:
            # NOTE(review): allow_pickle=True unpickles arbitrary objects, so
            # only point this at .npy files from a trusted source.
            loaded = np.load(file_path, allow_pickle=True)
            print(f"Contents of {file_path}:")
            print(loaded)
            print("\n")
        except Exception as err:
            # Any failure while loading or printing is reported, not raised.
            print(f"An error occurred while loading {file_path}: {err}")


            
            
def populate_dataframe_from_nested_dict(nested_dict):
    """
    Flatten an arbitrarily nested dict into a DataFrame, one row per leaf.

    Each row holds the chain of keys leading to a leaf value, followed by
    the leaf value itself. Rows shorter than the deepest path are
    right-padded with None so that every row has the same width.

    Because padding is appended after the leaf, leaf values that sit at
    different nesting depths end up in different columns.

    Parameters:
    nested_dict (dict): The (possibly nested) dictionary to flatten. A
        non-dict value is treated as a single leaf with no keys.

    Returns:
    pandas.DataFrame: One row per leaf, with default integer column labels.
        An empty DataFrame is returned when there are no leaves (for
        example an empty dict, or a dict containing only empty dicts).
    """
    def nested_dict_to_rows(d, keys=None):
        # Use a None sentinel rather than a mutable default list.
        keys = [] if keys is None else keys
        if isinstance(d, dict):
            rows = []
            for k, v in d.items():
                rows.extend(nested_dict_to_rows(v, keys + [k]))
            return rows
        # Leaf: the accumulated key path followed by the value.
        return [keys + [d]]

    rows = nested_dict_to_rows(nested_dict)

    # No leaves at all: max() below would raise on an empty sequence.
    if not rows:
        return pd.DataFrame()

    # Determine the number of columns from the deepest key path.
    max_depth = max(len(row) for row in rows)

    # Pad rows on the right so they all have the same number of columns.
    padded_rows = [row + [None] * (max_depth - len(row)) for row in rows]

    return pd.DataFrame(padded_rows)



def create_dataframe(file_path, delimiter, has_headers=True):
    """
    Read a delimited text file into a DataFrame.

    Parameters:
    file_path: Path (or file-like object) handed to pandas.read_csv.
    delimiter (str): Field separator, passed as read_csv's ``sep``.
    has_headers (bool): When truthy, read_csv's default header handling is
        used; otherwise the file is read with ``header=None``.

    Returns:
    pandas.DataFrame: The parsed file.
    """
    read_options = {'sep': delimiter}
    if not has_headers:
        read_options['header'] = None
    return pd.read_csv(file_path, **read_options)



In [9]:
# Display the contents of unit_predictions.npy (an experiment output)

# NOTE(review): absolute, machine-specific path.
unit_predictions_path = "C:/Users/LatimerJ/puc/experiments/outputs/unit_predictions.npy"
print_npy_file_contents(unit_predictions_path)

Contents of C:/Users/LatimerJ/puc/experiments/outputs/unit_predictions.npy:
{'arabica_ratings': {'Pint': {'': 'ValueError', '1': {'magnitude': 1, 'unit': 'dimensionless', 'entity': 'unknown'}, '1 kg': {'magnitude': 1, 'unit': 'kilogram', 'entity': 'mass'}, '1.2': {'magnitude': 1.2, 'unit': 'dimensionless', 'entity': 'unknown'}, '100 lbs': {'magnitude': 100, 'unit': 'pound', 'entity': 'mass'}, '1000 M': {'magnitude': 1000, 'unit': 'molar', 'entity': 'substance'}, '1200': {'magnitude': 1200, 'unit': 'dimensionless', 'entity': 'unknown'}, '1200-1800m': 'TypeError', '1250m': {'magnitude': 1250, 'unit': 'meter', 'entity': 'length'}, '1300': {'magnitude': 1300, 'unit': 'dimensionless', 'entity': 'unknown'}, '1300 MSNM': 'UndefinedUnitError', '1320': {'magnitude': 1320, 'unit': 'dimensionless', 'entity': 'unknown'}, '1400': {'magnitude': 1400, 'unit': 'dimensionless', 'entity': 'unknown'}, '1400-1900M': 'TypeError', '1400ft': {'magnitude': 1400, 'unit': 'foot', 'entity': 'length'}, '1450': {'

In [8]:
# Display the contents of cell_annotations.npy (an experiment input)

# NOTE(review): absolute, machine-specific path.
cell_annotations_path = "C:/Users/LatimerJ/puc/experiments/inputs/cell_annotations.npy"
print_npy_file_contents(cell_annotations_path)

Contents of C:/Users/LatimerJ/puc/experiments/inputs/cell_annotations.npy:
{'hes': {'62 L': {'magnitude': 62.0, 'unit': ['liter', 'L', 'litre', 'l']}, '116 L': {'magnitude': 116.0, 'unit': ['liter', 'L', 'litre', 'l']}, '67l': {'magnitude': 67.0, 'unit': ['liter', 'L', 'litre', 'l']}, '120l': {'magnitude': 120.0, 'unit': ['liter', 'L', 'litre', 'l']}, '165l': {'magnitude': 165.0, 'unit': ['liter', 'L', 'litre', 'l']}, '170l': {'magnitude': 170.0, 'unit': ['liter', 'L', 'litre', 'l']}, '194l': {'magnitude': 194.0, 'unit': ['liter', 'L', 'litre', 'l']}, '225l': {'magnitude': 225.0, 'unit': ['liter', 'L', 'litre', 'l']}, '84L': {'magnitude': 84.0, 'unit': ['liter', 'L', 'litre', 'l']}, '90L': {'magnitude': 90.0, 'unit': ['liter', 'L', 'litre', 'l']}, '102L': {'magnitude': 102.0, 'unit': ['liter', 'L', 'litre', 'l']}, '122L': {'magnitude': 122.0, 'unit': ['liter', 'L', 'litre', 'l']}, '130L': {'magnitude': 130.0, 'unit': ['liter', 'L', 'litre', 'l']}, '147L': {'magnitude': 147.0, 'unit': [

In [35]:
# Display the contents of updated_cell_annotations.npy (written by the
# "CREATING NEW DICT TO ADD TO .NPY" cell further down)

# NOTE(review): absolute, machine-specific path.
updated_cell_annotations_path = "C:/Users/LatimerJ/puc/processing/processing_outputs/updated_cell_annotations.npy"
print_npy_file_contents(updated_cell_annotations_path)

Contents of C:/Users/LatimerJ/puc/processing/processing_outputs/updated_cell_annotations.npy:




In [28]:
# LOADING DATA

# --- OMOP NumUnit -----------------------------------------------------------
# Read the table and drop the columns listed in columns_to_drop in one chain.
columns_to_drop = ['numunitid', 'obsval']
numunit_omop = (
    pd.read_csv('2024_03_numunit_withOMOPtarget.txt')
    .drop(columns=columns_to_drop)
)

# --- OMOP NumUnit, description columns only ---------------------------------
# As a DataFrame.
numunit_omop_desc_only = pd.read_csv('2024_03_numunit_withOMOPtarget_desc_only.csv')

# As a list of per-row records (despite the "_dict" suffix, to_dict('records')
# yields a list with one dict per row).
desc_only_columns = ['NumUnit', 'description', 'unit', 'source_code_description']
numunit_omop_desc_only_dict = numunit_omop_desc_only[desc_only_columns].to_dict('records')

# --- cell_annotations -------------------------------------------------------
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# trusted files. .item() unwraps the single object stored in the array.
cell_annotations_file = "C:/Users/LatimerJ/puc/experiments/inputs/cell_annotations.npy"
cell_annotations = np.load(cell_annotations_file, allow_pickle=True).item()


In [18]:
# OMOP NumUnit TO .CSV

# Re-read the source .txt table and write it back out unchanged as .csv,
# without the DataFrame index.
numunit_txt_path = '2024_03_numunit_withOMOPtarget.txt'
numunit_csv_path = '2024_03_numunit_withOMOPtarget.csv'

numunit_omop_csv = pd.read_csv(numunit_txt_path)
numunit_omop_csv.to_csv(numunit_csv_path, index=False)

In [34]:
# CREATING NEW DICT TO ADD TO .NPY

# Map each record's description to its unit (wrapped in a one-element list)
# and its source_code_description. If two records share a description, the
# later record replaces the earlier one.
numunit_dict_for_npy = {
    record['description']: {
        'unit': [record['unit']],
        'source_code_description': record['source_code_description'],
    }
    for record in numunit_omop_desc_only_dict
}

# Attach the new mapping under the 'NumUnit' key of the annotations loaded in
# the LOADING DATA cell, then save the combined structure.
cell_annotations['NumUnit'] = numunit_dict_for_npy

np.save('C:/Users/LatimerJ/puc/processing/processing_outputs/updated_cell_annotations.npy', cell_annotations)

In [37]:
# EXPORTING UPDATED CELL_ANNOTATIONS TO .CSV

# NOTE(review): absolute, machine-specific paths.
updated_annotations_npy_path = 'C:/Users/LatimerJ/puc/processing/processing_outputs/updated_cell_annotations.npy'
updated_annotations_csv_path = 'C:/Users/LatimerJ/puc/processing/processing_outputs/updated_cell_annotations.csv'

# Reload the saved annotations; .item() unwraps the single stored object.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; trusted files only.
updated_cell_annotations_npy = np.load(updated_annotations_npy_path, allow_pickle=True).item()

# Flatten the nested structure to one row per leaf and write it out without
# the DataFrame index.
updated_cell_annotations_df = populate_dataframe_from_nested_dict(updated_cell_annotations_npy)
updated_cell_annotations_df.to_csv(updated_annotations_csv_path, index=False)