In [1]:
import tarfile
import pandas as pd
import os
from collections import defaultdict


## Load Parquet files and concatenate them into DataFrames

In [2]:
import tarfile
import pandas as pd

# Raw string: the Windows path contains backslash sequences (\K, \P, \L, \p, \o)
# that are invalid escapes in a normal string literal and raise a warning on
# recent Python versions. The resulting path value is unchanged.
tar_path = r'R:\Kevin\Projects\LLM\pdac_tf\omop_gpt_database_ohdsi_cumc_deid_2023q4r2_snapshot5_top_p900_temp_1060_repetition_penalty_1060.tar'

# Tables to load, identified by the directory component of the archive member
# path. The tuple order mirrors the priority of the original if/elif chain.
table_names = ('concept_ancestor', 'concept', 'condition_occurrence', 'person')

# Collect the per-file frames first and concatenate once per table afterwards;
# concatenating inside the loop re-copies the accumulated frame for every file.
table_parts = {name: [] for name in table_names}

with tarfile.open(tar_path, 'r') as tar:
    for member in tar.getmembers():
        if not member.name.endswith(".parquet"):
            continue

        # Decide which table the file belongs to BEFORE reading it, so Parquet
        # files of tables we do not need are never parsed.
        table = next((name for name in table_names if f'/{name}/' in member.name), None)
        if table is None:
            continue

        f = tar.extractfile(member)
        if f is None:
            # extractfile() returns None for members that are not regular files.
            continue
        with f:
            table_parts[table].append(pd.read_parquet(f))

# Fail with a clear message instead of an AttributeError on `None.shape` below.
missing_tables = [name for name in table_names if not table_parts[name]]
if missing_tables:
    raise ValueError(f"No Parquet files found in {tar_path} for table(s): {missing_tables}")

concept_df = pd.concat(table_parts['concept'], ignore_index=True)
concept_ancestor_df = pd.concat(table_parts['concept_ancestor'], ignore_index=True)
condition_occurrence_df = pd.concat(table_parts['condition_occurrence'], ignore_index=True)
person_df = pd.concat(table_parts['person'], ignore_index=True)

# Release the per-file frames; the concatenated tables hold their own copies.
del table_parts

# Verify loading
print("Concept Table Shape:", concept_df.shape)
print("Concept Ancestor Table Shape:", concept_ancestor_df.shape)
print("Condition Occurrence Table Shape:", condition_occurrence_df.shape)
print("Person Table Shape:", person_df.shape)


Concept Table Shape: (7384149, 10)
Concept Ancestor Table Shape: (81794858, 4)
Condition Occurrence Table Shape: (49485547, 16)
Person Table Shape: (1403123, 18)


In [38]:
# Gastric cancer ancestor concept (from All of Us)
gastric_cancer_ids = [443387]

# Normalize the join keys to 64-bit integers. `astype(int)` resolves to the
# platform default integer, which is 32-bit on Windows with NumPy < 2, so the
# width is spelled out explicitly. These casts are idempotent on re-run.
concept_df['concept_id'] = concept_df['concept_id'].astype('int64')
concept_ancestor_df['ancestor_concept_id'] = concept_ancestor_df['ancestor_concept_id'].astype('int64')

# Step 2: Filter the 'concept' table for the gastric cancer IDs
filtered_concept_df = concept_df[concept_df['concept_id'].isin(gastric_cancer_ids)]

# Step 2.1: Join the filtered concepts with 'concept_ancestor'
# (concept_id == ancestor_concept_id) to get the rows describing their descendants
filtered_concept_ancestor_df = concept_ancestor_df.merge(
    filtered_concept_df,
    left_on='ancestor_concept_id',
    right_on='concept_id',
    how='inner'
)

# Step 2.2: Unique descendant concept IDs, cast to the same integer type as the
# column they are compared against below so that `isin` cannot silently miss
# matches because of a dtype mismatch.
descendant_concepts = (
    filtered_concept_ancestor_df['descendant_concept_id'].astype('int64').unique()
)

# Step 3: Ensure 'condition_concept_id' in 'condition_occurrence_df' is an integer
condition_occurrence_df['condition_concept_id'] = (
    condition_occurrence_df['condition_concept_id'].astype('int64')
)

# Step 3.1: Keep condition occurrences whose concept is one of the descendants
filtered_condition_occurrence_df = condition_occurrence_df[
    condition_occurrence_df['condition_concept_id'].isin(descendant_concepts)
]

# Step 4: One row per patient with the columns needed downstream.
# Rows are sorted by start datetime first so that the row kept per person is
# the EARLIEST recorded diagnosis rather than whichever row happened to come
# first in file order. The trailing .copy() makes the result an independent
# frame, so later cells can assign columns without SettingWithCopyWarning.
case_gastric_cancer = (
    filtered_condition_occurrence_df[
        ['person_id', 'condition_concept_id', 'condition_start_datetime']
    ]
    .sort_values('condition_start_datetime', kind='stable')
    .drop_duplicates(subset=['person_id'])
    .copy()
)

# Display the result
print("Resulting case_gastric_cancer DataFrame:")
case_gastric_cancer.shape


Resulting case_gastric_cancer DataFrame:


(1644, 3)

In [43]:
import pprint

# Step 1: Prepare the datasets
# Explicit int64: `astype(int)` is 32-bit on Windows with NumPy < 2.
person_df['person_id'] = person_df['person_id'].astype('int64')
# .assign() builds a new frame instead of writing into `case_gastric_cancer`,
# which may be a slice of another frame (SettingWithCopyWarning otherwise).
case_gastric_cancer = case_gastric_cancer.assign(
    person_id=case_gastric_cancer['person_id'].astype('int64')
)

# Merge demographics with the condition occurrence data
merged_df = case_gastric_cancer.merge(person_df, on='person_id', how='inner')

# Step 2: Build the concept_id -> concept_name lookup, keyed by the string form
# of the ID. The string keys are derived on the fly; `concept_df` itself is NOT
# modified, so its `concept_id` column keeps its dtype for other cells.
concept_mapping = dict(
    zip(concept_df['concept_id'].astype(str), concept_df['concept_name'])
)


def _concept_name(concept_id):
    """Return the concept name for `concept_id`, or 'Unknown' if it is not in the lookup."""
    return concept_mapping.get(str(concept_id), 'Unknown')


# Step 3: Create the per-patient dictionary with IDs already mapped to names
demographic_columns = ['gender_concept_id', 'birth_datetime', 'race_concept_id', 'ethnicity_concept_id']
patient_data = {}

for pat_id, group in merged_df.groupby('person_id'):
    # Demographics are taken from the first row of the patient's group
    demographics = group.iloc[0][demographic_columns].to_dict()
    demographics['gender'] = _concept_name(demographics.pop('gender_concept_id'))
    demographics['race'] = _concept_name(demographics.pop('race_concept_id'))
    demographics['ethnicity'] = _concept_name(demographics.pop('ethnicity_concept_id'))

    # Diagnosis names and their timestamps, in the row order of the group
    patient_data[pat_id] = {
        'demographics': demographics,
        'diagnoses': [_concept_name(code) for code in group['condition_concept_id'].tolist()],
        'timestamps': group['condition_start_datetime'].tolist(),
    }

# Step 4: Display the first 5 patients (dict insertion order)
first_5_patients = dict(list(patient_data.items())[:5])

# Pretty print the first 5 patients
print("First 5 Patients with Mapped Diagnoses and Demographics:")
pprint.pprint(first_5_patients)


First 5 Patients with Mapped Diagnoses and Demographics:
{10001588: {'demographics': {'birth_datetime': '1942-01-01T00:00:00',
                             'ethnicity': 'No matching concept',
                             'gender': 'MALE',
                             'race': 'No matching concept'},
            'diagnoses': ['Primary malignant neoplasm of stomach'],
            'timestamps': ['2002-05-29T00:00:00']},
 10001719: {'demographics': {'birth_datetime': '1946-01-01T00:00:00',
                             'ethnicity': 'No matching concept',
                             'gender': 'FEMALE',
                             'race': 'No matching concept'},
            'diagnoses': ['Primary malignant neoplasm of stomach'],
            'timestamps': ['2017-03-31T00:00:00']},
 10003951: {'demographics': {'birth_datetime': '1943-01-01T00:00:00',
                             'ethnicity': 'No matching concept',
                             'gender': 'FEMALE',
                             'r

In [44]:
# Number of patients (keys) held in `patient_data`
len(patient_data)

1644

In [45]:
import pickle

# Destination of the serialized patient dictionary (no directory component,
# so it is resolved against the current working directory)
pickle_file_path = "gastric_patient_data.pkl"

# Serialize `patient_data` to the destination, opened in binary write mode
with open(pickle_file_path, mode="wb") as out_handle:
    pickle.dump(patient_data, out_handle)

print(f"Patient data has been saved to {pickle_file_path}")


Patient data has been saved to gastric_patient_data.pkl
