In [114]:
import tarfile
import pandas as pd
import os
from collections import defaultdict
import pprint


## Load parquet files and concatenate into dfs

In [69]:
tar_path = 'R:\Kevin\Projects\LLM\pdac_tf\omop_gpt_database_ohdsi_cumc_deid_2023q4r2_snapshot5_top_p900_temp_1060_repetition_penalty_1060.tar'

# Initialize DataFrames for concept and concept_ancestor tables
concept_df = None
concept_ancestor_df = None
condition_occurrence_df = None
person_df = None

# Open the tar file and load the Parquet files for specific categories
with tarfile.open(tar_path, 'r') as tar:
    for member in tar.getmembers():
        # Check if the file is a Parquet file in the required categories
        if member.name.endswith(".parquet"):
            with tar.extractfile(member) as f:
                df = pd.read_parquet(f)
                
                # Identify and load the appropriate table
                if '/concept_ancestor/' in member.name:
                    concept_ancestor_df = pd.concat([concept_ancestor_df, df], ignore_index=True) if concept_ancestor_df is not None else df
                elif '/concept/' in member.name:
                    concept_df = pd.concat([concept_df, df], ignore_index=True) if concept_df is not None else df
                elif '/condition_occurrence/' in member.name:
                    condition_occurrence_df = pd.concat([condition_occurrence_df, df], ignore_index=True) if condition_occurrence_df is not None else df
                elif '/person/' in member.name:
                    person_df = pd.concat([person_df, df], ignore_index=True) if person_df is not None else df

print("Concept Table Shape:", concept_df.shape)
print("Concept Ancestor Table Shape:", concept_ancestor_df.shape)
print("Condition Occurrence Table Shape:", condition_occurrence_df.shape)
print("Person Table Shape:", person_df.shape)


  tar_path = 'R:\Kevin\Projects\LLM\pdac_tf\omop_gpt_database_ohdsi_cumc_deid_2023q4r2_snapshot5_top_p900_temp_1060_repetition_penalty_1060.tar'


Concept Table Shape: (7384149, 10)
Concept Ancestor Table Shape: (81794858, 4)
Condition Occurrence Table Shape: (49485547, 16)
Person Table Shape: (1403123, 18)


## Finding gastric cancer patients

In [119]:
# Gastric Cancer Concept from ALLOfUs
gastric_cancer_ids = [443387]
# Make both int to ensure correct matching
concept_df['concept_id'] = concept_df['concept_id'].astype(int)
concept_ancestor_df['ancestor_concept_id'] = concept_ancestor_df['ancestor_concept_id'].astype(int)

# Filter the 'concept' table for the gastric cancer IDs
filtered_concept_df = concept_df[concept_df['concept_id'].isin(gastric_cancer_ids)]

# Join 'filtered_concept_df' with 'concept_ancestor' on 'concept_id' and 'ancestor_concept_id'
# to get descendant concept IDs for the target ancestors
filtered_concept_ancestor_df = concept_ancestor_df.merge(
    filtered_concept_df,
    left_on='ancestor_concept_id',
    right_on='concept_id',
    how='inner'
)

# Extract unique descendant concept IDs from the joined table
descendant_concepts = filtered_concept_ancestor_df['descendant_concept_id'].unique()

#  Ensure 'condition_concept_id' in 'condition_occurrence_df' is an integer
condition_occurrence_df['condition_concept_id'] = condition_occurrence_df['condition_concept_id'].astype(int)

# Filter the 'condition_occurrence' table where 'condition_concept_id' matches any of the descendant concepts
filtered_condition_occurrence_df = condition_occurrence_df[
    condition_occurrence_df['condition_concept_id'].isin(descendant_concepts)
]

# Select the required columns: 'person_id', 'condition_concept_id', 'condition_start_datetime'
case_gastric_cancer = filtered_condition_occurrence_df[
    ['person_id', 'condition_concept_id', 'condition_start_datetime']
]

# Display the result
print("Resulting case_gastric_cancer DataFrame:")

# Drop duplicate rows based on `person_id`, keeping the first occurrence
case_gastric_cancer = case_gastric_cancer.drop_duplicates(subset=['person_id'])
case_gastric_cancer.shape


Resulting case_gastric_cancer DataFrame:


(1644, 3)

In [120]:
concept_mapping

{1211712: 'menthol 40mg/mL TOPICAL GEL [cbd cryotherapy pain relief roll-on]',
 1211713: 'oxygen 99L/100L RESPIRATORY (INHALATION) GAS',
 1211714: 'alcohol KIT [her lh2476 citrus scented hand sanitizer kit]',
 1211715: 'alcohol KIT [her lt289 vanilla limited too hand sanitizer kit]',
 1211716: 'alcohol KIT [her lt288 citrus limited too hand sanitizer kit]',
 1211717: 'alcohol KIT [her lh2473 vanilla scented hand sanitizer kit]',
 1211718: 'ixabepilone INTRAVENOUS KIT [ixempra]',
 1211719: 'menthol 40mg/mL TOPICAL GEL [cbd pain freeze roll-on]',
 1211720: 'alcohol KIT [her lh2484 strawberry scented hand sanitizer kit]',
 1211721: 'menthol 40mg/g TOPICAL GEL [cbd pain freeze shrink]',
 1211722: 'alcohol 60g/100g TOPICAL GEL [hand sanitizer 63% - lltt]',
 1211723: 'miconazole nitrate 2g/85g TOPICAL POWDER [mycozyl ap]',
 1211724: 'bisacodyl 10mg/1 RECTAL SUPPOSITORY',
 1211725: 'benzalkonium chloride .13g/100mL TOPICAL LIQUID [zoono foot guard]',
 1211726: 'cetylpyridinium chloride .5mg/m

Extracting Patient Info for gastric cancer patients

In [122]:

# Create the concept mapping
concept_mapping = concept_df.set_index('concept_id')['concept_name'].to_dict()
concept_mapping = {str(k): v for k, v in concept_mapping.items()}
# Ensure column types match
person_df['person_id'] = person_df['person_id'].astype(int)
case_gastric_cancer['person_id'] = case_gastric_cancer['person_id'].astype(int)

# Merge demographics with the condition occurrence data
merged_df = case_gastric_cancer.merge(person_df, on='person_id', how='inner')

# Ensure `birth_datetime` and `condition_start_datetime` are datetime objects
merged_df['birth_datetime'] = pd.to_datetime(merged_df['birth_datetime'])
merged_df['condition_start_datetime'] = pd.to_datetime(merged_df['condition_start_datetime'])

# Create the dictionary
patient_data = {}

for pat_id, group in merged_df.groupby('person_id'):
    # Extract demographics: race, gender, and age at first diagnosis based on descendant concepts
    race = concept_mapping.get(str(group.iloc[0]['race_concept_id']), 'Unknown')
    gender = concept_mapping.get(str(group.iloc[0]['gender_concept_id']), 'Unknown')
    
    # Find the first diagnosis age for any condition in descendant_concepts
    age_of_first_diagnosis = None
    for condition_id, timestamp in zip(group['condition_concept_id'], group['condition_start_datetime']):
        if condition_id in descendant_concepts:
            age_of_first_diagnosis = (timestamp - group.iloc[0]['birth_datetime']).days / 365.25
            age_of_first_diagnosis = round(age_of_first_diagnosis, 2)
            break  # Stop after finding the first matching diagnosis

    # Construct demographics
    demographics = {
        'race': race,
        'gender': gender,
        'age_of_first_diagnosis': age_of_first_diagnosis,
    }
    
    # Extract sequence of diagnosis codes and timestamps
    diagnoses = group['condition_concept_id'].tolist()
    timestamps = group['condition_start_datetime'].tolist()
    
    # Construct the dictionary entry
    patient_data[pat_id] = {
        'demographics': demographics,
        'diagnoses': [concept_mapping.get(str(diagnosis), 'Unknown') for diagnosis in diagnoses],
        'timestamps': timestamps,
    }

# Extract and display the first 5 patients
first_5_patients = {pat_id: patient_data[pat_id] for pat_id in list(patient_data.keys())[:5]}

# Pretty print the first 5 patients
print("First 5 Patients with Mapped Diagnoses and Demographics (race, gender, age of first diagnosis based on descendant concepts):")
pprint.pprint(first_5_patients)



First 5 Patients with Mapped Diagnoses and Demographics (race, gender, age of first diagnosis based on descendant concepts):
{10001588: {'demographics': {'age_of_first_diagnosis': 60.41,
                             'gender': 'MALE',
                             'race': 'No matching concept'},
            'diagnoses': ['Primary malignant neoplasm of stomach'],
            'timestamps': [Timestamp('2002-05-29 00:00:00')]},
 10001719: {'demographics': {'age_of_first_diagnosis': 71.24,
                             'gender': 'FEMALE',
                             'race': 'No matching concept'},
            'diagnoses': ['Primary malignant neoplasm of stomach'],
            'timestamps': [Timestamp('2017-03-31 00:00:00')]},
 10003951: {'demographics': {'age_of_first_diagnosis': 50.07,
                             'gender': 'FEMALE',
                             'race': 'Unknown'},
            'diagnoses': ['Primary malignant neoplasm of pyloric antrum'],
            'timestamps': [Timestamp

## Determine missingness of race and gender columns

In [125]:
# Define all possible placeholders for missing data
missing_placeholders = {"No matching concept", "Unknown", None, "", float('nan')}

# Initialize dictionaries to count missing and complete data
no_matching_concept_counts = {}
complete_data_counts = {}

# Iterate through the patient data
for pat_id, data in patient_data.items():
    demographics = data['demographics']
    for key, value in demographics.items():
        # Check for missing data
        if value in missing_placeholders or pd.isna(value):
            if key not in no_matching_concept_counts:
                no_matching_concept_counts[key] = 0
            no_matching_concept_counts[key] += 1
        else:
            # Count non-missing data
            if key not in complete_data_counts:
                complete_data_counts[key] = 0
            complete_data_counts[key] += 1

# Display the results
print("Number of people with missing data for each demographic variable:")
for key, count in no_matching_concept_counts.items():
    print(f"{key}: {count}")

print("\nNumber of people with complete data for each demographic variable:")
for key, count in complete_data_counts.items():
    print(f"{key}: {count}")

Number of people with missing data for each demographic variable:
race: 973
gender: 2

Number of people with complete data for each demographic variable:
gender: 1642
age_of_first_diagnosis: 1644
race: 671


In [127]:
# Subset patient_data to include only patients with valid race, gender, and age at diagnosis
filtered_patient_data = {
    pat_id: data
    for pat_id, data in patient_data.items()
    if data['demographics']['race'] not in ['Unknown', 'No matching concept', "Other Race"] and
       data['demographics']['gender'] not in ['Unknown', 'No matching concept'] and
       data['demographics']['age_of_first_diagnosis'] is not None
}

# Display the count of filtered patients
print(f"Number of patients with valid race, gender, and age at diagnosis: {len(filtered_patient_data)}")

# Display the first few filtered patients
import pprint
print("First few filtered patients:")
pprint.pprint(dict(list(filtered_patient_data.items())[:5]))

Number of patients with valid race, gender, and age at diagnosis: 657
First few filtered patients:
{10005519: {'demographics': {'age_of_first_diagnosis': 54.0,
                             'gender': 'FEMALE',
                             'race': 'White'},
            'diagnoses': ['Primary malignant neoplasm of stomach'],
            'timestamps': [Timestamp('2004-01-01 00:00:00')]},
 10006492: {'demographics': {'age_of_first_diagnosis': 79.29,
                             'gender': 'MALE',
                             'race': 'White'},
            'diagnoses': ['Primary malignant neoplasm of stomach'],
            'timestamps': [Timestamp('1999-04-15 00:00:00')]},
 10007360: {'demographics': {'age_of_first_diagnosis': 90.0,
                             'gender': 'FEMALE',
                             'race': 'White'},
            'diagnoses': ['Primary malignant neoplasm of stomach'],
            'timestamps': [Timestamp('2002-01-01 00:00:00')]},
 10008359: {'demographics': {'age_of_f

## Descriptive stats of gender, race, age at first diagnosis

In [128]:
import pandas as pd
from collections import Counter

# Extract race, gender, and age of first diagnosis from filtered_patient_data
race_data = [data['demographics']['race'] for data in filtered_patient_data.values()]
gender_data = [data['demographics']['gender'] for data in filtered_patient_data.values()]
age_data = [data['demographics']['age_of_first_diagnosis'] for data in filtered_patient_data.values()]

# Compute counts for race and gender
race_counts = Counter(race_data)
gender_counts = Counter(gender_data)

# Convert age data to a DataFrame for numerical statistics
age_df = pd.DataFrame(age_data, columns=['Age at Diagnosis'])

# Summary statistics for age
age_summary = age_df.describe()

# Display results
print("Summary Statistics for Race:")
for race, count in race_counts.items():
    print(f"{race}: {count}")

print("\nSummary Statistics for Gender:")
for gender, count in gender_counts.items():
    print(f"{gender}: {count}")

print("\nSummary Statistics for Age at Diagnosis:")
print(age_summary)


Summary Statistics for Race:
White: 513
Black or African American: 121
American Indian or Alaska Native: 2
Asian: 18
Native Hawaiian or Other Pacific Islander: 3

Summary Statistics for Gender:
FEMALE: 305
MALE: 352

Summary Statistics for Age at Diagnosis:
       Age at Diagnosis
count        657.000000
mean          64.802100
std           15.087091
min            1.180000
25%           56.080000
50%           66.600000
75%           75.440000
max           97.250000


In [129]:
import pickle

# Specify the file path to save the pickle
pickle_file_path = "gastric_patient_data.pkl"

# Save the dictionary as a pickle file
with open(pickle_file_path, "wb") as pickle_file:
    pickle.dump(filtered_patient_data, pickle_file)

print(f"Patient data has been saved to {pickle_file_path}")


Patient data has been saved to gastric_patient_data.pkl


# SEER Matching

In [99]:
seer_data = pd.read_csv('R:\Kevin\Projects\LLM\pdac_tf\SEER data cleaned.csv')
seer_data

  seer_data = pd.read_csv('R:\Kevin\Projects\LLM\pdac_tf\SEER data cleaned.csv')


Unnamed: 0,Patient ID,Sex,"Race and origin recode (NHW, NHB, NHAIAN, NHAPI, Hispanic)","Race recode (W, B, AI, API)",Age recode with single ages and 85+,Rural-Urban Continuum Code,Median household income inflation adj to 2022,Marital status at diagnosis,Year of diagnosis,Survival months,...,CS lymph nodes (2004-2015),CS mets at dx (2004-2015),Mets at DX-Distant LN (2016+),Grade Clinical (2018+),Grade Pathological (2018+),RX Summ--Surg Prim Site (1998+),RX Summ--Scope Reg LN Sur (2003+),Reason no cancer-directed surgery,Year of follow-up recode,Type of Reporting Source
0,8644,Male,Non-Hispanic White,White,51 years,Counties in metropolitan areas ge 1 million pop,"$110,000 - $119,999",Unknown,2000,5,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),0,Blank(s),Surgery performed,2001,Hospital inpatient/outpatient or clinic
1,8734,Male,Non-Hispanic Black,Black,77 years,Counties in metropolitan areas ge 1 million pop,"$95,000 - $99,999",Unknown,2000,14,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),30,Blank(s),Surgery performed,2001,Hospital inpatient/outpatient or clinic
2,8753,Female,Non-Hispanic White,White,33 years,Counties in metropolitan areas ge 1 million pop,"$120,000+",Divorced,2002,234,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),0,Blank(s),"Recommended but not performed, unknown reason",2021,Hospital inpatient/outpatient or clinic
3,10708,Male,Non-Hispanic White,White,82 years,Counties in metropolitan areas ge 1 million pop,"$110,000 - $119,999",Married (including common law),2003,28,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),51,4 or more regional lymph nodes removed,Surgery performed,2006,Hospital inpatient/outpatient or clinic
4,11464,Male,Non-Hispanic White,White,54 years,Counties in metropolitan areas ge 1 million pop,"$95,000 - $99,999",Married (including common law),2000,9,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),0,Blank(s),Not recommended,2001,Hospital inpatient/outpatient or clinic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113472,63367655,Male,Non-Hispanic White,White,78 years,Nonmetropolitan counties not adjacent to a met...,"$40,000 - $44,999",Divorced,2021,Unknown,...,Blank(s),Blank(s),Unknown,9,9,99,Unknown or not applicable,Unknown; death certificate; or autopsy only (2...,2021,Death certificate only
113473,63367708,Male,Hispanic (All Races),White,68 years,Counties in metropolitan areas ge 1 million pop,"$70,000 - $74,999",Married (including common law),2021,Unknown,...,Blank(s),Blank(s),Unknown,9,9,99,Unknown or not applicable,Unknown; death certificate; or autopsy only (2...,2021,Death certificate only
113474,63368048,Male,Hispanic (All Races),White,72 years,Counties in metropolitan areas ge 1 million pop,"$100,000 - $109,999",Married (including common law),2021,Unknown,...,Blank(s),Blank(s),Unknown,9,9,99,Unknown or not applicable,Unknown; death certificate; or autopsy only (2...,2021,Death certificate only
113475,63368267,Male,Non-Hispanic Black,Black,77 years,Counties in metropolitan areas ge 1 million pop,"$100,000 - $109,999",Single (never married),2021,Unknown,...,Blank(s),Blank(s),Unknown,9,9,99,Unknown or not applicable,Unknown; death certificate; or autopsy only (2...,2021,Death certificate only


In [130]:
# Display unique race values in SEER's "Race recode (W, B, AI, API)" column
seer_race_values = seer_data['Race recode (W, B, AI, API)'].unique()
print("Unique race values in SEER's Race recode column:")
print(seer_race_values)

Unique race values in SEER's Race recode column:
['White' 'Black' 'Asian or Pacific Islander' 'Unknown'
 'American Indian/Alaska Native']


In [101]:
from collections import Counter

# Define the mapping between filtered_patient_data races and SEER races
race_mapping = {
    'White': 'White',
    'Black or African American': 'Black',
    'American Indian or Alaska Native': 'American Indian/Alaska Native',
    'Asian': 'Asian or Pacific Islander',
    'Native Hawaiian or Other Pacific Islander': 'Asian or Pacific Islander'
}

# Define the mapping for genders
gender_mapping = {
    'MALE': 'Male',
    'FEMALE': 'Female'
}

# Update the races and genders in filtered_patient_data
for pat_id, data in filtered_patient_data.items():
    # Map race
    original_race = data['demographics']['race']
    data['demographics']['race'] = race_mapping.get(original_race, 'Unknown')  # Default to 'Unknown' if not in mapping
    
    # Map gender
    original_gender = data['demographics']['gender']
    data['demographics']['gender'] = gender_mapping.get(original_gender, 'Unknown')  # Default to 'Unknown' if not in mapping

# Generate counters for race and gender
race_counter = Counter(data['demographics']['race'] for data in filtered_patient_data.values())
gender_counter = Counter(data['demographics']['gender'] for data in filtered_patient_data.values())

# Display updated race and gender counters
print("Race Counts:")
for race, count in race_counter.items():
    print(f"{race}: {count}")

print("\nGender Counts:")
for gender, count in gender_counter.items():
    print(f"{gender}: {count}")



Race Counts:
White: 513
Black: 121
American Indian/Alaska Native: 2
Asian or Pacific Islander: 21

Gender Counts:
Female: 305
Male: 352


In [102]:
# Prepare filtered_patient_data for matching
filtered_patient_df = pd.DataFrame([
    {
        'patient_id': pat_id,
        'race': data['demographics']['race'],
        'sex': data['demographics']['gender'],
        'age': data['demographics']['age_of_first_diagnosis']
    }
    for pat_id, data in filtered_patient_data.items()
])


In [133]:
# Pre-group SEER data by Race and Sex
seer_groups = seer_data.groupby(['Race recode (W, B, AI, API)', 'Sex'])

matched_pairs = []
used_seer_ids = set()

for _, patient in filtered_patient_df.iterrows():
    patient_id = patient['patient_id']
    patient_race = patient['Race recode (W, B, AI, API)']
    patient_sex = patient['Sex']
    patient_age = patient['age']
    
    # Get the relevant SEER group
    try:
        group = seer_groups.get_group((patient_race, patient_sex))
    except KeyError:
        continue  # Skip if no group matches the race and sex
    
    # Filter SEER group for valid age differences
    valid_seer_rows = group[(abs(group['Age recode with single ages and 85+'] - patient_age) <= 1.5)]
    
    # Find the first unused SEER patient
    for _, seer_row in valid_seer_rows.iterrows():
        seer_id = seer_row['Patient ID']
        if seer_id not in used_seer_ids:
            matched_pairs.append({
                'patient_id': patient_id,
                'seer_id': seer_id,
                'Race': patient_race,
                'Sex': patient_sex,
                'patient_age': patient_age,
                'seer_age': seer_row['Age recode with single ages and 85+']
            })
            used_seer_ids.add(seer_id)
            break  # Stop after finding the first valid match

matched_df = pd.DataFrame(matched_pairs)
matched_df.rename(columns={'patient_age': 'age_first_diagnosis_synthetic'}, inplace=True)
matched_df.rename(columns={'seer_age': 'age_first_diagnosis_seer'}, inplace=True)
print("Matched patients:")
print(matched_df.head())


Matched patients:
   patient_id  seer_id   Race     Sex  age_first_diagnosis_synthetic  \
0    10005519   788078  White  Female                          54.00   
1    10006492   783695  White    Male                          79.29   
2    10008359   796158  White  Female                          61.44   
3    10012670   784862  Black  Female                          60.00   
4    10018834   801168  Black  Female                          71.00   

   age_first_diagnosis_seer  
0                      54.0  
1                      80.0  
2                      62.0  
3                      61.0  
4                      71.0  


In [134]:
# Ensure the DataFrame 'matched_df' exists
file_path = r'R:\Kevin\Projects\LLM\pdac_tf\matched_pairs.csv'
matched_df.to_csv(file_path, index=False)

print(f"Matched pairs saved to {file_path}")


Matched pairs saved to R:\Kevin\Projects\LLM\pdac_tf\matched_pairs.csv
