In [13]:
# Using Simulacrum V2 instead
import os
import pandas as pd
import warnings 

### 1. Read the datasets with utf-8 encoding and make all column names lower case

In [14]:
def read_file_with_encoding(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding)
            return df
        except UnicodeDecodeError:
            continue
    
    raise ValueError(f"Failed to decode {file_path} with available encodings.")

In [15]:
def read_file_utf8(file_path):
    try:
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always", pd.errors.DtypeWarning)
            df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
            
            # Check if any warnings were triggered
            if any(issubclass(warn.category, pd.errors.DtypeWarning) for warn in w):
                print(f"DtypeWarning encountered in file: {file_path}")
            
        return df
    
    except UnicodeDecodeError:
        raise ValueError(f"Failed to decode {file_path} with UTF-8 encoding.")

In [16]:
#directory = "../datasets/simulacrum_v2.1.0/Data"
directory = "/Users/janexu/Documents/0. 2024 3rd Year/SCIE30002/simulacrum_v2.1.0/Data"
files = os.listdir(directory)

for file_name in files:
    file_path = os.path.join(directory, file_name)
    
    try:
        # Read the file with encoding handling
        df = read_file_with_encoding(file_path)
        
        # Convert column names to lowercase
        df.columns = [col.lower() for col in df.columns]
        
        # Save the updated file
        df.to_csv(file_path, index=False, encoding = 'utf-8')
        
        print(f"Successfully processed and saved: {file_name}")
        
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_rtds_exposure.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_rtds_prescription.csv
Successfully processed and saved: sim_av_patient.csv
Successfully processed and saved: sim_sact_outcome.csv
Successfully processed and saved: sim_sact_drug_detail.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_av_tumour.csv
Successfully processed and saved: sim_sact_regimen_new.csv
Successfully processed and saved: sim_av_gene.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_rtds_episode.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_sact_cycle.csv
Successfully processed and saved: sim_sact_regimen.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_rtds_combined.csv


### 2. Insert underscores in column names for REDCap upload

#### AV Files (Patient, Tumour, Gene)

In [17]:
sim_av_patient = pd.read_csv(os.path.join(directory, 'sim_av_patient.csv'))

new_field_names = {
    'patientid': 'patient_id',
    'gender': 'gender_patient',
    'ethnicity': 'ethnicity_patient',
    'deathcausecode_1a': 'death_cause_code_1a',
    'deathcausecode_1b': 'death_cause_code_1b',
    'deathcausecode_1c': 'death_cause_code_1c',
    'deathcausecode_2': 'death_cause_code_2',
    'deathcausecode_underlying': 'death_cause_code_underlying',
    'deathlocationcode': 'death_location_code',
    'vitalstatus': 'vital_status',
    'vitalstatusdate': 'vital_status_date',
    'linknumber': 'link_number'
}

sim_av_patient.rename(columns=new_field_names, inplace=True)
sim_av_patient.to_csv(os.path.join(directory, 'sim_av_patient.csv'), index=False, encoding = 'utf-8')

In [18]:
sim_av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv'))

new_field_names = {
    'tumourid': 'tumour_id',
    'patientid': 'patient_id',
    'gender': 'gender_patient', #unsure about this one
    'diagnosisdatebest': 'diagonsis_date_best',
    'site_icd10r4_o2_3char_from2013': 'site_icd10r4_o2_3char_from_2013',
    'site_icd10r4_o2_from2013': 'site_icd10r4_o2_from_2013',
    'site_icdo3rev2011': 'site_icdo3rev_2011',
    'morph_icdo3rev2011': 'morph_icdo3rev_2011',
    'behaviour_icdo3rev2011': 'behaviour_icdo3rev_2011',
    'screeningstatusfull_code': 'screening_status_full_code',
    'cancercareplanintent': 'cancer_care_plan_intent',
    'performancestatus': 'performance_status'
}

sim_av_tumour.rename(columns=new_field_names, inplace=True)
sim_av_tumour.to_csv(os.path.join(directory, 'sim_av_tumour.csv'), index=False, encoding = 'utf-8')

  sim_av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv'))


In [19]:
sim_av_gene = pd.read_csv(os.path.join(directory, 'sim_av_gene.csv'))

new_field_names = {
    'geneid': 'gene_id',
    'tumourid': 'tumour_id',
    'patientid': 'patient_id',
    'all_teststatuses': 'all_test_statuses'
}

sim_av_gene.rename(columns=new_field_names, inplace=True)
sim_av_gene.to_csv(os.path.join(directory, 'sim_av_gene.csv'), index=False, encoding = 'utf-8')

#### RTDS Files (Episode, Exposure, Prescription, Combined)

In [20]:
sim_rtds_episode = pd.read_csv(os.path.join(directory, 'sim_rtds_episode.csv'))

new_field_names = {
    'patientid': 'patient_id',
    'radiotherapyepisodeid': 'radiotherapy_episode_id',
    'attendid': 'attend_id',
    'apptdate': 'appt_date',
    'linkcode': 'link_code',
    'decisiontotreatdate': 'decision_to_treat_date',
    'earliestclinappropdate': 'earliest_clin_approp_date',
    'radiotherapypriority': 'radiotherapy_priority',
    'radiotherapyintent': 'radiotherapy_intent'
}

sim_rtds_episode.rename(columns=new_field_names, inplace=True)
sim_rtds_episode.to_csv(os.path.join(directory, 'sim_rtds_episode.csv'), index=False, encoding = 'utf-8')

  sim_rtds_episode = pd.read_csv(os.path.join(directory, 'sim_rtds_episode.csv'))


In [21]:
sim_rtds_prescription = pd.read_csv(os.path.join(directory, 'sim_rtds_prescription.csv'))

new_field_names = {
    'patientid': 'patient_id',
    'prescriptionid': 'prescription_id',
    'rttreatmentmodality': 'rt_treatment_modality',
    'rtprescribeddose': 'rt_prescribed_dose',
    'rtprescribedfractions': 'rt_prescribed_fractions',
    'rtactualdose': 'rt_actual_dose',
    'rtactualfractions': 'rt_actual_fractions',
    'rttreatmentregion': 'rt_treatment_region',
    'rttreatmentanatomicalsite': 'rt_treatment_anatomical_site',
    'radiotherapyepisodeid': 'radiotherapy_episode_id',
    'linkcode': 'link_code',
    'attendid': 'attend_id',
    'apptdate': 'appt_date',
}

sim_rtds_prescription.rename(columns=new_field_names, inplace=True)
sim_rtds_prescription.to_csv(os.path.join(directory, 'sim_rtds_prescription.csv'), index=False, encoding = 'utf-8')

  sim_rtds_prescription = pd.read_csv(os.path.join(directory, 'sim_rtds_prescription.csv'))


In [22]:
sim_rtds_exposure = pd.read_csv(os.path.join(directory, 'sim_rtds_exposure.csv'))

new_field_names = {
    'prescriptionid': 'prescription_id',
    'radioisotope': 'radio_isotope',
    'rtprescribeddose': 'rt_prescribed_dose',
    'radiotherapybeamtype': 'radiotherapy_beam_type',
    'radiotherapybeamenergy': 'radiotherapy_beam_energy',
    'timeofexposure': 'time_of_exposure',
    'apptdate': 'appt_date',
    'attendid': 'attend_id',
    'patientid': 'patient_id',
    'radiotherapyepisodeid': 'radiotherapy_episode_id',
    'linkcode': 'link_code',
}

sim_rtds_exposure.rename(columns=new_field_names, inplace=True)
sim_rtds_exposure.to_csv(os.path.join(directory, 'sim_rtds_exposure.csv'), index=False, encoding = 'utf-8')

  sim_rtds_exposure = pd.read_csv(os.path.join(directory, 'sim_rtds_exposure.csv'))


In [23]:
sim_rtds_combined = pd.read_csv(os.path.join(directory, 'sim_rtds_combined.csv'))
# a df combining prescription, episode and exposure (contains all their columns)

new_field_names = {
    'patientid': 'patient_id',
    'prescriptionid': 'prescription_id', # from prescription
    'rttreatmentmodality': 'rt_treatment_modality', # prescription
    'radiotherapypriority': 'radiotherapy_priority', # episode
    'radiotherapyintent': 'radiotherapy_intent', # episode
    'rtprescribeddose': 'rt_prescribed_dose', # prescription
    'rtprescribedfractions': 'rt_prescribed_fractions', # prescription
    'rtactualdose': 'rt_actual_dose', # prescription
    'rtactualfractions': 'rt_actual_fractions', # prescription
    'rttreatmentregion': 'rt_treatment_region', # prescription
    'rttreatmentanatomicalsite': 'rt_treatment_anatomical_site', # prescription
    'decisiontotreatdate': 'decision_to_treat_date', # episode
    'earliestclinappropdate': 'earliest_clin_approp_date', # episode
    'radiotherapyepisodeid': 'radiotherapy_episode_id', # episode
    'linkcode': 'link_code',
    'radioisotope': 'radio_isotope', # exposure
    'radiotherapybeamtype': 'radiotherapy_beam_type', # exposure
    'radiotherapybeamenergy': 'radiotherapy_beam_energy', # exposure
    'timeofexposure': 'time_of_exposure', # exposure
    'apptdate': 'appt_date',
    'attendid': 'attend_id',
    'patientid': 'patient_id',
}

sim_rtds_combined.rename(columns=new_field_names, inplace=True)
sim_rtds_combined.to_csv(os.path.join(directory, 'sim_rtds_combined.csv'), index=False, encoding = 'utf-8')

  sim_rtds_combined = pd.read_csv(os.path.join(directory, 'sim_rtds_combined.csv'))


KeyboardInterrupt: 

#### SACT Files (Regimen, Cycle, Drug Details, Outcome)

Columns don't need renaming as they all already have proper underscores

In [None]:
sim_sact_regimen = pd.read_csv(os.path.join(directory, 'sim_sact_regimen.csv')) # shape: (781,389 x 12)
sim_sact_cycle = pd.read_csv(os.path.join(directory, 'sim_sact_cycle.csv')) # shape: (2,741,674 x 6)
sim_sact_drug_detail = pd.read_csv(os.path.join(directory, 'sim_sact_drug_detail.csv')) # shape: (7,662,030 x 7)
sim_sact_outcome = pd.read_csv(os.path.join(directory, 'sim_sact_outcome.csv')) # shape: (784,135 x 6)

  sim_sact_cycle = pd.read_csv(os.path.join(directory, 'sim_sact_cycle.csv')) # shape: (2,741,674 x 6)


Dropping Irrelevant Fields

In [None]:
# Dropping start_date in sim_sact_regimen and just keeping sact_cycle as a more accurate starting date
sim_sact_regimen.drop(['start_date_of_regimen', 'height_at_start_of_regimen', 'weight_at_start_of_regimen'], axis=1, inplace=True)
sim_sact_regimen.to_csv(os.path.join(directory, 'sim_sact_regimen_new.csv'), index=False)


Looking to drop records where males have gynaecological cancer or females have prostrate cancer (assuming a patient's gender is their biological/assigned at birth sex). Can only do this by finding matching cancer tumour genes in av_gene. Unsure of accuracy due to lack of knowledge/background in biology. 

Gynaecological Cancer Tumour Genes (present in dataset):
1. BRCA1 (code 7), BRCA2 (code 8)
2. TP53 (code 79)
3. CDH1 (code 794)
4. STK11 (code 76)

Prostrate Cancer Tumour Genes (present in dataset):
1. ATM (code 794)
2. BRCA1 (code 7), BRCA2 (code 8)

BRCA1 and BCRA2 are both associated with F&M reproductive area cancers so may need to exclude this from filtering list.

ALTERNATIVE:
Use the 'site_icd_x' field variables to find relevant disease. 
1. Prostrate cancer: C61 [C60-63 are male genital organs]
2. Gynaecological cancers: C51-52 (51: vulva & 52: vagina), C53 (cervix), C54-55 (54: corpus uteri, 55: unspecified part in uterus), C56 (ovary), C57-58 (57: other/unspecified female genital organs, 58: placenta) [C51-58 are female genital organs]

Gender Codes:
M: 1
F: 1
Indeterminate: 9


### Merging SACT Files into one

Unique Regimen ID count:
- In sim_sact_outcome: 784,135
- In sim_sact_regimen: 781,389
- In sim_sact_cycle: 756,595 (24,794 less than regimen)
- In final merged df: 751,911 (29,478 less than regimen)

Unique Cycle ID count:
- In sim_sact_cycle: 2,741,674
- In sim_sact_drug_detail: 2,729,567 (12,107 less than cycle)

Note: there are some discrepancies between the number of unique regimen ID's in the original regimen file and the final merged dataframe. This may be because some records don't have corresponding records in other tables (e.g. some regimens do not have corresponding cycles, and some cycles do not have drug details)

In [None]:
directory = "../datasets/simulacrum_v2.1.0/Data"

In [None]:
# Drug + Cycle
drug_cycle = pd.merge(sim_sact_drug_detail, sim_sact_cycle, on='merged_cycle_id', how='left') # shape: (7,662,030 x 12)

# Drug + Cycle + Regimen
drug_cycle_regimen = pd.merge(drug_cycle, sim_sact_regimen, on='merged_regimen_id', how='left') # shape: (7,662,030 x 23)

# All SACT Files
sact = pd.merge(drug_cycle_regimen, sim_sact_outcome, on='merged_regimen_id', how='left') # shape: (7,662,030 x 28)

### Merging AV Files into one

In [None]:
sim_av_patient = pd.read_csv(os.path.join(directory, 'sim_av_patient.csv')) # shape: (1,871,605 x 12)
sim_av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv')) # shape: (1,995,570 x 37)
sim_av_gene = pd.read_csv(os.path.join(directory, 'sim_av_gene.csv')) # shape: (255,728 x 29)

  sim_av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv')) # shape: (1,995,570 x 37)


In [None]:
tumour_patient = pd.merge(sim_av_tumour, sim_av_patient, on='patient_id', how='left') # shape: (1,995,570 x 48)

# Note: not all tumours have corresponding gene records
av = pd.merge(tumour_patient, sim_av_gene, on='tumour_id', how='left') # shape: (2,154,804 x 76)
print(av.columns)

Index(['tumour_id', 'gender_patient_x', 'patient_id_x', 'diagonsis_date_best',
       'site_icd10_o2_3char', 'site_icd10_o2',
       'site_icd10r4_o2_3char_from_2013', 'site_icd10r4_o2_from_2013',
       'site_icdo3rev_2011', 'site_icdo3rev2011_3char', 'morph_icd10_o2',
       'morph_icdo3rev_2011', 'behaviour_icd10_o2', 'behaviour_icdo3rev_2011',
       't_best', 'n_best', 'm_best', 'stage_best', 'grade', 'age', 'creg_code',
       'stage_best_system', 'laterality', 'screening_status_full_code',
       'er_status', 'pr_status', 'her2_status', 'quintile_2019',
       'date_first_surgery', 'cancer_care_plan_intent', 'performance_status',
       'chrl_tot_27_03', 'comorbidities_27_03', 'gleason_primary',
       'gleason_secondary', 'gleason_tertiary', 'gleason_combined',
       'gender_patient_y', 'ethnicity_patient', 'death_cause_code_1a',
       'death_cause_code_1b', 'death_cause_code_1c', 'death_cause_code_2',
       'death_cause_code_underlying', 'death_location_code', 'vital_status

Looking to drop records where males have gynaecological cancer or females have prostrate cancer (assuming a patient's gender is their biological/assigned at birth sex). Can only do this by finding matching cancer tumour genes in av_gene. Unsure of accuracy due to lack of knowledge/background in biology. 

Gynaecological Cancer Tumour Genes (present in dataset):
1. BRCA1 (code 7), BRCA2 (code 8)
2. TP53 (code 79)
3. CDH1 (code 794)
4. STK11 (code 76)

Prostrate Cancer Tumour Genes (present in dataset):
1. ATM (code 794)
2. BRCA1 (code 7), BRCA2 (code 8)

BRCA1 and BCRA2 are both associated with F&M reproductive area cancers so may need to exclude this from filtering list.

ALTERNATIVE:
Use the 'site_icd_x' field variables to find relevant disease. 
1. Prostrate cancer: C61
2. Gynaecological cancers: C51-52 (51: vulva & 52: vagina), C53 (cervix), C54-55 (54: corpus uteri, 55: unspecified part in uterus), C56 (ovary), C57-58 (57: other/unspecified female genital organs, 58: placenta)

Gender Codes:
M: 1
F: 1
Indeterminate: 9

Going to use site_icd10_o2_3char

In [None]:
# Querying to see if gender values match across av_patient and av_tumour
matching = av['gender_patient_x'] == av['gender_patient_y']
all_match = matching.all()

print(all_match)

NameError: name 'av' is not defined

In [None]:
# Trying to delete records where males have gynaecological cancer/females have prostrate cancer
prostrate_cancer_df = av['site_icd10_o2_3char' == 'C61']
print(prostrate_cancer_df)

NameError: name 'av' is not defined

### Playground

In [None]:
sact_regimen_ids = set(sact['merged_regimen_id'].unique())
regimen_regimen_ids = set(sim_sact_regimen['merged_regimen_id'].unique())

# Find regimen id's that aren't in the final merged df
missing_regimen_ids = regimen_regimen_ids - sact_regimen_ids

# Filter the sim_sact_regimen df for these missing regimen_id's
missing_regimens = sim_sact_regimen[sim_sact_regimen['merged_regimen_id'].isin(missing_regimen_ids)]

missing_regimens.head()

Unnamed: 0,encore_patient_id,merged_regimen_id,height_at_start_of_regimen,weight_at_start_of_regimen,intent_of_treatment,date_decision_to_treat,start_date_of_regimen,mapped_regimen,clinical_trial,chemo_radiation,benchmark_group,link_number
72,10405821,10030812,0.0,0.0,P,2019-09-25,2019-10-24,Hydroxycarbamide,2.0,N,HYDROXYCARBAMIDE,101836626
135,10407709,10030955,0.0,55.0,,2019-12-09,2019-12-09,Hydroxycarbamide,,,HYDROXYCARBAMIDE,100912531
179,10409421,10031066,0.0,,P,,2019-12-01,,2.0,N,ENZALUTAMIDE,100316506
188,10409813,10031091,0.0,,,,2018-12-28,,2.0,,ENZALUTAMIDE,101470000
350,10415146,10031491,0.0,,P,,2021-04-21,,2.0,,HYDROXYCARBAMIDE,100617606
