In [30]:
# Using Simulacrum V2 instead
import os
import pandas as pd
import warnings 

### 1. Read the datasets with utf-8 encoding and make all column names lower case

In [31]:
def read_file_with_encoding(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding)
            return df
        except UnicodeDecodeError:
            continue
    
    raise ValueError(f"Failed to decode {file_path} with available encodings.")

In [32]:
def read_file_utf8(file_path):
    try:
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always", pd.errors.DtypeWarning)
            df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
            
            # Check if any warnings were triggered
            if any(issubclass(warn.category, pd.errors.DtypeWarning) for warn in w):
                print(f"DtypeWarning encountered in file: {file_path}")
            
        return df
    
    except UnicodeDecodeError:
        raise ValueError(f"Failed to decode {file_path} with UTF-8 encoding.")

In [None]:
directory = "../datasets/simulacrum_v2.1.0/Data"
# directory = "/Users/janexu/Documents/0. 2024 3rd Year/SCIE30002/simulacrum_v2.1.0/Data"
files = os.listdir(directory)

for file_name in files:
    file_path = os.path.join(directory, file_name)
    
    try:
        # Read the file with encoding handling
        df = read_file_with_encoding(file_path)
        
        # Convert column names to lowercase
        df.columns = [col.lower() for col in df.columns]
        
        # Save the updated file
        df.to_csv(file_path, index=False, encoding = 'utf-8')
        
        print(f"Successfully processed and saved: {file_name}")
        
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

### 2. Insert underscores in column names for REDCap upload

#### AV Files (Patient, Tumour, Gene)

In [34]:
sim_av_patient = pd.read_csv(os.path.join(directory, 'sim_av_patient.csv'))

new_field_names = {
    'patientid': 'patient_id',
    'gender': 'gender_patient',
    'ethnicity': 'ethnicity_patient',
    'deathcausecode_1a': 'death_cause_code_1a',
    'deathcausecode_1b': 'death_cause_code_1b',
    'deathcausecode_1c': 'death_cause_code_1c',
    'deathcausecode_2': 'death_cause_code_2',
    'deathcausecode_underlying': 'death_cause_code_underlying',
    'deathlocationcode': 'death_location_code',
    'vitalstatus': 'vital_status',
    'vitalstatusdate': 'vital_status_date',
    'linknumber': 'link_number'
}

sim_av_patient.rename(columns=new_field_names, inplace=True)
sim_av_patient.to_csv(os.path.join(directory, 'sim_av_patient.csv'), index=False, encoding = 'utf-8')

In [36]:
sim_av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv'))

new_field_names = {
    'tumourid': 'tumour_id',
    'gender': 'gender_patient', #unsure about this one
    'diagnosisdatebest': 'diagonsis_date_best',
    'site_icd10r4_o2_3char_from2013': 'site_icd10r4_o2_3char_from_2013',
    'site_icd10r4_o2_from2013': 'site_icd10r4_o2_from_2013',
    'site_icdo3rev2011': 'site_icdo3rev_2011',
    'morph_icdo3rev2011': 'morph_icdo3rev_2011',
    'behaviour_icdo3rev2011': 'behaviour_icdo3rev_2011',
    'screeningstatusfull_code': 'screening_status_full_code',
    'cancercareplanintent': 'cancer_care_plan_intent',
    'performancestatus': 'performance_status'
}

sim_av_tumour.rename(columns=new_field_names, inplace=True)
sim_av_tumour.to_csv(os.path.join(directory, 'sim_av_tumour.csv'), index=False, encoding = 'utf-8')

  sim_av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv'))


In [37]:
sim_av_gene = pd.read_csv(os.path.join(directory, 'sim_av_gene.csv'))

new_field_names = {
    'geneid': 'gene_id',
    'tumourid': 'tumour_id',
    'patientid': 'patient_id',
    'all_teststatuses': 'all_test_statuses'
}

sim_av_gene.rename(columns=new_field_names, inplace=True)
sim_av_gene.to_csv(os.path.join(directory, 'sim_av_gene.csv'), index=False, encoding = 'utf-8')

#### RTDS Files (Episode, Exposure, Prescription, Combined)

In [38]:
sim_rtds_episode = pd.read_csv(os.path.join(directory, 'sim_rtds_episode.csv'))

new_field_names = {
    'patientid': 'patient_id',
    'radiotherapyepisodeid': 'radiotherapy_episode_id',
    'attendid': 'attend_id',
    'apptdate': 'appt_date',
    'linkcode': 'link_code',
    'decisiontotreatdate': 'decision_to_treat_date',
    'earliestclinappropdate': 'earliest_clin_approp_date',
    'radiotherapypriority': 'radiotherapy_priority',
    'radiotherapyintent': 'radiotherapy_intent'
}

sim_rtds_episode.rename(columns=new_field_names, inplace=True)
sim_rtds_episode.to_csv(os.path.join(directory, 'sim_rtds_episode.csv'), index=False, encoding = 'utf-8')

  sim_rtds_episode = pd.read_csv(os.path.join(directory, 'sim_rtds_episode.csv'))


In [39]:
sim_rtds_prescription = pd.read_csv(os.path.join(directory, 'sim_rtds_prescription.csv'))

new_field_names = {
    'patientid': 'patient_id',
    'prescriptionid': 'prescription_id',
    'rttreatmentmodality': 'rt_treatment_modality',
    'rtprescribeddose': 'rt_prescribed_dose',
    'rtprescribedfractions': 'rt_prescribed_fractions',
    'rtactualdose': 'rt_actual_dose',
    'rtactualfractions': 'rt_actual_fractions',
    'rttreatmentregion': 'rt_treatment_region',
    'rttreatmentanatomicalsite': 'rt_treatment_anatomical_site',
    'radiotherapyepisodeid': 'radiotherapy_episode_id',
    'linkcode': 'link_code',
    'attendid': 'attend_id',
    'apptdate': 'appt_date',
}

sim_rtds_prescription.rename(columns=new_field_names, inplace=True)
sim_rtds_prescription.to_csv(os.path.join(directory, 'sim_rtds_prescription.csv'), index=False, encoding = 'utf-8')

  sim_rtds_prescription = pd.read_csv(os.path.join(directory, 'sim_rtds_prescription.csv'))


In [40]:
sim_rtds_exposure = pd.read_csv(os.path.join(directory, 'sim_rtds_exposure.csv'))

new_field_names = {
    'prescriptionid': 'prescription_id',
    'radioisotope': 'radio_isotope',
    'rtprescribeddose': 'rt_prescribed_dose',
    'radiotherapybeamtype': 'radiotherapy_beam_type',
    'radiotherapybeamenergy': 'radiotherapy_beam_energy',
    'timeofexposure': 'time_of_exposure',
    'apptdate': 'appt_date',
    'attendid': 'attend_id',
    'patientid': 'patient_id',
    'radiotherapyepisodeid': 'radiotherapy_episode_id',
    'linkcode': 'link_code',
}

sim_rtds_exposure.rename(columns=new_field_names, inplace=True)
sim_rtds_exposure.to_csv(os.path.join(directory, 'sim_rtds_exposure.csv'), index=False, encoding = 'utf-8')

  sim_rtds_exposure = pd.read_csv(os.path.join(directory, 'sim_rtds_exposure.csv'))


In [41]:
sim_rtds_combined = pd.read_csv(os.path.join(directory, 'sim_rtds_combined.csv'))
# a df combining prescription, episode and exposure (contains all their columns)

new_field_names = {
    'patientid': 'patient_id',
    'prescriptionid': 'prescription_id', # from prescription
    'rttreatmentmodality': 'rt_treatment_modality', # prescription
    'radiotherapypriority': 'radiotherapy_priority', # episode
    'radiotherapyintent': 'radiotherapy_intent', # episode
    'rtprescribeddose': 'rt_prescribed_dose', # prescription
    'rtprescribedfractions': 'rt_prescribed_fractions', # prescription
    'rtactualdose': 'rt_actual_dose', # prescription
    'rtactualfractions': 'rt_actual_fractions', # prescription
    'rttreatmentregion': 'rt_treatment_region', # prescription
    'rttreatmentanatomicalsite': 'rt_treatment_anatomical_site', # prescription
    'decisiontotreatdate': 'decision_to_treat_date', # episode
    'earliestclinappropdate': 'earliest_clin_approp_date', # episode
    'radiotherapyepisodeid': 'radiotherapy_episode_id', # episode
    'linkcode': 'link_code',
    'radioisotope': 'radio_isotope', # exposure
    'radiotherapybeamtype': 'radiotherapy_beam_type', # exposure
    'radiotherapybeamenergy': 'radiotherapy_beam_energy', # exposure
    'timeofexposure': 'time_of_exposure', # exposure
    'apptdate': 'appt_date',
    'attendid': 'attend_id',
    'patientid': 'patient_id',
}

sim_rtds_combined.rename(columns=new_field_names, inplace=True)
sim_rtds_combined.to_csv(os.path.join(directory, 'sim_rtds_combined.csv'), index=False, encoding = 'utf-8')

  sim_rtds_combined = pd.read_csv(os.path.join(directory, 'sim_rtds_combined.csv'))


#### SACT Files (Regimen, Cycle, Drug Details, Outcome)

Columns don't need renaming as they all already have proper underscores

In [42]:
sim_sact_regimen = pd.read_csv(os.path.join(directory, 'sim_sact_regimen.csv'))
sim_sact_cycle = pd.read_csv(os.path.join(directory, 'sim_sact_cycle.csv'))
sim_sact_drug_detail = pd.read_csv(os.path.join(directory, 'sim_sact_drug_detail.csv'))
sim_sact_outcome = pd.read_csv(os.path.join(directory, 'sim_sact_outcome.csv'))

  sim_sact_cycle = pd.read_csv(os.path.join(directory, 'sim_sact_cycle.csv'))


### Merging SACT Files into one
Cycles link to the regimen table by MERGED_REGIMEN_ID. 
Drug administrations can be linked to the cycle table by MERGED_CYCLE_ID.
Outcomes link to the regimen table by MERGED_REGIMEN_ID.

In [None]:
# Regimen + Cycle
