In [2]:
# Using Simulacrum V2 instead
import os
import pandas as pd
import warnings 

1. Process the datasets with UTF-8 encoding

In [3]:
def read_file_with_encoding(file_path):
    """Read a CSV file into a DataFrame, trying several text encodings in turn.

    The candidates are tried from strictest to most permissive: UTF-8 first,
    then Windows-1252, then Latin-1 as the final fallback.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by the first encoding that decodes the file.

    Raises:
        ValueError: If every candidate encoding raises UnicodeDecodeError.
    """
    # Order matters. Latin-1 maps every possible byte to a character, so it
    # never raises and nothing listed after it would ever be tried. cp1252
    # therefore has to come before it, otherwise Windows-1252 punctuation
    # (curly quotes, dashes, the euro sign) is silently read as control
    # characters. 'ISO-8859-1' is just another name for latin1, so it is
    # not listed separately.
    encodings = ['utf-8', 'cp1252', 'latin1']

    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            # This encoding cannot represent the file's bytes; try the next.
            continue

    raise ValueError(f"Failed to decode {file_path} with available encodings.")

In [4]:
def read_file_utf8(file_path):
    """Load a CSV strictly as UTF-8 and report any pandas DtypeWarning.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame parsed from the file.

    Raises:
        ValueError: If the file cannot be decoded as UTF-8.
    """
    dtype_warning = pd.errors.DtypeWarning

    try:
        # Warnings raised while parsing are recorded rather than displayed so
        # that a single summary line can be printed for the file instead.
        with warnings.catch_warnings(record=True) as recorded:
            warnings.simplefilter("always", dtype_warning)
            frame = pd.read_csv(file_path, encoding='utf-8', low_memory=False)

            dtype_warning_seen = False
            for entry in recorded:
                if issubclass(entry.category, dtype_warning):
                    dtype_warning_seen = True
                    break

            if dtype_warning_seen:
                print(f"DtypeWarning encountered in file: {file_path}")
    except UnicodeDecodeError:
        raise ValueError(f"Failed to decode {file_path} with UTF-8 encoding.")

    return frame

In [7]:
# Folder holding the Simulacrum v2.1.0 CSV extracts; later cells reuse `directory`.
# NOTE(review): this is an absolute, machine-specific path - adjust it before
# running the notebook anywhere else.
directory = "/Users/janexu/Documents/0. 2024 3rd Year/SCIE30002/simulacrum_v2.1.0/Data"

# Only regular *.csv files are processed. os.listdir() also returns hidden
# files (e.g. macOS .DS_Store) and sub-folders, and those must never be parsed
# and then overwritten as CSV. Sorting keeps the processing order stable.
files = sorted(
    entry for entry in os.listdir(directory)
    if entry.lower().endswith('.csv')
    and os.path.isfile(os.path.join(directory, entry))
)

# Rewrite every CSV in place: UTF-8 encoded, with lowercase column names.
for file_name in files:
    file_path = os.path.join(directory, file_name)

    try:
        # Read the file with encoding handling
        df = read_file_with_encoding(file_path)

        # Convert column names to lowercase
        df.columns = [col.lower() for col in df.columns]

        # Save the updated file (overwrites the original on disk)
        df.to_csv(file_path, index=False, encoding='utf-8')

        print(f"Successfully processed and saved: {file_name}")

    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error processing {file_name}: {e}")

  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_rtds_exposure.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_rtds_prescription.csv
Successfully processed and saved: sim_av_patient.csv
Successfully processed and saved: sim_sact_outcome.csv
Successfully processed and saved: sim_sact_drug_detail.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_av_tumour.csv
Successfully processed and saved: sim_av_gene.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_rtds_episode.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_sact_cycle.csv
Successfully processed and saved: sim_sact_regimen.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_rtds_combined.csv


2. Make all field names lowercase and insert needed underscores

In [None]:
# Rename the sim_av_patient columns to snake_case and write the table back to
# the same CSV. The keys are the lowercase column names expected in the file.
patient_csv_path = os.path.join(directory, 'sim_av_patient.csv')
sim_av_patient = pd.read_csv(patient_csv_path)

# Current column name -> new column name.
new_field_names = {
    'patientid': 'patient_id',
    'gender': 'gender_patient',
    'ethnicity': 'ethnicity_patient',
    'deathcausecode_1a': 'death_cause_code_1a',
    'deathcausecode_1b': 'death_cause_code_1b',
    'deathcausecode_1c': 'death_cause_code_1c',
    'deathcausecode_2': 'death_cause_code_2',
    'deathlocationcode': 'death_location_code',
    'vitalstatus': 'vital_status',
    'linknumber': 'link_number'
}

# The original passed `implace=True`, which is not a valid keyword and made
# rename() raise TypeError. Reassigning the result avoids the keyword entirely.
sim_av_patient = sim_av_patient.rename(columns=new_field_names)
sim_av_patient.to_csv(patient_csv_path, index=False)

In [None]:
# Rename the sim_av_tumour columns to snake_case and write the table back to
# the same CSV. The keys are the lowercase column names expected in the file.
tumour_csv_path = os.path.join(directory, 'sim_av_tumour.csv')
sim_av_tumour = pd.read_csv(tumour_csv_path)

# Current column name -> new column name.
new_field_names = {
    'tumourid': 'tumour_id',
    'gender': 'gender_patient', #unsure about this one
    # Spelling corrected from 'diagonsis_date_best'.
    # NOTE(review): confirm nothing downstream relies on the misspelt name.
    'diagnosisdatebest': 'diagnosis_date_best',
    'site_icd10r4_o2_3char_from2013': 'site_icd10r4_o2_3char_from_2013',
    'site_icd10r4_o2_from2013': 'site_icd10r4_o2_from_2013',
    'morph_icdo3rev2011': 'morph_icdo3rev_2011',
    'behaviour_icdo3rev2011': 'behaviour_icdo3rev_2011',
    'behaviour_icdorev2011': 'behaviour_icdorev_2011',
    'screeningstatusfull_code': 'screening_status_full_code',
    'cancercareplanintent': 'cancer_care_plan_intent',
    'performancestatus': 'performance_status'
}

# The original passed `implace=True`, which is not a valid keyword and made
# rename() raise TypeError. Reassigning the result avoids the keyword entirely.
sim_av_tumour = sim_av_tumour.rename(columns=new_field_names)

# Write back to the file that was read; the original omitted the '.csv'
# extension and would have created a separate extension-less file.
sim_av_tumour.to_csv(tumour_csv_path, index=False)

In [None]:
# Rename the sim_av_gene columns to snake_case and write the table back to
# the same CSV. The keys are the lowercase column names expected in the file.
gene_csv_path = os.path.join(directory, 'sim_av_gene.csv')
sim_av_gene = pd.read_csv(gene_csv_path)

# Current column name -> new column name.
new_field_names = {
    'geneid': 'gene_id',
    'tumourid': 'tumour_id',
    'patientid': 'patient_id',
    'all_teststatuses': 'all_test_statuses'
}

# The original passed `implace=True`, which is not a valid keyword and made
# rename() raise TypeError. Reassigning the result avoids the keyword entirely.
sim_av_gene = sim_av_gene.rename(columns=new_field_names)

# Two fixes here: `index+False` (a NameError) becomes `index=False`, and the
# output path now carries the '.csv' extension so the file that was read is
# the one that gets updated.
sim_av_gene.to_csv(gene_csv_path, index=False)

In [None]:

# Rename the sim_rtds_episode columns to snake_case and write the table back
# to the same CSV. The original read and wrote 'sim_rtds_episode' without the
# '.csv' extension, which does not match the file names in the data folder.
episode_csv_path = os.path.join(directory, 'sim_rtds_episode.csv')
sim_rtds_episode = pd.read_csv(episode_csv_path)

# Current column name -> new column name.
# NOTE(review): "radiotherapy" is split as 'radio_therapy_' for the episode id
# and intent columns but kept whole in 'radiotherapy_priority' - confirm which
# spelling the target schema expects.
new_field_names = {
    'patientid': 'patient_id',
    'radiotherapyepisodeid': 'radio_therapy_episode_id',
    'attendid': 'attend_id',
    'apptdate': 'appt_date',
    'linkcode': 'link_code',
    'decisiontotreatdate': 'decision_to_treat_date',
    'earliestclinappropdate': 'earliest_clin_approp_date',
    'radiotherapypriority': 'radiotherapy_priority',
    "radiotherapyintent": 'radio_therapy_intent'
}

# The original passed `implace=True`, which is not a valid keyword and made
# rename() raise TypeError. Reassigning the result avoids the keyword entirely.
sim_rtds_episode = sim_rtds_episode.rename(columns=new_field_names)

# `index+False` (a NameError) corrected to the keyword argument `index=False`.
sim_rtds_episode.to_csv(episode_csv_path, index=False)

In [None]:
sim_rtds_prescription = pd.read_csv(os.path.join(directory, 'sim_rtds_prescription'))
