## Redefining Field Variables

In [2]:
import os
import pandas as pd

In [10]:
def read_file_with_encoding(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding)
            return df
        except UnicodeDecodeError:
            continue
    
    raise ValueError(f"Failed to decode {file_path} with available encodings.")

In [11]:
# Define the directory of files (you should modify it to match your local directory)
directory = "../simulacrum_release_v1.2.0.2017/data/"
files = os.listdir(directory)

for file_name in files:
    file_path = os.path.join(directory, file_name)
    
    try:
        # Read the file with encoding handling
        df = read_file_with_encoding(file_path)
        
        # Convert column names to lowercase
        df.columns = [col.lower() for col in df.columns]
        
        # Save the updated file
        df.to_csv(file_path, index=False)
        
        print(f"Successfully processed and saved: {file_name}")
        
    except Exception as e:
        print(f"Error processing {file_name}: {e}")


Successfully processed and saved: .DS_Store
Successfully processed and saved: sim_av_patient.csv
Successfully processed and saved: sim_sact_outcome.csv
Successfully processed and saved: sim_sact_drug_detail.csv
Successfully processed and saved: sim_sact_patient.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_av_tumour.csv
Successfully processed and saved: sim_sact_tumour.csv


  df = pd.read_csv(file_path, encoding=encoding)


Successfully processed and saved: sim_sact_cycle.csv
Successfully processed and saved: sim_sact_regimen.csv


In [15]:
av_patient = pd.read_csv(os.path.join(directory, 'sim_av_patient.csv'))
av_patient.columns

Index(['patientid', 'sex', 'linknumber', 'ethnicity', 'deathcausecode_1a',
       'deathcausecode_1b', 'deathcausecode_1c', 'deathcausecode_2',
       'deathcausecode_underlying', 'deathlocationcode', 'newvitalstatus',
       'vitalstatusdate'],
      dtype='object')

In [16]:
av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv'))
av_tumour.columns

  av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv'))


Index(['tumourid', 'patientid', 'diagnosisdatebest', 'site_icd10_o2',
       'site_icd10_o2_3char', 'morph_icd10_o2', 'behaviour_icd10_o2', 't_best',
       'n_best', 'm_best', 'stage_best', 'stage_best_system', 'grade', 'age',
       'sex', 'creg_code', 'linknumber', 'screeningstatusfull_code',
       'er_status', 'er_score', 'pr_status', 'pr_score', 'her2_status',
       'cancercareplanintent', 'performancestatus', 'cns', 'ace27',
       'gleason_primary', 'gleason_secondary', 'gleason_tertiary',
       'gleason_combined', 'date_first_surgery', 'laterality',
       'quintile_2015'],
      dtype='object')

In [17]:
sact_patient = pd.read_csv(os.path.join(directory, 'sim_sact_patient.csv'))
sact_patient.columns

Index(['merged_patient_id', 'link_number'], dtype='object')

In [18]:
sact_outcome = pd.read_csv(os.path.join(directory, 'sim_sact_outcome.csv'))
sact_outcome.columns

Index(['merged_outcome_id', 'merged_regimen_id', 'date_of_final_treatment',
       'regimen_mod_dose_reduction', 'regimen_mod_time_delay',
       'regimen_mod_stopped_early', 'regimen_outcome_summary',
       'merged_patient_id', 'merged_tumour_id'],
      dtype='object')

In [19]:
sact_drug_detail = pd.read_csv(os.path.join(directory, 'sim_sact_drug_detail.csv'))
sact_drug_detail.columns

Index(['merged_drug_detail_id', 'merged_cycle_id', 'org_code_of_drug_provider',
       'actual_dose_per_administration', 'opcs_delivery_code',
       'administration_route', 'administration_date', 'drug_group',
       'merged_patient_id', 'merged_tumour_id', 'merged_regimen_id'],
      dtype='object')

In [20]:
sact_tumour = pd.read_csv(os.path.join(directory, 'sim_sact_tumour.csv'))
sact_tumour.columns

Index(['merged_tumour_id', 'merged_patient_id', 'consultant_speciality_code',
       'primary_diagnosis', 'morphology_clean'],
      dtype='object')

In [21]:
sact_cycle = pd.read_csv(os.path.join(directory, 'sim_sact_cycle.csv'))
sact_cycle.columns

  sact_cycle = pd.read_csv(os.path.join(directory, 'sim_sact_cycle.csv'))


Index(['merged_cycle_id', 'merged_regimen_id', 'cycle_number',
       'start_date_of_cycle', 'opcs_procurement_code',
       'perf_status_start_of_cycle', 'merged_patient_id', 'merged_tumour_id'],
      dtype='object')

In [None]:
sact_regimen = pd.read_csv(os.path.join(directory, 'sim_sact_regimen.csv'))
sact_regimen.columns