## Redefining Field Variables

In [26]:
import os
import pandas as pd
import warnings 

In [27]:
def read_file_with_encoding(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding)
            return df
        except UnicodeDecodeError:
            continue
    
    raise ValueError(f"Failed to decode {file_path} with available encodings.")

In [30]:
def read_file_utf8(file_path):
    try:
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always", pd.errors.DtypeWarning)
            df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
            
            # Check if any warnings were triggered
            if any(issubclass(warn.category, pd.errors.DtypeWarning) for warn in w):
                print(f"DtypeWarning encountered in file: {file_path}")
            
        return df
    
    except UnicodeDecodeError:
        raise ValueError(f"Failed to decode {file_path} with UTF-8 encoding.")

In [31]:
# Define the directory of files (you should modify it to match your local directory)
# Jane's
# directory = "/Users/janexu/Documents/0. 2024 3rd Year/SCIE30002/simulacrum_release_v1.2.0.2017/data"

# Kathleen's
# ../simulacrum_release_v1.2.0.2017/data"
directory = "/Users/janexu/Documents/0. 2024 3rd Year/SCIE30002/simulacrum_release_v1.2.0.2017/data"
files = os.listdir(directory)

for file_name in files:
    file_path = os.path.join(directory, file_name)
    
    try:
        # Read the file with encoding handling
        df = read_file_with_encoding(file_path)
        
        # Convert column names to lowercase
        df.columns = [col.lower() for col in df.columns]
        
        # Save the updated file
        df.to_csv(file_path, index=False, encoding = 'utf-8')
        
        print(f"Successfully processed and saved: {file_name}")
        
    except Exception as e:
        print(f"Error processing {file_name}: {e}")


Successfully processed and saved: sim_av_patient.csv
Successfully processed and saved: sim_sact_outcome.csv
Successfully processed and saved: sim_sact_drug_detail.csv
Successfully processed and saved: sim_sact_patient.csv


  df = pd.read_csv(file_path, encoding=encoding)


KeyboardInterrupt: 

In [19]:
av_patient = pd.read_csv(os.path.join(directory, 'sim_av_patient.csv'))

new_names = {
    'patientid': 'patient_id',
    'sex': 'sex',
    'linknumber': 'link_number',
    'ethnicity': 'ethnicity',
    'deathcausecode_1a': 'death_cause_code_1a',
    'deathcausecode_1b': 'death_cause_code_1b',
    'deathcausecode_1c': 'death_cause_code_1c',
    'deathcausecode_2': 'death_cause_code_2',
    'deathcausecode_underlying': 'death_cause_code_underlying',
    'deathlocationcode': 'death_location_code',
    'newvitalstatus': 'new_vital_status',
    'vitalstatusdate': 'vital_status_date'
}

av_patient.rename(columns=new_names, inplace=True)
av_patient.to_csv(os.path.join(directory, 'sim_av_patient.csv'), index=False)


In [20]:
av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv'))

rename = {
    'tumourid': 'tumour_id',
    'patientid': 'patient_id',
    'diagnosisdatebest': 'diagnosis_date_best',
    'site_icd10_o2': 'site_icd10_o2',
    'site_icd10_o2_3char': 'site_icd10_o2_3_char',
    'morph_icd10_o2': 'morph_icd10_o2',
    'behaviour_icd10_o2': 'behaviour_icd10_o2',
    't_best': 't_best',
    'n_best': 'n_best',
    'm_best': 'm_best',
    'stage_best': 'stage_best',
    'stage_best_system': 'stage_best_system',
    'grade': 'grade',
    'age': 'age',
    'sex': 'sex',
    'creg_code': 'creg_code',
    'linknumber': 'link_number',
    'screeningstatusfull_code': 'screening_status_full_code',
    'er_status': 'er_status',
    'er_score': 'er_score',
    'pr_status': 'pr_status',
    'pr_score': 'pr_score',
    'her2_status': 'her2_status',
    'cancercareplanintent': 'cancer_care_plan_intent',
    'performancestatus': 'performance_status',
    'cns': 'cns',
    'ace27': 'ace27',
    'gleason_primary': 'gleason_primary',
    'gleason_secondary': 'gleason_secondary',
    'gleason_tertiary': 'gleason_tertiary',
    'gleason_combined': 'gleason_combined',
    'date_first_surgery': 'date_first_surgery',
    'laterality': 'laterality',
    'quintile_2015': 'quintile_2015'
}

av_tumour.rename(columns=rename, inplace=True)
av_tumour.to_csv(os.path.join(directory, 'sim_av_tumour.csv'), index=False)

  av_tumour = pd.read_csv(os.path.join(directory, 'sim_av_tumour.csv'))


In [32]:
sact_patient = pd.read_csv(os.path.join(directory, 'sim_sact_patient.csv'))
sact_patient.columns

Index(['merged_patient_id'], dtype='object')

In [36]:
# Attempt to fix unknown error by dropping a column from the original dataset
# Commented out due to removing duplicate column from file

#sact_patient = sact_patient.drop(columns = ['link_number'])
#sact_patient.to_csv(os.path.join(directory, 'sim_sact_patient.csv'), index=False, encoding = 'utf-8')

KeyError: "['link_number'] not found in axis"

In [37]:
sact_outcome = pd.read_csv(os.path.join(directory, 'sim_sact_outcome.csv'))
sact_outcome.columns
#sact_outcome.to_csv(os.path.join(directory, 'sim_sact_outcome.csv'), index=False, encoding = 'utf-8')

Index(['merged_outcome_id', 'merged_regimen_id', 'date_of_final_treatment',
       'regimen_mod_dose_reduction', 'regimen_mod_time_delay',
       'regimen_mod_stopped_early', 'regimen_outcome_summary',
       'merged_patient_id', 'merged_tumour_id'],
      dtype='object')

In [38]:
sact_drug_detail = pd.read_csv(os.path.join(directory, 'sim_sact_drug_detail.csv'))
sact_drug_detail.columns

Index(['merged_drug_detail_id', 'merged_cycle_id', 'org_code_of_drug_provider',
       'actual_dose_per_administration', 'opcs_delivery_code',
       'administration_route', 'administration_date', 'drug_group',
       'merged_patient_id', 'merged_tumour_id', 'merged_regimen_id'],
      dtype='object')

In [39]:
sact_tumour = pd.read_csv(os.path.join(directory, 'sim_sact_tumour.csv'))
sact_tumour.columns

Index(['merged_tumour_id', 'merged_patient_id', 'consultant_speciality_code',
       'primary_diagnosis', 'morphology_clean'],
      dtype='object')

In [40]:
sact_cycle = pd.read_csv(os.path.join(directory, 'sim_sact_cycle.csv'))
sact_cycle.columns

  sact_cycle = pd.read_csv(os.path.join(directory, 'sim_sact_cycle.csv'))


Index(['merged_cycle_id', 'merged_regimen_id', 'cycle_number',
       'start_date_of_cycle', 'opcs_procurement_code',
       'perf_status_start_of_cycle', 'merged_patient_id', 'merged_tumour_id'],
      dtype='object')

In [41]:
sact_regimen = pd.read_csv(os.path.join(directory, 'sim_sact_regimen.csv'))
sact_regimen.columns

Index(['merged_regimen_id', 'merged_tumour_id', 'height_at_start_of_regimen',
       'weight_at_start_of_regimen', 'intent_of_treatment',
       'date_decision_to_treat', 'start_date_of_regimen', 'mapped_regimen',
       'clinical_trial', 'chemo_radiation', 'merged_patient_id',
       'benchmark_group'],
      dtype='object')

In [46]:
# Extracting subsets from sact_cycle, sact_patient & sact_drug_detail
# sact_cycle has inconsistent 5 field values, sact_patient produces unknown error 
# & sact_drug_detail is just too big (with no other known errors for now)

# Extract 100 records randomly 
subset_sact_cycle = sact_cycle.sample(n=100, random_state=1)
subset_sact_patient = sact_patient.sample(n=100, random_state=1)
subset_sact_drug_detail = sact_drug_detail.sample(n=100, random_state=1)
subset_av_patient = av_patient.sample(n=100, random_state=1)
subset_av_tumour = av_tumour.sample(n=100, random_state=1)

# Save them
subset_sact_cycle.to_csv(os.path.join(directory, 'subset_sact_cycle.csv'), index=False)
subset_sact_patient.to_csv(os.path.join(directory, 'subset_sact_patient.csv'), index=False)
subset_sact_drug_detail.to_csv(os.path.join(directory, 'subset_sact_drug_detail.csv'), index=False)
subset_av_patient.to_csv(os.path.join(directory, 'subset_av_patient.csv'), index=False)
subset_av_tumour.to_csv(os.path.join(directory, 'subset_av_tumour.csv'), index=False)
