In [1]:
import os

def rename_files(source_folder):
    """Rename ``<prefix>_icd<code>[_<suffix>].<ext>`` files to ``icd<code>.xls``.

    Every regular file directly inside ``source_folder`` whose stem (the name
    without its final extension) contains ``_icd`` is renamed to the first
    underscore-separated segment that starts with ``icd``, followed by
    ``.xls``.  For example ``7449737_icd104a_bw.xls`` becomes ``icd104a.xls``.

    Entries are processed in sorted order.  A rename is skipped (and reported)
    when the target name already exists, so two sources that map to the same
    name never overwrite each other.  Directories are never renamed.

    Args:
        source_folder: Directory whose entries are scanned (not recursive).

    Raises:
        OSError: Propagated from ``os.listdir`` or ``os.rename``.
    """
    for file_name in sorted(os.listdir(source_folder)):
        old_path = os.path.join(source_folder, file_name)
        if not os.path.isfile(old_path):
            continue

        # Work on the stem so the extension never leaks into the new name
        # (e.g. '1_icd106a.xls' must not become 'icd106a.xls.xls').
        stem = os.path.splitext(file_name)[0]
        # Skip the first segment: the pattern of interest is '_icd', i.e. an
        # 'icd...' segment that is preceded by an underscore.
        icd_segments = [part for part in stem.split('_')[1:] if part.startswith('icd')]
        if not icd_segments:
            continue

        new_name = icd_segments[0] + '.xls'  # Assuming they are .xls files
        new_path = os.path.join(source_folder, new_name)

        # os.rename silently replaces an existing file on POSIX; refuse instead.
        if os.path.exists(new_path):
            print(f'Skipped: {file_name} -> {new_name} already exists')
            continue

        print(f'Old Name: {file_name} -> New Name: {new_name}')
        os.rename(old_path, new_path)

# Example usage
# Renames the matching files inside this folder on disk; the printed
# "Old Name -> New Name" lines are the only record of the original names.
source_folder = '/mnt/d/pydatascience/g3_regress/data/icd10'
rename_files(source_folder)


Old Name: 7449737_icd104a_bw.xls -> New Name: icd104a.xls
Old Name: 7449738_icd104b_bw.xls -> New Name: icd104b.xls
Old Name: 7449739_icd106a_bw.xls -> New Name: icd106a.xls
Old Name: 7449740_icd106b_bw.xls -> New Name: icd106b.xls


In [2]:

import pandas as pd
import os
import glob
import lxml

def html_to_csv(file_path, output_folder):
    """Write the first HTML table found in ``file_path`` out as a CSV file.

    The CSV lands in ``output_folder`` and reuses the input's base name with
    the extension swapped for ``.csv``.  The outcome is reported with
    ``print``; any exception raised while reading or writing is caught and
    printed rather than propagated, and nothing is returned.

    Args:
        file_path: Path of the file handed to ``pd.read_html``.
        output_folder: Directory the CSV file is written into.
    """
    try:
        # read_html returns one DataFrame per table it finds.
        tables = pd.read_html(file_path)
        if not tables:
            print(f'No tables found in {file_path}')
            return

        stem, _extension = os.path.splitext(os.path.basename(file_path))
        csv_file_path = os.path.join(output_folder, stem + '.csv')

        # Only the first table is exported.
        first_table = tables[0]
        first_table.to_csv(csv_file_path, index=False)
        print(f'Successfully converted {file_path} to {csv_file_path}')
    except Exception as error:
        print(f'Failed to convert {file_path}. Error: {error}')

def convert_folder_html_to_csv(source_folder, output_folder):
    """Run ``html_to_csv`` on every ``*.xls`` file directly in ``source_folder``.

    ``output_folder`` is created first if it does not exist yet.

    Args:
        source_folder: Directory searched (non-recursively) for ``*.xls`` files.
        output_folder: Directory passed on to ``html_to_csv`` for the CSV output.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The files carry an .xls extension but are read as HTML by html_to_csv.
    xls_pattern = os.path.join(source_folder, '*.xls')
    for candidate in glob.glob(xls_pattern):
        html_to_csv(candidate, output_folder)

# Example usage
# Source and destination are the same folder, so each .csv is written next to
# the .xls file it was converted from.
source_folder = '/mnt/d/pydatascience/g3_regress/data/icd10'
destination_folder = '/mnt/d/pydatascience/g3_regress/data/icd10'
convert_folder_html_to_csv(source_folder, destination_folder)


Successfully converted /mnt/d/pydatascience/g3_regress/data/icd10/icd104a.xls to /mnt/d/pydatascience/g3_regress/data/icd10/icd104a.csv
Successfully converted /mnt/d/pydatascience/g3_regress/data/icd10/icd104b.xls to /mnt/d/pydatascience/g3_regress/data/icd10/icd104b.csv
Successfully converted /mnt/d/pydatascience/g3_regress/data/icd10/icd106a.xls to /mnt/d/pydatascience/g3_regress/data/icd10/icd106a.csv
Successfully converted /mnt/d/pydatascience/g3_regress/data/icd10/icd106b.xls to /mnt/d/pydatascience/g3_regress/data/icd10/icd106b.csv


In [21]:
# Load one of the a1c CSV files to inspect its column names.
# header=1 takes the column names from the second line of the file.
# NOTE(review): this cell's execution count (21) is out of sequence with the
# cells around it, and it relies on `pd` imported by an earlier cell --
# confirm the notebook still works under Restart & Run All.
df = pd.read_csv('/mnt/d/pydatascience/g3_regress/data/a1c/a1c2015q12.csv', header=1)
# Bare last expression: displays the column Index as the cell output.
df.columns

Index(['Reference Key', 'Date of Birth (yyyy-mm-dd)', 'Exact date of birth',
       'Sex', 'LIS Reference Datetime',
       'LIS Result (28 days) - LIS Reference Date',
       'LIS Performing Lab Hospital', 'LIS Request No.', 'LIS Case No.',
       'LIS Result (28 days) - LIS Test Description',
       'LIS Result (28 days) - LIS Test Description (ePR)',
       'LIS Result (28 days) - LIS Result',
       'LIS Result (28 days) - LIS Specimen',
       'LIS Result (28 days) - LIS Specimen (ePR)',
       'LIS Result (28 days) - LIS Result: Numeric Result',
       'LIS Result (28 days) - LIS Test Unit',
       'LIS Result (28 days) - LIS Result Flagging',
       'LIS Result (28 days) - LIS Lower Reference Range',
       'LIS Result (28 days) - LIS Upper Reference Range'],
      dtype='object')

In [3]:
def clean_csv(file_path, output_path=None, header=1):
    """Drop a fixed set of unwanted columns from a CSV file and save the result.

    Column names are compared after stripping surrounding whitespace, so a
    stray leading/trailing space in either the file's header or the drop list
    cannot silently prevent a column from being removed.  Columns from the
    drop list that are absent from the file are ignored.

    Args:
        file_path: CSV file to read.
        output_path: Where to write the cleaned CSV.  ``None`` (the default)
            overwrites ``file_path`` in place.
        header: Zero-based row index holding the column names, passed to
            ``pd.read_csv``.  The default ``1`` skips one unwanted first row.

    Warning:
        The cleaned file is written with its column names on the first row.
        Calling this function again on an already-cleaned file with the
        default ``header=1`` would therefore treat the first data row as the
        header.  Pass ``header=0`` for such files, or use ``output_path`` to
        keep the raw file untouched.
    """
    # Columns to remove.  The following are deliberately kept for now:
    # 'LIS Request No.', the two 'LIS Test Description' columns, 'LIS Result',
    # the two 'LIS Specimen' columns and 'LIS Test Unit'.
    columns_to_drop = {
        'Exact date of birth',
        'LIS Result (28 days) - LIS Reference Date',
        'LIS Performing Lab Hospital',
        'LIS Result (28 days) - LIS Result Flagging',
        'LIS Result (28 days) - LIS Lower Reference Range',
        'LIS Result (28 days) - LIS Upper Reference Range',
    }

    df = pd.read_csv(file_path, header=header)

    # Match on the stripped name but drop by the exact label found in the file.
    matched = [col for col in df.columns if str(col).strip() in columns_to_drop]
    cleaned = df.drop(columns=matched)

    destination = file_path if output_path is None else output_path
    cleaned.to_csv(destination, index=False)
    print(f'File cleaned and saved to {destination}')

def batch_clean_csv(source_folder):
    """Run ``clean_csv`` on every ``*.csv`` file directly in ``source_folder``.

    Files are processed in sorted path order so repeated runs produce the same
    sequence of log lines.

    Args:
        source_folder: Directory searched (non-recursively) for ``*.csv`` files.
            It is created if it does not exist, in which case nothing is
            cleaned.
    """
    # Kept for backward compatibility: this creates the *source* folder when
    # it is missing, so a mistyped path yields an empty folder, not an error.
    os.makedirs(source_folder, exist_ok=True)

    csv_files = sorted(glob.glob(os.path.join(source_folder, '*.csv')))
    for file_path in csv_files:
        clean_csv(file_path)

# Example usage
# Cleans every CSV in the upacr folder; clean_csv writes each result back to
# the path it was read from, so the files are overwritten.
source_folder = '/mnt/d/pydatascience/g3_regress/data/upacr'
# NOTE(review): output_folder is a placeholder path and is not passed to the
# call below.
output_folder = '/path/to/save/cleaned/csv/files'
batch_clean_csv(source_folder)


File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2023.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2010.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2011.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2012.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2013.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2014.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2015.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2016.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2017.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2018.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2019.csv
File cleaned and saved to /mnt/d/pydatascience/g3_regress/data/upacr/Upacr2020.csv
File