In [None]:
! pip install openpyxl
! pip install --upgrade xlrd
! pip install msoffcrypto-tool

In [None]:
# Code part 1
import io
import os

import msoffcrypto
import pandas as pd

# Encrypted workbook holding the patient list
file_path = "2024-Data/PEFR_asthma_114_medinfo_07.15.xlsx"

# Workbook password. Prefer the PEFR_XLSX_PASSWORD environment variable so the
# secret does not have to live in the notebook; the literal fallback keeps the
# notebook runnable exactly as before when the variable is not set.
# NOTE(review): drop the fallback once the variable is configured everywhere.
password = os.environ.get("PEFR_XLSX_PASSWORD", "1234")

# In-memory buffer that receives the decrypted workbook bytes
decrypted_workbook = io.BytesIO()

# Open the encrypted file and decrypt it into the buffer
with open(file_path, "rb") as file:
    office_file = msoffcrypto.OfficeFile(file)
    office_file.load_key(password=password)
    office_file.decrypt(decrypted_workbook)

# Identifier columns that are removed right after loading
columns_to_drop = ["BCODE", "UID1", "UID2"]

# Read the decrypted workbook and drop the identifier columns in one step
# (no in-place mutation, so re-running the cell always starts from a fresh read)
df_patient_list = pd.read_excel(decrypted_workbook, engine="openpyxl").drop(
    columns=columns_to_drop
)

# Show the remaining column names and the dataframe itself.
# To inspect every row/column without truncation, wrap the display call in
# `with pd.option_context("display.max_rows", None, "display.max_columns", None):`
print(df_patient_list.columns)
display(df_patient_list)

In [None]:
# Code part 2
import os
import pandas as pd


def process_file(file_path):
    """Load one workbook's "Database" sheet (columns B, G, H, I) into a DataFrame.

    The sheet is read twice: a three-row probe decides which row holds the
    column headers, then the full sheet is read with that header row.

    Parameters
    ----------
    file_path : path or file-like
        Passed straight to ``pd.read_excel`` as ``io``.

    Returns
    -------
    pandas.DataFrame
        The sheet contents with columns that are entirely empty removed.

    Side effects
    ------------
    Prints which (1-based) row was used as header and displays the dataframe.
    """
    # Options shared by the probe read and the full read
    shared_options = {
        "sheet_name": "Database",
        "usecols": "B,G,H,I",
        "engine": "openpyxl",
    }

    # Probe: the first three rows, read without any header
    preview = pd.read_excel(io=file_path, nrows=3, header=None, **shared_options)

    # Third probed row (position 2), first selected column (sheet column B):
    # if it parses as a date, the data already starts there, so the header is
    # the row above it (0-based row 1); otherwise the header is 0-based row 2.
    probed_value = pd.to_datetime(preview.iloc[2, 0], errors="coerce")
    header_row = 2 if probed_value is pd.NaT else 1

    # Full read using the header row chosen above
    df = pd.read_excel(
        io=file_path,
        header=header_row,
        names=None,
        keep_default_na=True,
        parse_dates=True,
        **shared_options,
    )

    # Discard columns that contain no values at all
    df = df.dropna(how="all", axis=1)

    # Report the header choice and show the loaded table
    print(f"Using row {header_row + 1} as header")
    display(df)

    return df


def code_part_2(folder_path="2024-Data/SCH_asthma_114"):
    """Load every patient workbook in a folder into a dict keyed by patient ID.

    Parameters
    ----------
    folder_path : str, optional
        Directory that is scanned (non-recursively) for ``.xlsx`` files.
        Defaults to the folder this notebook has always used.

    Returns
    -------
    dict
        Maps patient ID (first whitespace-separated token of the file name,
        without the extension) to the DataFrame returned by ``process_file``.
        If two files yield the same ID, the later one (in sorted order) wins.
    """
    patient_data = {}

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the dict order) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        # Only Excel workbooks; "~$..." are the lock files Excel leaves next to
        # an open workbook and cannot be parsed as workbooks.
        if not filename.lower().endswith(".xlsx") or filename.startswith("~$"):
            continue

        # Strip the extension first so a name without spaces ("P001.xlsx")
        # yields "P001" rather than "P001.xlsx".
        name_tokens = os.path.splitext(filename)[0].split()
        if not name_tokens:
            # Nothing usable as an ID (name is only whitespace)
            continue
        patient_id = name_tokens[0]

        # Process the file and store the dataframe in the dictionary
        file_path = os.path.join(folder_path, filename)
        patient_data[patient_id] = process_file(file_path)

    # Print one separator-delimited line per loaded patient
    for patient_id in patient_data:
        print(f"Patient ID: {patient_id}")
        print("\n" + "=" * 50 + "\n")

    return patient_data


# Run the loader when this cell/script is executed directly and keep the
# returned mapping in `patient_data_dict`.
if __name__ == "__main__":
    patient_data_dict = code_part_2()

In [None]:
# Code part 3
import pandas as pd
import numpy as np


def _find_column(columns, aliases, exclude=False):
    """Return the first column whose normalized name is in ``aliases``.

    Names are compared after ``str(...).strip().lower()`` so non-string labels
    and padded/upper-case headers are handled. With ``exclude=True`` the first
    column whose name is NOT in ``aliases`` is returned instead. Returns None
    when no column qualifies.
    """
    for col in columns:
        is_alias = str(col).strip().lower() in aliases
        if is_alias != exclude:
            return col
    return None


def merge_dataframes(df_part1, patient_data_dict):
    """Attach each patient's measurement rows to that patient's info row(s).

    Parameters
    ----------
    df_part1 : pandas.DataFrame
        Patient list; must contain an ``"ID"`` column.
    patient_data_dict : dict
        Maps patient ID to that patient's measurement DataFrame.

    Returns
    -------
    pandas.DataFrame
        For every patient whose ID occurs in ``df_part1``: the cross join of the
        matching info row(s) with the patient's measurements, concatenated and
        sorted by ``ID`` and (when present) ``Date``. Measurement columns are
        renamed to ``Date`` / ``Morning_PEFR`` / ``Afternoon_PEFR`` /
        ``Other_PEFR`` and the PEFR columns are coerced to numeric (unparsable
        values become NaN). If no patient matches, an empty frame with the
        ``df_part1`` columns plus the standardized columns is returned.
    """
    date_aliases = ("date", "a04")
    morning_aliases = ("morning", "c01")
    afternoon_aliases = ("afternoon", "c02", "evening")
    known_aliases = date_aliases + morning_aliases + afternoon_aliases
    pefr_columns = ["Morning_PEFR", "Afternoon_PEFR", "Other_PEFR"]

    # IDs in the dict may be strings (e.g. parsed from file names) while the
    # ID column is numeric; compare both natively and as text so they match.
    ids = df_part1["ID"]
    ids_as_text = ids.astype(str)

    merged_dfs = []
    for patient_id, patient_df in patient_data_dict.items():
        id_mask = (ids == patient_id) | (ids_as_text == str(patient_id))
        patient_info = df_part1[id_mask]
        if patient_info.empty:
            # No entry for this patient in the patient list
            continue

        # Map whichever header spelling this file uses to the standard name.
        # "Other" is the first column that is none of the known spellings.
        columns = patient_df.columns
        candidates = (
            ("Date", _find_column(columns, date_aliases)),
            ("Morning_PEFR", _find_column(columns, morning_aliases)),
            ("Afternoon_PEFR", _find_column(columns, afternoon_aliases)),
            ("Other_PEFR", _find_column(columns, known_aliases, exclude=True)),
        )
        column_mapping = {
            source: target for target, source in candidates if source is not None
        }

        # rename() returns a new frame, so the caller's dataframe is not modified
        patient_df = patient_df.rename(columns=column_mapping)

        # Convert measurement columns to numeric, coercing errors to NaN
        for col in pefr_columns:
            if col in patient_df.columns:
                patient_df[col] = pd.to_numeric(patient_df[col], errors="coerce")

        # Repeat the patient info on every measurement row
        merged_dfs.append(pd.merge(patient_info, patient_df, how="cross"))

    if not merged_dfs:
        # pd.concat raises on an empty list; return an empty, well-shaped frame
        extra_columns = [
            col for col in ["Date", *pefr_columns] if col not in df_part1.columns
        ]
        return df_part1.iloc[0:0].reindex(
            columns=[*df_part1.columns, *extra_columns]
        )

    result_df = pd.concat(merged_dfs, ignore_index=True)

    # "Date" is absent when no file had a recognizable date column
    sort_keys = [key for key in ("ID", "Date") if key in result_df.columns]
    return result_df.sort_values(sort_keys)


# Combine the patient list from part 1 with the per-patient tables from part 2
# (both must already exist in the kernel)
result_df = merge_dataframes(
    df_part1=df_patient_list,
    patient_data_dict=patient_data_dict,
)

# Show the merged column names and table, then write the table to CSV
print(result_df.columns)
display(result_df)
result_df.to_csv("result_df.csv", sep=",", index=False)