In [5]:
import pandas as pd
import numpy as np

from collections import defaultdict
import os
from tqdm import tqdm

from dotenv import dotenv_values

import warnings
warnings.filterwarnings("ignore")

In [6]:
def _parse_age(raw_age):
    """Turn the free-text age cell into a number; NaN when nothing usable is found."""
    if pd.isna(raw_age):
        return np.nan

    if not isinstance(raw_age, str):
        # A non-text cell (e.g. a bare number) has no unit suffix to inspect.
        # Pass it through unchanged rather than crashing on `.endswith`
        # -- presumably already expressed in years; TODO confirm.
        return raw_age

    # Normalise so trailing spaces or lower-case text do not push a
    # "YEARS OLD" value into the scaled (non-year) branch below.
    text = raw_age.strip().upper()

    # `isdecimal` keeps only characters that `int()` is guaranteed to accept.
    digits = "".join(char for char in text if char.isdecimal())
    if not digits:
        return np.nan

    value = int(digits)
    if text.endswith(("YEARS OLD", "YEAR OLD")):
        return value

    # NOTE(review): every non-year age is divided by 10 (e.g. "6 MONTHS OLD"
    # becomes 0.6). Presumably a decimal encoding of months; it is not a
    # conversion to years (months / 12). Behaviour kept as-is -- confirm intent.
    return value / 10


def data_retrieval(path):
    """
        extracts the patient and urinalysis values from one excel workbook
        and appends them as a single new row to the global DataFrame `df`.

        Parameters
        ------------
        path: str
            the path to a single excel file that has a sheet named "Urinalysis"

        Returns
        ------------
        None
            the result is stored by rebinding the module-level `df`, which
            must already exist when this function is called.

    """

    global df

    sheet = pd.read_excel(path, sheet_name="Urinalysis")

    # Fixed cell positions (0-based, relative to the parsed sheet) of the
    # patient details.
    record = {
        "Age": _parse_age(sheet.iloc[7, 2]),
        "Gender": sheet.iloc[7, 4],
    }

    # (row position, output column name) of every test result; all results
    # are read from the same sheet column.
    rows_and_feature_names = (
        (17, "Color"),
        (18, "Transparency"),
        (21, "Glucose"),
        (22, "Protein"),
        (23, "pH"),
        (24, "Specific Gravity"),
        (27, "WBC"),
        (28, "RBC"),
        (29, "Epithelial Cells"),
        (30, "Mucous Threads"),
        (31, "Amorphous Urates"),
        (32, "Bacteria"),
    )
    value_column = 5

    for row, feature_name in rows_and_feature_names:
        # Purely positional lookup: `.loc[row][5]` depended on the deprecated
        # integer-key fallback of a label-indexed Series.
        record[feature_name] = sheet.iloc[row, value_column]

    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # supported replacement and keeps the same "append one row" behaviour.
    df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)

In [8]:
# Root folder of the dataset, read from .env.local so that no
# machine-specific path is hard-coded in the notebook.
DIRECTORY_PATH = dotenv_values(".env.local")["DATASET_PATH"]
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of `df` reproducible across runs and machines.
DIRECTORY_FOLDERS = sorted(os.listdir(DIRECTORY_PATH))

# Start from an empty frame on every run so re-executing this cell does not
# duplicate rows; data_retrieval() appends one row per workbook to it.
df = pd.DataFrame(columns = ["Age", "Gender", 
                             "Color", "Transparency",
                             "Glucose", "Protein", "pH", "Specific Gravity", 
                             "WBC", "RBC", "Epithelial Cells", "Mucous Threads", "Amorphous Urates", "Bacteria"])

for SUBFOLDER_NAME in tqdm(DIRECTORY_FOLDERS, position=0, leave=True):
    SUBFOLDER_PATH = os.path.join(DIRECTORY_PATH, SUBFOLDER_NAME)

    # Stray files next to the sub-folders would make os.listdir raise.
    if not os.path.isdir(SUBFOLDER_PATH):
        continue

    EXCEL_FILES = sorted(os.listdir(SUBFOLDER_PATH))

    for EXCEL_NAME in EXCEL_FILES:
        # "~$name.xlsx" is the lock file Excel creates while a workbook is
        # open; it is not a readable workbook.
        if EXCEL_NAME.startswith("~$"):
            continue

        # os.path.join instead of a hard-coded "\" so the path is valid on
        # every operating system, not only Windows.
        PATH = os.path.join(SUBFOLDER_PATH, EXCEL_NAME)
        data_retrieval(path=PATH)

100%|██████████| 7/7 [02:08<00:00, 18.33s/it]


In [9]:
# Write the collected rows to an Excel workbook; the row index is left out
# of the file.
FILE_NAME = "UTI DATASET (UNLABELED).xlsx"
df.to_excel(excel_writer=FILE_NAME, index=False)

In [10]:
# Show the collected records as a visual sanity check of the extraction.
display(df)

Unnamed: 0,Age,Gender,Color,Transparency,Glucose,Protein,pH,Specific Gravity,WBC,RBC,Epithelial Cells,Mucous Threads,Amorphous Urates,Bacteria
0,76,FEMALE,LIGHT YELLOW,CLEAR,NEGATIVE,NEGATIVE,5,1.010,1-3,0-2,OCCASIONAL,RARE,NONE SEEN,OCCASIONAL
1,9,MALE,DARK YELLOW,SLIGHTLY HAZY,NEGATIVE,1+,5,1.030,1-3,0-2,RARE,FEW,FEW,MODERATE
2,12,MALE,LIGHT YELLOW,SLIGHTLY HAZY,NEGATIVE,TRACE,5,1.030,0-3,0-2,RARE,FEW,MODERATE,RARE
3,77,MALE,BROWN,CLOUDY,NEGATIVE,1+,6,1.020,5-8,LOADED,RARE,RARE,NONE SEEN,FEW
4,29,FEMALE,YELLOW,HAZY,NEGATIVE,TRACE,6,1.025,1-4,0-2,RARE,RARE,NONE SEEN,FEW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1432,0.6,MALE,YELLOW,CLEAR,NEGATIVE,NEGATIVE,6,1.015,1-2,0-1,RARE,FEW,NONE SEEN,RARE
1433,42,MALE,YELLOW,CLEAR,NEGATIVE,NEGATIVE,6.5,1.010,0-2,0-2,RARE,NONE SEEN,NONE SEEN,RARE
1434,47,FEMALE,DARK YELLOW,CLEAR,NEGATIVE,TRACE,6,1.030,2-4,0-2,MODERATE,MODERATE,NONE SEEN,RARE
1435,57,FEMALE,DARK YELLOW,CLEAR,NEGATIVE,TRACE,5,1.030,0-2,0-2,PLENTY,PLENTY,NONE SEEN,FEW
