## Section 2: Data Extraction

In this section, we load the generated patient records from the raw JSON files, flatten the nested questionnaire responses, and combine everything into a single pandas DataFrame for further analysis.

In [5]:
# import libraries
import pandas as pd
import json
import os
import warnings
import uuid

# ignore warnings
warnings.filterwarnings("ignore")

In [6]:
# Define the path to the raw data directory
raw_data_path = os.path.join("data", "raw")


def new_patient_id(used_ids, length=5):
    """Return a random hex ID of `length` characters that is not yet in `used_ids`.

    The ID is the leading `length` hex digits of a fresh uuid4. With the
    default length of 5 there are only 16**5 = 1,048,576 possible values, so
    purely random draws have a roughly 50% chance of producing at least one
    duplicate at ~1,200 records. Already-issued IDs are therefore tracked in
    `used_ids` and a new value is drawn whenever a collision occurs.

    Parameters
    ----------
    used_ids : set of str
        IDs issued so far; the returned ID is added to this set in place.
    length : int, default 5
        Number of hex characters in the ID.

    Raises
    ------
    RuntimeError
        If every possible ID of this length has already been issued.
    """
    if len(used_ids) >= 16 ** length:
        raise RuntimeError(f"No unused {length}-character patient IDs left")
    while True:
        candidate = uuid.uuid4().hex[:length]
        if candidate not in used_ids:
            used_ids.add(candidate)
            return candidate


def flatten_patient(patient, used_ids):
    """Return a flat copy of one patient record with a fresh unique `patient_id`.

    The keys of the nested `questionnaire_responses` dict (if present) are
    lifted to the top level and the nested entry itself is removed. The input
    dict is not modified.
    """
    flat = dict(patient)
    flat['patient_id'] = new_patient_id(used_ids)

    responses = flat.get('questionnaire_responses')
    if isinstance(responses, dict):
        # Remove the original nested structure, then append its items
        del flat['questionnaire_responses']
        flat.update(responses)
    return flat


def load_patient_records(data_dir):
    """Read every ``*.json`` file in `data_dir` and return flattened patient dicts.

    Files are processed in sorted name order so the resulting row order is the
    same on every run (``os.listdir`` returns entries in arbitrary order).
    Files that cannot be decoded are reported and skipped. A file may contain
    either a list of patient objects or a single patient object.
    """
    records = []
    used_ids = set()

    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(data_dir, filename)

        # Read the JSON file; explicit UTF-8 avoids platform-dependent decoding
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error decoding JSON from file: {filename}")
            continue

        # Tolerate a file holding one patient object instead of a list
        if isinstance(data, dict):
            data = [data]

        for patient in data:
            records.append(flatten_patient(patient, used_ids))

    return records


# Initialize an empty list to store all patient records
all_patients = []

# Check if the directory exists
if os.path.exists(raw_data_path):
    all_patients = load_patient_records(raw_data_path)

    # Create DataFrame from all patient records
    patient_df = pd.DataFrame(all_patients)

    # Display basic information about the DataFrame
    print(f"Total number of patient records: {len(patient_df)}")
    print("\nDataFrame columns:")
    print(patient_df.columns.tolist())
    print("\nDataFrame shape:", patient_df.shape)
    print("\nFirst 5 rows:")
    display(patient_df.head())

else:
    print(f"Directory {raw_data_path} does not exist.")

Total number of patient records: 1199

DataFrame columns:
['patient_id', 'age', 'gender', 'medical_history', 'deterioration_label', 'timestamp', 'hear_rate', 'blood_pressure_sys', 'blood_pressure_dia', 'oxygen_saturation', 'temperature', 'respiratory_rate', 'describe_fatigue_level', 'describe_lifestyle', 'describe_mental_health']

DataFrame shape: (1199, 15)

First 5 rows:


Unnamed: 0,patient_id,age,gender,medical_history,deterioration_label,timestamp,hear_rate,blood_pressure_sys,blood_pressure_dia,oxygen_saturation,temperature,respiratory_rate,describe_fatigue_level,describe_lifestyle,describe_mental_health
0,9b04b,65,Male,History of hypertension and type 2 diabetes.,True,2023-10-27T10:00:00Z,95.5,160.2,98.7,90.3,38.5,22.1,"Severe fatigue, difficulty getting out of bed.","Sedentary, poor diet.",Feeling anxious and low.
1,bffd5,45,Female,No significant medical history.,False,2023-10-27T10:05:00Z,70.2,120.5,75.0,98.5,36.8,16.0,Mild fatigue occasionally.,"Active, balanced diet.",Generally good.
2,fb35e,78,Male,"Chronic obstructive pulmonary disease (COPD), ...",True,2023-10-27T10:10:00Z,105.0,150.0,90.0,88.0,37.9,25.5,Constant exhaustion.,Very limited activity due to breathlessness.,Feeling down and worried.
3,1e30e,30,Female,Mild asthma.,False,2023-10-27T10:15:00Z,65.0,110.0,70.0,99.0,36.5,14.0,Not fatigued.,"Very active, regular exercise.",Good.
4,116a4,55,Male,High cholesterol.,False,2023-10-27T10:20:00Z,75.5,135.0,85.0,97.0,37.0,17.0,Moderate fatigue after work.,Moderately active.,Stable.
