## Section 2: Data Extraction

In this section, we will extract the generated data in JSON format and transform it into a Data Frame to be used for further analysis.

In [3]:
# import libraries
import pandas as pd
import json
import os
import warnings
import uuid

# ignore warnings
warnings.filterwarnings("ignore")

## 2.1 Data Extraction

Extracts all the patients data in the 40 JSON files. Then, combine them into a single CSV file for further analysis.

In [12]:
# Define the path to the raw data directory
raw_data_path = "../data/exam"

# Initialize an empty list to store all patient records
all_patients = []

# Check if the directory exists
if os.path.exists(raw_data_path):
    # Loop through all CSV files in the directory
    for filename in os.listdir(raw_data_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(raw_data_path, filename)
            
            try:
                # Read the CSV file with error handling for parsing issues
                data = pd.read_csv(file_path, on_bad_lines='skip')
                
                # Process each patient record in the file
                for index, patient in data.iterrows():
                    # Convert the row to a dictionary
                    patient_dict = patient.to_dict()
                    
                    # Convert the patient_id to 5 digit uuid4 if needed
                    patient_dict['CustomerID'] = str(uuid.uuid4())[:7]
                    
                    # Add to our collection
                    all_patients.append(patient_dict)
                    
            except Exception as e:
                print(f"Error reading CSV file {filename}: {e}")
    
    # Create DataFrame from all patient records
    patient_df = pd.DataFrame(all_patients)
    
    # Display basic information about the DataFrame
    print(f"Total number of patient records: {len(patient_df)}")
    print("\nDataFrame columns:")
    print(patient_df.columns.tolist())
    print("\nDataFrame shape:", patient_df.shape)
    print("\nFirst 5 rows:")
    display(patient_df.head())
    
else:
    print(f"Directory {raw_data_path} does not exist.")

Total number of patient records: 880

DataFrame columns:
['CustomerID', 'Age', 'Gender', 'Location', 'MembershipLevel', 'TotalPurchases', 'TotalSpent', 'FavoriteCategory', 'LastPurchaseDate', 'WebsiteClickRate', 'TimeSpentOnSite', 'SocialMediaEngagement', 'AdClickHistory', 'GeneratedReview', 'CustomerSentimentScore', 'PersonaTag', 'Churn']

DataFrame shape: (880, 17)

First 5 rows:


Unnamed: 0,CustomerID,Age,Gender,Location,MembershipLevel,TotalPurchases,TotalSpent,FavoriteCategory,LastPurchaseDate,WebsiteClickRate,TimeSpentOnSite,SocialMediaEngagement,AdClickHistory,GeneratedReview,CustomerSentimentScore,PersonaTag,Churn
0,4efed90,Female,Denver,CO,Silver,12,753.6,Clothing,2023-10-20,0.065,15.2,Medium,Clicked,"Great experience, love this store!",0.91,Regular Buyer,0
1,d7f26e8,Male,Los Angeles,CA,Gold,28,2155.4,Electronics,2023-10-25,0.092,22.5,High,Sometimes,Very happy with my purchases.,0.95,Loyal,0
2,6b4a427,Other,Chicago,IL,Platinum,41,4510.1,Home Goods,2023-09-18,0.115,28.1,High,Clicked,Excellent service and products.,0.98,Loyal,0
3,68eec52,Male,Houston,TX,Bronze,2,85.7,Books,2023-08-01,0.041,8.9,Low,Not Clicked,"Had some issues, not fully satisfied.",0.52,Window Shopper,1
4,3a2af82,Female,Phoenix,AZ,Silver,18,1220.5,Beauty,2023-10-10,0.078,18.7,Medium,Clicked,Will definitely buy again.,0.88,Engaged,0


In [13]:
# save the output to a CSV file in /data
patient_df.to_csv("../data/exam/processed/merged_data.csv", index=False)