# Initial Data Exploration
This notebook performs initial exploration of the cleaned patient survey data.
Steps include: loading the data, viewing the first few rows, checking dimensions, listing columns, checking for missing values, and examining data types.

In [308]:
# Import pandas library
import pandas as pd

In [309]:
# Load the dataset
# Adjust the path if your notebook is in a different location relative to the data folder
file_path = '../data/cleaned/cleaned_patient_survey_data.csv'
try:
    df = pd.read_csv(file_path)
    print(f"Successfully loaded {file_path}")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
    # You might want to stop execution here or handle the error appropriately
    df = None # Set df to None if file not found

# Display the first 5 rows (only if df was loaded successfully)
if df is not None:
    print("\nFirst 5 rows of the dataset:")
    display(df.head())

Successfully loaded ../data/cleaned/cleaned_patient_survey_data.csv

First 5 rows of the dataset:


Unnamed: 0,governorate,district,sub_district,community,grant,health_facility,other_health_facility,observer_name,patient_age_group,respondent_sex,...,has_other_feedback,other_feedback_text,consent_date,consent_year,consent_quarter,consent_month,consent_day,consent_weekday,consent_month_name,consent_weekday_name
0,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Male,female,...,no,,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday
1,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Female,female,...,no,,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday
2,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Male,female,...,yes,ارجو تصليح جهاز الاشعة,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday
3,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Male,female,...,no,,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday
4,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Female,female,...,no,,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday


In [310]:
# Show the number of rows and columns (only if df was loaded successfully)
if df is not None:
    print(f"\nDataset shape (rows, columns): {df.shape}")


Dataset shape (rows, columns): (736, 106)


In [311]:
# List column names (only if df was loaded successfully)
if df is not None:
    print("\nColumn names:")
    print(list(df.columns))


Column names:
['governorate', 'district', 'sub_district', 'community', 'grant', 'health_facility', 'other_health_facility', 'observer_name', 'patient_age_group', 'respondent_sex', 'respondent_age_years', 'patient_age_years', 'is_first_visit', 'marital_status', 'num_children', 'oldest_child_age', 'difficulty_seeing', 'difficulty_hearing', 'difficulty_walking', 'difficulty_communicating', 'difficulty_remembering', 'facility_access_easy', 'facility_access_easy_reason', 'facility_access_not_easy_reason', 'facility_access_other_reason', 'service_discovery_method', 'service_discovery_other', 'facility_selection_reason', 'facility_selection_other', 'overall_service_quality', 'service_quality_bad_reason', 'services_received_list', 'service_general_clinic', 'service_gynecology', 'service_pediatric', 'service_nutrition', 'service_other_flag', 'Which service/s did you receive today?/5', 'birth_services', 'specify_others', 'was_charged_for_service', 'charged_service_list', 'charged_service_genera

In [312]:
# Check for missing values (only if df was loaded successfully)
if df is not None:
    print("\nMissing values per column:")
    print(df.isnull().sum())


Missing values per column:
governorate               1
district                  1
sub_district              1
community                 1
grant                   489
                       ... 
consent_month             1
consent_day               1
consent_weekday           1
consent_month_name        1
consent_weekday_name      1
Length: 106, dtype: int64


In [313]:
# Show data types of each column (only if df was loaded successfully)
if df is not None:
    print("\nData types of each column:")
    print(df.dtypes)


Data types of each column:
governorate              object
district                 object
sub_district             object
community                object
grant                    object
                         ...   
consent_month           float64
consent_day             float64
consent_weekday         float64
consent_month_name       object
consent_weekday_name     object
Length: 106, dtype: object


## Process Binary Columns (0/1 to No/Yes)

In [314]:
# Identify potential binary columns (containing only 0 and 1)
# Note: This assumes binary columns strictly contain 0 and 1. Adjust if needed.
if df is not None:
    binary_columns = []
    for col in df.columns:
        unique_values = df[col].dropna().unique()
        if set(unique_values).issubset({0, 1}):
             # Check if the column actually contains both 0 and 1 or just one of them
             if len(unique_values) > 0: # Ensure column is not entirely empty
                 binary_columns.append(col)

    print(f"Identified binary columns (containing only 0s and 1s): {binary_columns}")

Identified binary columns (containing only 0s and 1s): ['service_general_clinic', 'service_gynecology', 'service_pediatric', 'service_nutrition', 'service_other_flag', 'Which service/s did you receive today?/5', 'charged_service_general_clinic', 'charged_service_gynecology', 'charged_service_pediatric', 'charged_service_nutrition', 'charged_service_other', 'channel_complaint_box', 'channel_info_desk', 'channel_ngo_staff', 'channel_phone_line', 'channel_whatsapp_viber', 'channel_social_media', 'channel_local_council', 'channel_other_flag']


In [315]:
# Define the mapping function
def map_binary_to_yes_no(value):
    if pd.isna(value):
        return value # Keep NaN values as they are
    elif value == 1:
        return 'Yes'
    elif value == 0:
        return 'No'
    else:
        return value # Return original value if it's not 0 or 1

In [316]:
# Apply the function to all identified binary columns
if df is not None and 'binary_columns' in locals():
    for col in binary_columns:
        # Ensure the column exists before applying
        if col in df.columns:
            df[col] = df[col].apply(map_binary_to_yes_no)
            print(f"Applied mapping to column: {col}")
        else:
            print(f"Warning: Column {col} not found in DataFrame during mapping.")

    # Verify the change for one column (optional)
    if binary_columns: # Check if list is not empty
        print(f"\nExample: First 5 values of '{binary_columns[0]}':")
        display(df[binary_columns[0]].head())

Applied mapping to column: service_general_clinic
Applied mapping to column: service_gynecology
Applied mapping to column: service_pediatric
Applied mapping to column: service_nutrition
Applied mapping to column: service_other_flag
Applied mapping to column: Which service/s did you receive today?/5
Applied mapping to column: charged_service_general_clinic
Applied mapping to column: charged_service_gynecology
Applied mapping to column: charged_service_pediatric
Applied mapping to column: charged_service_nutrition
Applied mapping to column: charged_service_other
Applied mapping to column: channel_complaint_box
Applied mapping to column: channel_info_desk
Applied mapping to column: channel_ngo_staff
Applied mapping to column: channel_phone_line
Applied mapping to column: channel_whatsapp_viber
Applied mapping to column: channel_social_media
Applied mapping to column: channel_local_council
Applied mapping to column: channel_other_flag

Example: First 5 values of 'service_general_clinic':


0    No
1    No
2    No
3    No
4    No
Name: service_general_clinic, dtype: object

In [317]:
# Print value counts for each binary question
if df is not None and 'binary_columns' in locals():
    print("\nSummary of binary questions (Yes/No counts):")
    for col in binary_columns:
        if col in df.columns:
            print(f"\n--- {col} ---")
            print(df[col].value_counts(dropna=False)) # include NaN counts
        else:
             print(f"\n--- Warning: Column {col} not found for summary. ---")


Summary of binary questions (Yes/No counts):

--- service_general_clinic ---
service_general_clinic
No     573
Yes    163
Name: count, dtype: int64

--- service_gynecology ---
service_gynecology
No     473
Yes    263
Name: count, dtype: int64

--- service_pediatric ---
service_pediatric
No     534
Yes    202
Name: count, dtype: int64

--- service_nutrition ---
service_nutrition
No     714
Yes     22
Name: count, dtype: int64

--- service_other_flag ---
service_other_flag
No     656
Yes     80
Name: count, dtype: int64

--- Which service/s did you receive today?/5 ---
Which service/s did you receive today?/5
NaN    489
No     211
Yes     36
Name: count, dtype: int64

--- charged_service_general_clinic ---
charged_service_general_clinic
NaN    733
Yes      2
No       1
Name: count, dtype: int64

--- charged_service_gynecology ---
charged_service_gynecology
NaN    733
No       3
Name: count, dtype: int64

--- charged_service_pediatric ---
charged_service_pediatric
NaN    733
No       2
Y

## Standardize Facility Rating Columns
Clean and map categorical facility ratings (e.g., "Excellent", "Good") to numerical scores.

In [318]:
# Define the facility rating columns to standardize
facility_rating_columns = [
    'facility_cleanliness_rating', 'facility_condition_rating', 'facility_info_displayed_rating',
    'facility_private_talk_spaces_rating', 'facility_ease_of_movement_rating', 'facility_waiting_comfort_rating'
]

# --- IMPORTANT: Define the mapping based on the actual values in your data --- 
# Example mapping (adjust keys and values as needed):
rating_map = {
    'Excellent': 3, 
    'Good': 2, 
    'Fair': 1, # Example: Added Fair
    'Poor': 0  # Example: Mapping Poor to 0
    # Add other potential values found in your columns
}

# Check if columns exist and show unique values before mapping
if df is not None:
    print("Checking columns and unique values before mapping:")
    valid_rating_columns = []
    for col in facility_rating_columns:
        if col in df.columns:
            valid_rating_columns.append(col)
            print(f"\n--- Unique values in '{col}' ---")
            print(df[col].unique())
        else:
            print(f"Warning: Column '{col}' not found in DataFrame.")
    facility_rating_columns = valid_rating_columns # Update list to only existing columns

Checking columns and unique values before mapping:

--- Unique values in 'facility_cleanliness_rating' ---
['Great' 'Good' 'Okay' nan]

--- Unique values in 'facility_condition_rating' ---
['Good' 'Okay' 'Great' nan]

--- Unique values in 'facility_info_displayed_rating' ---
['Okay' 'Good' 'Great' nan]

--- Unique values in 'facility_private_talk_spaces_rating' ---
['Okay' 'Good' 'Great' nan]

--- Unique values in 'facility_ease_of_movement_rating' ---
['Good' 'Okay' 'Great' nan]

--- Unique values in 'facility_waiting_comfort_rating' ---
['Good' 'Okay' 'Great' nan]


In [319]:
# Apply the mapping to standardize the columns
if df is not None and facility_rating_columns:
    print("\nApplying mapping...")
    for col in facility_rating_columns:
        # Store original values for comparison (optional)
        # original_values = df[col].copy()
        
        # Apply the map. Values not in the map's keys will become NaN.
        df[col] = df[col].map(rating_map)
        print(f"Mapped column: '{col}'")
        
        # Check for values that became NaN because they weren't in the map
        # unexpected_nan_count = df[col].isnull().sum() - original_values.isnull().sum()
        # if unexpected_nan_count > 0:
        #     print(f"  Warning: {unexpected_nan_count} values in '{col}' were not in the rating_map and became NaN.")

    print("\nMapping complete.")
    
    # Verify the changes by checking data types and unique values again
    print("\nData types after mapping:")
    print(df[facility_rating_columns].dtypes)
    
    print("\nUnique values after mapping (for one column):")
    if facility_rating_columns:
        print(df[facility_rating_columns[0]].unique())


Applying mapping...
Mapped column: 'facility_cleanliness_rating'
Mapped column: 'facility_condition_rating'
Mapped column: 'facility_info_displayed_rating'
Mapped column: 'facility_private_talk_spaces_rating'
Mapped column: 'facility_ease_of_movement_rating'
Mapped column: 'facility_waiting_comfort_rating'

Mapping complete.

Data types after mapping:
facility_cleanliness_rating            float64
facility_condition_rating              float64
facility_info_displayed_rating         float64
facility_private_talk_spaces_rating    float64
facility_ease_of_movement_rating       float64
facility_waiting_comfort_rating        float64
dtype: object

Unique values after mapping (for one column):
[nan  2.]


In [320]:
# Check missing values again after mapping
if df is not None and facility_rating_columns:
    print("\nMissing values in standardized columns after mapping:")
    print(df[facility_rating_columns].isnull().sum())


Missing values in standardized columns after mapping:
facility_cleanliness_rating            234
facility_condition_rating              193
facility_info_displayed_rating         242
facility_private_talk_spaces_rating    205
facility_ease_of_movement_rating       211
facility_waiting_comfort_rating        224
dtype: int64


## Impute Missing Facility Ratings
Impute missing values in the facility rating columns using a drill-down approach:
1. Fill missing values with the median rating for the specific `health_facility`.
2. If a `health_facility` has no ratings for a specific aspect (all NaN), fill the remaining NaNs with the overall median for that rating column across all facilities.

In [321]:
facility_rating_cols_to_impute = [
    'facility_cleanliness_rating', 
    'facility_condition_rating', 
    'facility_info_displayed_rating', 
    'facility_private_talk_spaces_rating', 
    'facility_ease_of_movement_rating', 
    'facility_waiting_comfort_rating'
]

if df is not None:
    print("Imputing missing values for facility ratings...")
    for col in facility_rating_cols_to_impute:
        if col in df.columns:
            print(f'  Imputing {col}...')
            # Ensure the column contains only numeric data
            df[col] = pd.to_numeric(df[col], errors='coerce')
            
            # Calculate median per health facility
            facility_median = df.groupby('health_facility')[col].transform('median')
            # Fill NaNs using facility median
            df[col] = df[col].fillna(facility_median)
            
            # Calculate overall median for the column
            overall_median = df[col].median()
            # Fill any remaining NaNs (if a facility had all NaNs) with the overall median
            df[col] = df[col].fillna(overall_median)
            print(f'    Missing values remaining in {col}: {df[col].isnull().sum()}') # Should be 0
        else:
            print(f'  Column {col} not found, skipping imputation.')
    print("Imputation complete.")
else:
    print("DataFrame 'df' not loaded. Cannot perform imputation.")

Imputing missing values for facility ratings...
  Imputing facility_cleanliness_rating...
    Missing values remaining in facility_cleanliness_rating: 0
  Imputing facility_condition_rating...
    Missing values remaining in facility_condition_rating: 0
  Imputing facility_info_displayed_rating...
    Missing values remaining in facility_info_displayed_rating: 0
  Imputing facility_private_talk_spaces_rating...
    Missing values remaining in facility_private_talk_spaces_rating: 0
  Imputing facility_ease_of_movement_rating...
    Missing values remaining in facility_ease_of_movement_rating: 0
  Imputing facility_waiting_comfort_rating...
    Missing values remaining in facility_waiting_comfort_rating: 0
Imputation complete.
    Missing values remaining in facility_condition_rating: 0
  Imputing facility_info_displayed_rating...
    Missing values remaining in facility_info_displayed_rating: 0
  Imputing facility_private_talk_spaces_rating...
    Missing values remaining in facility_pr

## Check Distribution of Facility Ratings After Imputation
Display the value counts for each facility rating column to understand their distribution after imputation.

In [322]:
facility_rating_cols_to_check = [
    'facility_cleanliness_rating', 
    'facility_condition_rating', 
    'facility_info_displayed_rating', 
    'facility_private_talk_spaces_rating', 
    'facility_ease_of_movement_rating', 
    'facility_waiting_comfort_rating'
]

if df is not None:
    print("Value Counts for Facility Rating Columns After Imputation:")
    for col in facility_rating_cols_to_check:
        if col in df.columns:
            print(f"\n--- {col} ---")
            print(df[col].value_counts().sort_index())
        else:
            print(f"\n--- Column {col} not found ---")
else:
    print("DataFrame 'df' not loaded. Cannot check distributions.")

Value Counts for Facility Rating Columns After Imputation:

--- facility_cleanliness_rating ---
facility_cleanliness_rating
2.0    736
Name: count, dtype: int64

--- facility_condition_rating ---
facility_condition_rating
2.0    736
Name: count, dtype: int64

--- facility_info_displayed_rating ---
facility_info_displayed_rating
2.0    736
Name: count, dtype: int64

--- facility_private_talk_spaces_rating ---
facility_private_talk_spaces_rating
2.0    736
Name: count, dtype: int64

--- facility_ease_of_movement_rating ---
facility_ease_of_movement_rating
2.0    736
Name: count, dtype: int64

--- facility_waiting_comfort_rating ---
facility_waiting_comfort_rating
2.0    736
Name: count, dtype: int64
