# Initial Data Exploration
This notebook performs initial exploration of the cleaned patient survey data.
Steps include: loading the data, viewing the first few rows, checking dimensions, listing columns, checking for missing values, and examining data types.

In [1]:
# Import pandas library
import pandas as pd

In [2]:
# Load the dataset
# Adjust the path if your notebook is in a different location relative to the data folder
file_path = '../data/cleaned/cleaned_patient_survey_data.csv'
try:
    df = pd.read_csv(file_path)
    print(f"Successfully loaded {file_path}")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
    # You might want to stop execution here or handle the error appropriately
    df = None # Set df to None if file not found

# Display the first 5 rows (only if df was loaded successfully)
if df is not None:
    print("\nFirst 5 rows of the dataset:")
    display(df.head())

Successfully loaded ../data/cleaned/cleaned_patient_survey_data.csv

First 5 rows of the dataset:


Unnamed: 0,governorate,district,sub_district,community,grant,health_facility,other_health_facility,observer_name,patient_age_group,respondent_sex,...,has_other_feedback,other_feedback_text,consent_date,consent_year,consent_quarter,consent_month,consent_day,consent_weekday,consent_month_name,consent_weekday_name
0,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Male,female,...,no,,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday
1,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Female,female,...,no,,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday
2,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Male,female,...,yes,ارجو تصليح جهاز الاشعة,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday
3,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Male,female,...,no,,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday
4,Aleppo,A'zaz,Aghtrin,Akhtrein,,Akhtarin Hospital,,محمد الدرويش,Child Female,female,...,no,,2024-11-17,2024.0,4.0,11.0,17.0,6.0,November,Sunday


In [3]:
# Show the number of rows and columns (only if df was loaded successfully)
if df is not None:
    print(f"\nDataset shape (rows, columns): {df.shape}")


Dataset shape (rows, columns): (736, 106)


In [4]:
# List column names (only if df was loaded successfully)
if df is not None:
    print("\nColumn names:")
    print(list(df.columns))


Column names:
['governorate', 'district', 'sub_district', 'community', 'grant', 'health_facility', 'other_health_facility', 'observer_name', 'patient_age_group', 'respondent_sex', 'respondent_age_years', 'patient_age_years', 'is_first_visit', 'marital_status', 'num_children', 'oldest_child_age', 'difficulty_seeing', 'difficulty_hearing', 'difficulty_walking', 'difficulty_communicating', 'difficulty_remembering', 'facility_access_easy', 'facility_access_easy_reason', 'facility_access_not_easy_reason', 'facility_access_other_reason', 'service_discovery_method', 'service_discovery_other', 'facility_selection_reason', 'facility_selection_other', 'overall_service_quality', 'service_quality_bad_reason', 'services_received_list', 'service_general_clinic', 'service_gynecology', 'service_pediatric', 'service_nutrition', 'service_other_flag', 'Which service/s did you receive today?/5', 'birth_services', 'specify_others', 'was_charged_for_service', 'charged_service_list', 'charged_service_genera

In [5]:
# Check for missing values (only if df was loaded successfully)
if df is not None:
    print("\nMissing values per column:")
    print(df.isnull().sum())


Missing values per column:
governorate               1
district                  1
sub_district              1
community                 1
grant                   489
                       ... 
consent_month             1
consent_day               1
consent_weekday           1
consent_month_name        1
consent_weekday_name      1
Length: 106, dtype: int64


In [6]:
# Show data types of each column (only if df was loaded successfully)
if df is not None:
    print("\nData types of each column:")
    print(df.dtypes)


Data types of each column:
governorate              object
district                 object
sub_district             object
community                object
grant                    object
                         ...   
consent_month           float64
consent_day             float64
consent_weekday         float64
consent_month_name       object
consent_weekday_name     object
Length: 106, dtype: object
