# Healthcare Utilization and Medicine Availability in Kakuma Refugee Camp

### Cleaning and Transforming Data

In [1]:
#importing data
import pandas as pd
import numpy as np
import re

In [2]:
# Load the dataset
df = pd.read_csv('datasets/kakuma_healthcare_visits.csv')

df.head(10)

Unnamed: 0,Visit_ID,Gender,Age,Zone,Clinic,Visit_Reason,Visit_Date,Medicine_Availability
0,VS0001,Male,63.0,,Kakuma Health Center,Respiratory Infection,2025-06-27,Shortage
1,VS0002,Female,17.0,Kakuma 3,Kalobeyei Clinic,Respiratory Infection,2025-06-04,Shortage
2,VS0003,Male,33.0,,Kalobeyei Clinic,Cough,2025-06-07,Available
3,VS0004,Male,29.0,,Kalobeyei Clinic,Skin Infection,2025-06-22,Available
4,VS0005,Male,13.0,,UNHCR Mobile Unit,Check-up,2025-06-10,Available
5,VS0006,,46.0,Kakuma 2,Kalobeyei Clinic,Check-up,,Out of Stock
6,VS0007,Male,35.0,Kakuma 1,Kakuma Health Center,Maternity,2025-06-21,Shortage
7,VS0008,Male,6.0,Kalobeyei,UNHCR Mobile Unit,Wound,2025-06-15,Out of Stock
8,VS0009,Male,69.0,Kakuma 4,Kalobeyei Clinic,Respiratory Infection,2025-06-15,
9,VS0010,Female,47.0,Kakuma 2,UNHCR Mobile Unit,Cough,2025-06-04,Available


In [3]:
def clean_healthcare_data(df):
    """
    Comprehensive data cleaning for healthcare dataset
    """

In [4]:
df_clean = df.copy()

In [5]:
print("Starting data cleaning process...")
print(f"Initial data shape: {df.shape}")


Starting data cleaning process...
Initial data shape: (1000, 8)


In [6]:
# Preview the dataset

print("\n Dataset Preview:")
df.head()


 Dataset Preview:


Unnamed: 0,Visit_ID,Gender,Age,Zone,Clinic,Visit_Reason,Visit_Date,Medicine_Availability
0,VS0001,Male,63.0,,Kakuma Health Center,Respiratory Infection,2025-06-27,Shortage
1,VS0002,Female,17.0,Kakuma 3,Kalobeyei Clinic,Respiratory Infection,2025-06-04,Shortage
2,VS0003,Male,33.0,,Kalobeyei Clinic,Cough,2025-06-07,Available
3,VS0004,Male,29.0,,Kalobeyei Clinic,Skin Infection,2025-06-22,Available
4,VS0005,Male,13.0,,UNHCR Mobile Unit,Check-up,2025-06-10,Available


# 1. Data Cleaning

###### a) Check for missing values

In [7]:
# a) Check for missing values

print("\n Missing Values:")
df.isnull().sum()


 Missing Values:


Visit_ID                   0
Gender                   100
Age                      100
Zone                     100
Clinic                   100
Visit_Reason             100
Visit_Date               100
Medicine_Availability    100
dtype: int64

###### b) Handle missing data

In [8]:
# b) Handle missing data
# Fill Age with median per clinic
median_age= df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)

df.head(5)

Unnamed: 0,Visit_ID,Gender,Age,Zone,Clinic,Visit_Reason,Visit_Date,Medicine_Availability
0,VS0001,Male,63.0,,Kakuma Health Center,Respiratory Infection,2025-06-27,Shortage
1,VS0002,Female,17.0,Kakuma 3,Kalobeyei Clinic,Respiratory Infection,2025-06-04,Shortage
2,VS0003,Male,33.0,,Kalobeyei Clinic,Cough,2025-06-07,Available
3,VS0004,Male,29.0,,Kalobeyei Clinic,Skin Infection,2025-06-22,Available
4,VS0005,Male,13.0,,UNHCR Mobile Unit,Check-up,2025-06-10,Available


In [9]:
# Fill missing medicines with 'None'

df['Medicine_Availability'] = df['Medicine_Availability'].ffill()

df.Medicine_Availability

0          Shortage
1          Shortage
2         Available
3         Available
4         Available
           ...     
995    Out of Stock
996       Available
997    Out of Stock
998       Available
999    Out of Stock
Name: Medicine_Availability, Length: 1000, dtype: object

In [10]:
# Fill missing Visit_Reason empty values using backward fill
df['Visit_Reason'] = df['Visit_Reason'].ffill()

df['Gender'] = df['Gender'].ffill()

df['Visit_Reason'] = df['Visit_Reason'].ffill()

df['Visit_Date'] = df['Visit_Date'].ffill()

df['Zone'] = df['Zone'].bfill()

df['Clinic'] = df['Clinic'].bfill()


In [11]:
# Verify missing after cleaning

print("\n Missing Values After Cleaning:")
df.isnull().sum()


 Missing Values After Cleaning:


Visit_ID                 0
Gender                   0
Age                      0
Zone                     0
Clinic                   0
Visit_Reason             0
Visit_Date               0
Medicine_Availability    0
dtype: int64

###### c) Standardize Visit_Date

In [12]:
# c) Standardize Visit_Date

df['Visit_Date'] = pd.to_datetime(df['Visit_Date'], errors='coerce', format='%Y-%m-%d')
invalid_dates = df['Visit_Date'].isnull().sum()
if invalid_dates > 0:
    print(f"\n Found {invalid_dates} invalid Visit_Date entries. Dropping them.")
    df = df.dropna(subset=['Visit_Date'])

In [13]:
# Confirm date type

print("\n Visit_Date type:", df['Visit_Date'].dtype)


 Visit_Date type: datetime64[ns]


###### c) Categorize illness types

In [14]:
# d) Clean and normalize text fields

def clean_text(col):
    return (
        col.astype(str)
           .str.strip()
           .str.title()
           .str.normalize('NFKC')
           .str.replace(r'\s+', ' ', regex=True)
    )

text_cols = ['Clinic', 'Visit_Reason', 'Medicine_Availability']
for col in text_cols:
    df[col] = clean_text(df[col])

print("\n Sample Clinic:", df['Clinic'].unique()[:5])


 Sample Clinic: ['Kakuma Health Center' 'Kalobeyei Clinic' 'Unhcr Mobile Unit']


# 2. Data Transformation

###### a) Temporal fields


In [15]:
# Add week, month, and day

df['Week'] = df['Visit_Date'].dt.to_period('W').apply(lambda r: r.start_time)
df['Month'] = df['Visit_Date'].dt.to_period('M').astype(str)
df['Day'] = df['Visit_Date'].dt.date

In [16]:
# Show weekly and monthly samples

print("\n Weekly visits sample:")
print(df.groupby('Week').size().head())

print("\n Monthly visits sample:")
print(df.groupby('Month').size().head())


 Weekly visits sample:
Week
2025-05-26     21
2025-06-02    212
2025-06-09    225
2025-06-16    231
2025-06-23    233
dtype: int64

 Monthly visits sample:
Month
2025-06    953
2025-07     47
dtype: int64


###### b) Zone and Clinic grouping

In [17]:
# b) Zone and Clinic grouping

zone_clinic = (
    df.groupby(['Zone', 'Clinic'])
      .agg(Total_Visits=('Visit_Date', 'count'), Avg_Age=('Age', 'mean'))
      .reset_index()
)
print("\n Zone-Clinic Summary Sample:")
print(zone_clinic.head())


 Zone-Clinic Summary Sample:
       Zone                Clinic  Total_Visits    Avg_Age
0  Kakuma 1  Kakuma Health Center            65  37.153846
1  Kakuma 1      Kalobeyei Clinic            49  39.591837
2  Kakuma 1     Unhcr Mobile Unit            72  34.000000
3  Kakuma 2  Kakuma Health Center            56  35.696429
4  Kakuma 2      Kalobeyei Clinic            74  32.256757


In [18]:
# Clean Aggregated Data (zone_clinic)
zone_clinic['Zone'] = zone_clinic['Zone'].fillna(method='ffill').fillna(method='bfill')
zone_clinic['Clinic'] = zone_clinic['Clinic'].fillna(method='ffill').fillna(method='bfill')
zone_clinic['Avg_Age'] = zone_clinic['Avg_Age'].round(1)


  zone_clinic['Zone'] = zone_clinic['Zone'].fillna(method='ffill').fillna(method='bfill')
  zone_clinic['Clinic'] = zone_clinic['Clinic'].fillna(method='ffill').fillna(method='bfill')


###### c) Categorize illness types

In [19]:
# c) Categorize illness types

def map_illness(reason):
    reason = reason.lower()
    patterns = {
        'Respiratory': r'(cough|flu|cold|breath|chest|asthma|pneumonia)',
        'Gastrointestinal': r'(diarrhea|vomit|stomach|abdomen|typhoid)',
        'Injury': r'(cut|wound|fracture|injury|accident)',
        'Fever/Malaria': r'(malaria|fever|chills)',
        'Reproductive': r'(pregnancy|antenatal|birth|labour)',
        'Other': r'.*'
    }
    for category, pattern in patterns.items():
        if re.search(pattern, reason):
            return category
    return 'Other'

df['Illness_Category'] = df['Visit_Reason'].apply(map_illness)
print("\n Illness Category Distribution:")
print(df['Illness_Category'].value_counts())


 Illness Category Distribution:
Illness_Category
Other               486
Fever/Malaria       140
Respiratory         131
Gastrointestinal    129
Injury              114
Name: count, dtype: int64


In [20]:
# d) Extract and count medicines

df['Medicine_List'] = df['Medicine_Availability'].str.split(',').apply(
    lambda lst: [m.strip().title() for m in lst if m.strip() and m.lower() != 'none']
)

med_df = df.explode('Medicine_List')
med_df = med_df[med_df['Medicine_List'] != '']
med_freq = med_df['Medicine_List'].value_counts().reset_index()
med_freq.columns = ['Medicine', 'Times_Dispensed']
print("\n Top 10 Dispensed Medicines:")
med_freq.head(10)


 Top 10 Dispensed Medicines:


Unnamed: 0,Medicine,Times_Dispensed
0,Out Of Stock,355
1,Available,325
2,Shortage,320


In [21]:
# e) Pivot table by illness and zone

pivot = pd.pivot_table(
    df,
    values='Visit_Date',
    index='Zone',
    columns='Illness_Category',
    aggfunc='count',
    fill_value=0
)
print("\n Pivot Table (Zone vs Illness Category):")
pivot.head()


 Pivot Table (Zone vs Illness Category):


Illness_Category,Fever/Malaria,Gastrointestinal,Injury,Other,Respiratory
Zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kakuma 1,27,24,20,90,25
Kakuma 2,29,14,28,92,36
Kakuma 3,27,30,20,93,19
Kakuma 4,27,33,24,113,22
Kalobeyei,30,28,22,98,29


In [22]:
df.head()

Unnamed: 0,Visit_ID,Gender,Age,Zone,Clinic,Visit_Reason,Visit_Date,Medicine_Availability,Week,Month,Day,Illness_Category,Medicine_List
0,VS0001,Male,63.0,Kakuma 3,Kakuma Health Center,Respiratory Infection,2025-06-27,Shortage,2025-06-23,2025-06,2025-06-27,Other,[Shortage]
1,VS0002,Female,17.0,Kakuma 3,Kalobeyei Clinic,Respiratory Infection,2025-06-04,Shortage,2025-06-02,2025-06,2025-06-04,Other,[Shortage]
2,VS0003,Male,33.0,Kakuma 2,Kalobeyei Clinic,Cough,2025-06-07,Available,2025-06-02,2025-06,2025-06-07,Respiratory,[Available]
3,VS0004,Male,29.0,Kakuma 2,Kalobeyei Clinic,Skin Infection,2025-06-22,Available,2025-06-16,2025-06,2025-06-22,Other,[Available]
4,VS0005,Male,13.0,Kakuma 2,Unhcr Mobile Unit,Check-Up,2025-06-10,Available,2025-06-09,2025-06,2025-06-10,Other,[Available]


In [25]:
df.to_csv('datasets/cleaned_kakuma_healthcare_visits.csv')