# Preprocessing

**Objective:** Clean, encode, and prepare the raw clinical data for analysis.

## 1. Setup & Data Loading

In [None]:
# Import required libraries
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

In [30]:
# Load Data
try:
    df = pd.read_csv("../1_datasets/raw_data.csv")
except FileNotFoundError:
    # Fallback for different execution contexts
    df = pd.read_csv("raw_data.csv")

print(f"Initial Shape: {df.shape}")
df.head()

Initial Shape: (135, 11)


Unnamed: 0,date_of_visit,age,gender,symptoms,History,status,pulse_rate(b/m),blood_pressure,rapid_diagnostic_test,weight(kg),height(m)
0,2023-01-11,80.0,Female,Knee joint pain; Waist pain; General body pain...,Hx of Hypertension,Hypertension,84.0,160/96,,,1.50m
1,2023-01-16,84.0,Male,Back pain,,,86.0,152/80,,,
2,2023-01-18,72.0,Female,Hand pain; Neck pain,,Hypertension,94.0,163/88,negative,,
3,2023-01-23,61.0,Male,Waist pain; Numb legs,,,71.0,120/91,,,1.62m
4,2023-02-01,69.0,Female,Had malaria symptoms 4 days ago,Hx of Hypertension,Hypertension,114.0,136/74,,,


## 2. Initial Exploration & Formatting

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date_of_visit          132 non-null    object 
 1   age                    132 non-null    float64
 2   gender                 131 non-null    object 
 3   symptoms               125 non-null    object 
 4   History                49 non-null     object 
 5   status                 63 non-null     object 
 6   pulse_rate(b/m)        120 non-null    float64
 7   blood_pressure         131 non-null    object 
 8   rapid_diagnostic_test  38 non-null     object 
 9   weight(kg)             92 non-null     object 
 10  height(m)              98 non-null     object 
dtypes: float64(2), object(9)
memory usage: 11.7+ KB


In [32]:
# Change date column to datetime format
df["date_of_visit"] = pd.to_datetime(df["date_of_visit"], errors="coerce")

# Sort df by date
df = df.sort_values(by="date_of_visit", ascending=True).reset_index(drop=True)

In [33]:
# Generate unique IDs for patient records
df.insert(0, "patient_id", ["P" + str(i).zfill(3) for i in range(1, len(df) + 1)])

##3. Missingness Analysis

In [34]:
# Check missing values
df.isna().sum()

Unnamed: 0,0
patient_id,0
date_of_visit,3
age,3
gender,4
symptoms,10
History,86
status,72
pulse_rate(b/m),15
blood_pressure,4
rapid_diagnostic_test,97


**Observation:** High missingness in `rapid_diagnostic_test`. We will drop this column as it is not relevant to hypertension analysis.

In [35]:
# Drop Irrelevant Columns
if "rapid_diagnostic_test" in df.columns:
    df.drop(columns=["rapid_diagnostic_test"], inplace=True)
print(df.columns)

Index(['patient_id', 'date_of_visit', 'age', 'gender', 'symptoms', 'History',
       'status', 'pulse_rate(b/m)', 'blood_pressure', 'weight(kg)',
       'height(m)'],
      dtype='object')


In [36]:
# Inspect missing values
df[df[["gender", "age"]].isna().any(axis=1)]

Unnamed: 0,patient_id,date_of_visit,age,gender,symptoms,History,status,pulse_rate(b/m),blood_pressure,weight(kg),height(m)
22,P023,2023-07-10,,,Headaches; Body pains; Fever; Abd discomfort,,,,127/83,,
90,P091,2024-06-10,69.0,,Hx of Hypertension,Hx of Hypertension,Hypertension,77.0,185/89,73kg,1.45m
94,P095,2024-06-26,,,Pains at the left leg,,,48.0,107/43,48kg,1.55m
133,P134,NaT,,,Hx of Hypertension; Headaches; fatigue,Hx of Hypertension,Hypertension,103.0,157/93,83kg,1.57m


**Context:**
Specific records (P023, P091, P134, etc.) were identified during manual review as having missing but recoverable data.


In [37]:
# Add missing gender values
df.loc[df["patient_id"].isin(["P023", "P091", "P134"]), "gender"] = "Female"

**Context:** P134 was identified as female based on the most common gender and height, which falls within the typical female range.

In [38]:
# Add missing age values
df.loc[df["patient_id"] == "P023", "age"] = 70.0

In [39]:
# Inspect impossible ages
df[(df["age"] > 90) | (df["age"] < 40)]

Unnamed: 0,patient_id,date_of_visit,age,gender,symptoms,History,status,pulse_rate(b/m),blood_pressure,weight(kg),height(m)
110,P111,2024-09-23,1964.0,male,Fatigue; Weakness; Body pains; Back pains,,Hypertension,,160/100,66kg,1.70m
124,P125,2024-12-02,1971.0,female,Hx of Hypertension; Numbness; leg pain; Headaches,Hx of Hypertension,Hypertension,75.0,201/97,,
125,P126,2024-12-11,1958.0,female,Body pain; sleepiness; Hx of hypertension,Hx of Hypertension,Hypertension,88.0,188/94,58kg,1.55m
126,P127,2024-12-12,1954.0,female,Pain at the waist; Pain in the leg; Pains on t...,Hx of Hypertension,Hypertension,88.0,169/90,70kg,1.45m
134,P135,NaT,1964.0,female,General body Pains,,Hypertension,82.0,202/100,,


In [40]:
# Fix wrong values
df.loc[df["patient_id"] == "P111", "age"] = 60.0
df.loc[df["patient_id"] == "P125", "age"] = 53.0
df.loc[df["patient_id"] == "P126", "age"] = 66.0
df.loc[df["patient_id"] == "P127", "age"] = 70.0
df.loc[df["patient_id"] == "P135", "age"] = 60.0

**Context:** This record was flagged manually.

In [41]:
# Drop flagged record
df.drop(index=94, inplace=True)

##4. Cleaning & Standardisation

In [42]:
# Display unique values
df["gender"].unique()

array(['Female', 'Male', 'female', 'male'], dtype=object)

In [43]:
# Convert to lowercase
df["gender"] = df["gender"].str.lower()

df["gender"].unique()

array(['female', 'male'], dtype=object)

In [44]:
# Clean Numerical Columns (Remove units like 'kg', 'm')
def clean_numeric(val, unit):
    if pd.isna(val):
        return val
    return str(val).lower().replace(unit, "").strip()


if "weight(kg)" in df.columns:
    df["weight(kg)"] = df["weight(kg)"].apply(lambda x: clean_numeric(x, "kg"))
    df["weight(kg)"] = pd.to_numeric(df["weight(kg)"], errors="coerce")

if "height(m)" in df.columns:
    df["height(m)"] = df["height(m)"].apply(lambda x: clean_numeric(x, "m"))
    df["height(m)"] = pd.to_numeric(df["height(m)"], errors="coerce")

print("Cleaned Weight and Height columns.")

Cleaned Weight and Height columns.


In [45]:
# Split Blood Pressure
if "blood_pressure" in df.columns:
    # Extract Systolic/Diastolic using regex or string split
    # (Assuming format '120/80')
    df[["Systolic", "Diastolic"]] = df["blood_pressure"].str.split("/", expand=True)
    df["Systolic"] = pd.to_numeric(df["Systolic"], errors="coerce")
    df["Diastolic"] = pd.to_numeric(df["Diastolic"], errors="coerce")
    print("Split Blood Pressure into Systolic/Diastolic.")

Split Blood Pressure into Systolic/Diastolic.


In [46]:
df.head()

Unnamed: 0,patient_id,date_of_visit,age,gender,symptoms,History,status,pulse_rate(b/m),blood_pressure,weight(kg),height(m),Systolic,Diastolic
0,P001,2023-01-11,80.0,female,Knee joint pain; Waist pain; General body pain...,Hx of Hypertension,Hypertension,84.0,160/96,,1.5,160.0,96.0
1,P002,2023-01-16,84.0,male,Back pain,,,86.0,152/80,,,152.0,80.0
2,P003,2023-01-18,72.0,female,Hand pain; Neck pain,,Hypertension,94.0,163/88,,,163.0,88.0
3,P004,2023-01-23,61.0,male,Waist pain; Numb legs,,,71.0,120/91,,1.62,120.0,91.0
4,P005,2023-02-01,69.0,female,Had malaria symptoms 4 days ago,Hx of Hypertension,Hypertension,114.0,136/74,,,136.0,74.0


In [47]:
df.isna().sum()

Unnamed: 0,0
patient_id,0
date_of_visit,3
age,1
gender,0
symptoms,10
History,85
status,71
pulse_rate(b/m),15
blood_pressure,4
weight(kg),43


##5. Encoding Categorical Variables

In [48]:
df["History"].unique()

array(['Hx of Hypertension', nan, 'No Hx of Hypertension',
       'No hx of Hypertension'], dtype=object)

In [49]:
# Encode History
if "History" in df.columns:
    df["History_category"] = (
        df["History"]
        .map(
            {
                "Hx of Hypertension": "present",
                "No Hx of Hypertension": "absent",
                "No hx of Hypertension": "absent",
            }
        )
        .fillna("missing")
    )

    df["history_encoded"] = df["History_category"].map(
        {"present": 1, "absent": 0, "missing": -1}
    )
    print("Encoded History.")

Encoded History.


In [50]:
# Encode Status (Target)
if "status" in df.columns:
    df["status_encoded"] = df["status"].apply(
        lambda x: 1 if str(x).lower() == "hypertension" else 0
    )
    print("Encoded Status.")

Encoded Status.


In [51]:
# Encode Gender
if "gender" in df.columns:
    df["gender_encoded"] = df["gender"].str.lower().map({"male": 1, "female": 0})
    print("Encoded Gender.")

Encoded Gender.


##6. Conditional Imputation

**Method:** Fill missing Systolic/Diastolic BP with the median of their respective class (Hypertensive vs. Healthy).

In [52]:
# Conditional Median Imputation for BP
for col in ["Systolic", "Diastolic"]:
    if col in df.columns:
        # Calculate median for Healthy (0) and Hypertensive (1)
        medians = df.groupby("status_encoded")[col].median()

        # Fill based on row status
        df[col] = df.apply(
            lambda row: medians[row["status_encoded"]]
            if pd.isnull(row[col])
            else row[col],
            axis=1,
        )
print("Imputed BP values using class medians.")

Imputed BP values using class medians.


##7. Standardise Columns

In [53]:
# Drop irrelevant Columns
cols_to_drop = [
    "patient_id",
    "date_of_visit",
    "History_category",
    "blood_pressure",
    "symptoms",
    "History",
    "gender",
    "status",
]
df_final = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

print("Final Columns:", df_final.columns.tolist())
df_final.info()

Final Columns: ['age', 'pulse_rate(b/m)', 'weight(kg)', 'height(m)', 'Systolic', 'Diastolic', 'history_encoded', 'status_encoded', 'gender_encoded']
<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, 0 to 134
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              133 non-null    float64
 1   pulse_rate(b/m)  119 non-null    float64
 2   weight(kg)       91 non-null     float64
 3   height(m)        79 non-null     float64
 4   Systolic         134 non-null    float64
 5   Diastolic        134 non-null    float64
 6   history_encoded  134 non-null    int64  
 7   status_encoded   134 non-null    int64  
 8   gender_encoded   134 non-null    int64  
dtypes: float64(6), int64(3)
memory usage: 10.5 KB


In [54]:
# Rename columns
df = df.rename(
    columns={
        "status_encoded": "hypertension",
        "gender_encoded": "gender",
        "history_encoded": "history_hypertension",
        "pulse_rate(b/m)": "pulse_bpm",
        "height(m)": "height_m",
        "weight(kg)": "weight_kg",
        "Systolic": "systolic_bp",
        "Diastolic": "diastolic_bp",
    }
)

## 8. Export

In [58]:
# Save Processed Data
output_file = "../1_datasets/encoded_data.csv"
df_final.to_csv(output_file, index=False)