Importing Libraries

In [20]:
import numpy as np
import pandas as pd

Loading the CSV file into a dataframe

In [21]:
# Read the raw hypertension dataset and preview its first rows.
raw_csv_path = "originaldata/hypertension.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
0,1,39,0,0.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


Labelling each record with a blood-pressure risk category

In [22]:
# Function to determine risk based on blood pressure values
def determine_risk(row):
    """Map one record's blood pressure readings to a risk category.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'sysBP'`` and ``'diaBP'`` entries.

    Returns
    -------
    str or float
        One of ``'Critical'``, ``'Stage 2'``, ``'Stage 1'``, ``'Elevated'``
        or ``'Normal'``. NaN is returned when either reading is missing, so
        an unknown blood pressure is never labelled ``'Normal'``.
    """
    sys_bp = row['sysBP']
    dia_bp = row['diaBP']

    # Every comparison against NaN is False, which would otherwise fall
    # through all branches below and end up as 'Normal'.
    if pd.isna(sys_bp) or pd.isna(dia_bp):
        return np.nan

    # Branches are ordered from most to least severe; the first match wins.
    if sys_bp > 180 or dia_bp > 120:
        return 'Critical'
    if sys_bp >= 140 or dia_bp >= 90:
        return 'Stage 2'
    if sys_bp >= 130 or dia_bp >= 80:
        return 'Stage 1'
    # Reaching this point already implies dia_bp < 80, so only the systolic
    # reading separates 'Elevated' from 'Normal'.
    if sys_bp >= 120:
        return 'Elevated'
    return 'Normal'

# Classify every row, overwriting the existing Risk column with the
# category returned by determine_risk.
risk_labels = df.apply(determine_risk, axis='columns')
df['Risk'] = risk_labels

df.head()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
0,1,39,0,0.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,Normal
1,0,46,0,0.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,Stage 1
2,1,48,1,20.0,0.0,0,245.0,127.5,80.0,25.34,75.0,70.0,Stage 1
3,0,61,1,30.0,0.0,0,225.0,150.0,95.0,28.58,65.0,103.0,Stage 2
4,0,46,1,23.0,0.0,0,285.0,130.0,84.0,23.1,85.0,85.0,Stage 1


Counting the number of rows for each label in the Risk column

In [23]:
# Number of rows per risk category, most frequent first.
df['Risk'].value_counts()

Risk
Stage 2     1352
Stage 1     1284
Normal      1033
Elevated     410
Critical     161
Name: count, dtype: int64

Checking for null values in the dataframe and dropping the rows that contain them

In [26]:
# Missing values per column, followed by dtypes and non-null counts.
print(df.isnull().sum())
df.info()

# Drop every row that has at least one missing value and report how many
# rows that removed, so the data loss is visible in the cell output.
rows_before = len(df)
df = df.dropna()
print(f"Dropped {rows_before - len(df)} rows containing missing values")

# Kept as the last expression so the preview of the cleaned frame is
# actually displayed (mid-cell, its result was silently discarded).
df.head()


male               0
age                0
currentSmoker      0
cigsPerDay        29
BPMeds            53
diabetes           0
totChol           50
sysBP              0
diaBP              0
BMI               19
heartRate          1
glucose          388
Risk               0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   male           4240 non-null   int64  
 1   age            4240 non-null   int64  
 2   currentSmoker  4240 non-null   int64  
 3   cigsPerDay     4211 non-null   float64
 4   BPMeds         4187 non-null   float64
 5   diabetes       4240 non-null   int64  
 6   totChol        4190 non-null   float64
 7   sysBP          4240 non-null   float64
 8   diaBP          4240 non-null   float64
 9   BMI            4221 non-null   float64
 10  heartRate      4239 non-null   float64
 11  glucose        3852 non-null  

In [27]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN


# Features and target
X = df.drop('Risk', axis=1)
y = df['Risk']

# Rebalance the classes with SMOTEENN, which combines SMOTE oversampling with
# Edited Nearest Neighbours (ENN) cleaning in a single resampler.
# No separate SMOTE / RandomUnderSampler pass is run beforehand: SMOTEENN is
# fitted on the original X, y, so the result of any earlier resampling step
# would be overwritten here without ever being used.
smote_enn = SMOTEENN(sampling_strategy='auto', random_state=42)
X_res, y_res = smote_enn.fit_resample(X, y)

# Combine the resampled features and target back into a DataFrame
resampled_df = pd.concat([X_res, y_res], axis=1)

# Save the resampled dataset. The class sizes are not exactly equal
# (see the counts printed below).
resampled_df.to_csv('balanced_dataset.csv', index=False)

# Check the distribution of the new dataset
print(resampled_df['Risk'].value_counts())

Risk
Critical    1198
Elevated    1065
Normal       902
Stage 2      727
Stage 1      452
Name: count, dtype: int64


Saving Data Frame into CSV file

In [28]:
# Write df (not the resampled frame) to disk, leaving out the index column.
modified_csv_path = 'modifieddata/modified_hypertension_data.csv'
df.to_csv(modified_csv_path, index=False)

print("File saved as 'modified_hypertension_data.csv'")

File saved as 'modified_hypertension_data.csv'
