# Exploratory Data Analysis

Libraries

In [2]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import learning_curve
import joblib 
import seaborn as sns 
import warnings 
import matplotlib.pyplot as plt 

# Suppress every Python warning for the rest of the kernel session so that cell
# output stays uncluttered.
# NOTE(review): this is a blanket filter — it also hides deprecation notices and
# any warnings raised by pandas / scikit-learn. Consider narrowing it to the
# specific categories that are known to be noise.
warnings.filterwarnings("ignore")

Loading the Dataset

In [5]:
# Location of the raw CSV. The path is relative, so it is resolved against the
# kernel's current working directory.
file_path = "hypertension_dataset.csv"
# Read the whole file into memory with pandas' default parsing options; `df` is
# the frame inspected by the cells that follow.
df = pd.read_csv(file_path)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the dataset.
# With default arguments describe() reports numeric columns only, so the
# object-typed columns do not appear in the resulting table.
df.describe()

Unnamed: 0,Age,BMI,Cholesterol,Systolic_BP,Diastolic_BP,Alcohol_Intake,Stress_Level,Salt_Intake,Sleep_Duration,Heart_Rate,LDL,HDL,Triglycerides,Glucose
count,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0,174982.0
mean,53.493319,27.494672,224.539335,134.505229,89.450315,14.991069,5.013041,8.482757,6.996468,74.495085,129.528271,64.573105,149.569836,134.488187
std,20.786353,7.213866,43.361589,26.019268,17.310789,8.652624,2.582341,3.750074,1.730731,14.438573,34.635795,20.190666,57.765006,37.526882
min,18.0,15.0,150.0,90.0,60.0,0.0,1.0,2.0,4.0,50.0,70.0,30.0,50.0,70.0
25%,36.0,21.2,187.0,112.0,74.0,7.5,3.0,5.2,5.5,62.0,100.0,47.0,100.0,102.0
50%,54.0,27.5,225.0,135.0,89.0,15.0,5.0,8.5,7.0,74.0,130.0,65.0,150.0,134.0
75%,71.0,33.7,262.0,157.0,104.0,22.5,7.0,11.7,8.5,87.0,160.0,82.0,200.0,167.0
max,89.0,40.0,299.0,179.0,119.0,30.0,9.0,15.0,10.0,99.0,189.0,99.0,249.0,199.0


In [7]:
# Structural overview of the frame: index range, per-column non-null counts,
# and the dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174982 entries, 0 to 174981
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Country                  174982 non-null  object 
 1   Age                      174982 non-null  int64  
 2   BMI                      174982 non-null  float64
 3   Cholesterol              174982 non-null  int64  
 4   Systolic_BP              174982 non-null  int64  
 5   Diastolic_BP             174982 non-null  int64  
 6   Smoking_Status           174982 non-null  object 
 7   Alcohol_Intake           174982 non-null  float64
 8   Physical_Activity_Level  174982 non-null  object 
 9   Family_History           174982 non-null  object 
 10  Diabetes                 174982 non-null  object 
 11  Stress_Level             174982 non-null  int64  
 12  Salt_Intake              174982 non-null  float64
 13  Sleep_Duration           174982 non-null  float64
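 14  Heart_Rate               174982 non-null  int64  
 15  LDL                      174982 non-null  int64  
 16  HDL                      174982 non-null  int64  
 17  Triglycerides            174982 non-null  int64  
 18  Glucose                  174982 non-null  int64  
 19  Gender                   174982 non-null  object 
 20  Education_Level          174982 non-null  object 
 21  Employment_Status        174982 non-null  object 
 22  Hypertension             174982 non-null  object 
dtypes: float64(4), int64(10), object(9)
memory usage: 30.7+ MB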

In [8]:
# Data type of each column. `object` columns hold non-numeric (string-like)
# values and will need encoding before they can be fed to a model.
df.dtypes

Country                     object
Age                          int64
BMI                        float64
Cholesterol                  int64
Systolic_BP                  int64
Diastolic_BP                 int64
Smoking_Status              object
Alcohol_Intake             float64
Physical_Activity_Level     object
Family_History              object
Diabetes                    object
Stress_Level                 int64
Salt_Intake                float64
Sleep_Duration             float64
Heart_Rate                   int64
LDL                          int64
HDL                          int64
Triglycerides                int64
Glucose                      int64
Gender                      object
Education_Level             object
Employment_Status           object
Hypertension                object
dtype: object

In [9]:
# List the column labels exactly as they appear in the CSV header, for
# reference when selecting features by name.
df.columns

Index(['Country', 'Age', 'BMI', 'Cholesterol', 'Systolic_BP', 'Diastolic_BP',
       'Smoking_Status', 'Alcohol_Intake', 'Physical_Activity_Level',
       'Family_History', 'Diabetes', 'Stress_Level', 'Salt_Intake',
       'Sleep_Duration', 'Heart_Rate', 'LDL', 'HDL', 'Triglycerides',
       'Glucose', 'Gender', 'Education_Level', 'Employment_Status',
       'Hypertension'],
      dtype='object')

In [10]:
# Number of distinct values in each column. Columns with only a handful of
# distinct values are likely categorical — confirm before choosing an encoding.
df.nunique()

Country                     20
Age                         72
BMI                        251
Cholesterol                150
Systolic_BP                 90
Diastolic_BP                60
Smoking_Status               3
Alcohol_Intake             301
Physical_Activity_Level      3
Family_History               2
Diabetes                     2
Stress_Level                 9
Salt_Intake                131
Sleep_Duration              61
Heart_Rate                  50
LDL                        120
HDL                         70
Triglycerides              200
Glucose                    130
Gender                       2
Education_Level              3
Employment_Status            3
Hypertension                 2
dtype: int64

In [11]:
# Per-column count of missing entries (NaN / None).
# isna() flags each missing cell as True; summing the booleans column-wise
# yields the number of missing values in that column.
missing_values = df.isna().sum()
missing_values

Country                    0
Age                        0
BMI                        0
Cholesterol                0
Systolic_BP                0
Diastolic_BP               0
Smoking_Status             0
Alcohol_Intake             0
Physical_Activity_Level    0
Family_History             0
Diabetes                   0
Stress_Level               0
Salt_Intake                0
Sleep_Duration             0
Heart_Rate                 0
LDL                        0
HDL                        0
Triglycerides              0
Glucose                    0
Gender                     0
Education_Level            0
Employment_Status          0
Hypertension               0
dtype: int64

In [12]:
# Count rows that are exact repeats of an earlier row (all columns equal).
# duplicated() marks every repeat after the first occurrence as True.
duplicate_mask = df.duplicated()
duplicated_values = duplicate_mask.sum()
duplicated_values

0

In [13]:
# Class balance of the target: share of rows carrying each Hypertension label.
# normalize=True returns relative frequencies (summing to 1) instead of counts.
print("Proportion of Hypertension Values")
hypertension_labels = df["Hypertension"]
hypertension_labels.value_counts(normalize=True)

Proportion of Hypertension Values


Hypertension
High    0.718823
Low     0.281177
Name: proportion, dtype: float64