In [1]:
import pandas as pd
import numpy as np

In [2]:
# import csv
heart = pd.read_csv('heart_data.csv')
sleep = pd.read_csv('sleep_data.csv')

In [3]:
# show heart
heart.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# Data Exploration for heart
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


### Dataset Description

- Age: Age of participant (integer)
- Gender: Gender of participant (male: 1/female: 2).
- Height: Height measured in centimeters (integer)
- Weight: Weight measured in kilograms (integer)
- Ap_hi: Systolic blood pressure reading taken from patient (integer)
- Ap_lo: Diastolic blood pressure reading taken from patient (integer)
- Cholesterol: Total cholesterol level read as mg/dl on a scale 0 - 5+ units( integer). Each unit denoting increase/decrease by 20 mg/dL respectively.
- Gluc: Glucose level read as mmol/l on a scale 0 - 16+ units( integer). Each unit denoting increase Decreaseby 1 mmol/L respectively.
- Smoke: Whether person smokes or not(binary; 0= No , 1=Yes).
- Alco: Whether person drinks alcohol or not(binary; 0 =No ,1 =Yes ).
- Active: whether person physically active or not( Binary ;0 =No,1 = Yes ).
- Cardio: whether person suffers from cardiovascular diseases or not(Binary ;0 – no , 1 ­‑yes ).

In [5]:
heart.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [6]:
heart.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [7]:
# Notice age is counted for days 
# Convert the age column from days to years, considering leap years
heart['age_years'] = heart['age'] / 365.25
heart[['age', 'age_years']].head()

Unnamed: 0,age,age_years
0,18393,50.35729
1,20228,55.381246
2,18857,51.627652
3,17623,48.249144
4,17474,47.841205


In [8]:
# Replace the original age column with the converted age_years values rounded to the nearest whole number
heart['age'] = heart['age_years'].round().astype(int)

# Drop the age_years column since we no longer need it
heart.drop(columns='age_years', inplace=True)

In [9]:
# Looking at ap_hi and ap_low there is some negative value probably by mistake in the input
# use abs to make it positive and check it again
heart['ap_hi'] = heart['ap_hi'].abs()
heart['ap_lo'] = heart['ap_lo'].abs()

In [10]:
# need to check ap_hi
heart['ap_hi'].sort_values(ascending=False).reset_index(drop=True).head(50)

0     16020
1     14020
2     14020
3     14020
4     14020
5     13010
6     13010
7     11500
8     11020
9      2000
10     1620
11     1500
12     1420
13     1420
14     1409
15     1400
16     1400
17     1400
18     1300
19     1300
20     1205
21     1202
22     1130
23     1110
24      960
25      909
26      907
27      907
28      907
29      906
30      906
31      906
32      906
33      906
34      906
35      902
36      806
37      701
38      401
39      309
40      240
41      240
42      240
43      240
44      240
45      240
46      240
47      230
48      220
49      220
Name: ap_hi, dtype: int64

In [11]:
heart['ap_hi'].sort_values().reset_index(drop=True).head(50)

0      1
1      1
2      7
3     10
4     10
5     10
6     10
7     10
8     10
9     10
10    11
11    11
12    11
13    11
14    11
15    11
16    11
17    11
18    11
19    11
20    11
21    11
22    11
23    11
24    11
25    11
26    11
27    11
28    11
29    11
30    11
31    11
32    11
33    11
34    11
35    11
36    11
37    11
38    12
39    12
40    12
41    12
42    12
43    12
44    12
45    12
46    12
47    12
48    12
49    12
Name: ap_hi, dtype: int64

In [12]:
# something is not right with data
# need to drop all columns that is not in range for systolic blood pressure
# assume that lowest can be 50 and highest can be 200
heart = heart[(heart['ap_hi'] >= 50) & (heart['ap_hi'] <= 200)]

In [13]:
# samething with ap_lo
# need to drap all the columns that is not in range for diastolic blood pressure
# assume that lowest can be 30 and highest can be 125
heart = heart[(heart['ap_lo'] >= 30) & (heart['ap_lo'] <= 125)]

heart.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0


In [14]:
# remove all data which ap_lo has higher than ap_hi
heart = heart[heart['ap_lo'] <= heart['ap_hi']]

In [15]:
heart.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0
mean,49971.031805,53.289158,1.348468,164.359167,74.103045,126.578025,81.250926,1.364181,1.225447,0.087864,0.053304,0.803297,0.494315
std,28846.508255,6.764072,0.476489,8.183,14.31726,16.460556,9.329462,0.678551,0.571357,0.283099,0.224641,0.397509,0.499971
min,0.0,30.0,1.0,55.0,11.0,60.0,30.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,24997.25,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50005.5,54.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74867.75,58.0,2.0,170.0,82.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,65.0,2.0,250.0,200.0,200.0,125.0,3.0,3.0,1.0,1.0,1.0,1.0


In [16]:
# Create new column "ap" that is difference between ap_hi and ap_low
heart['ap'] = heart['ap_hi'] - heart['ap_lo']

In [17]:
# Display the first few rows of the dataframe to confirm the changes
heart.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,ap
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,30
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,50
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1,60
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,50
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0,40


In [18]:
# Calculate BMI with given information
heart['bmi'] = heart['weight'] / ((heart['height'] / 100) ** 2)

heart.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,ap,bmi
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,30,21.96712
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,50,34.927679
2,2,52,1,165,64.0,130,70,3,1,0,0,0,1,60,23.507805
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,50,28.710479
4,4,48,1,156,56.0,100,60,1,1,0,0,0,0,40,23.011177


In [19]:
heart.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,ap,bmi
count,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0,68606.0
mean,49971.031805,53.289158,1.348468,164.359167,74.103045,126.578025,81.250926,1.364181,1.225447,0.087864,0.053304,0.803297,0.494315,45.3271,27.516503
std,28846.508255,6.764072,0.476489,8.183,14.31726,16.460556,9.329462,0.678551,0.571357,0.283099,0.224641,0.397509,0.499971,11.57344,6.04759
min,0.0,30.0,1.0,55.0,11.0,60.0,30.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.471784
25%,24997.25,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,40.0,23.875115
50%,50005.5,54.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,40.0,26.346494
75%,74867.75,58.0,2.0,170.0,82.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,50.0,30.119376
max,99999.0,65.0,2.0,250.0,200.0,200.0,125.0,3.0,3.0,1.0,1.0,1.0,1.0,135.0,298.666667


In [20]:
# Since bmi like 298 is too irregular need to drop columns that has irregular data
# usually bmi above 45 is highly unlikely so drop anything that is above 45
heart = heart[(heart['bmi'] <= 45)]

heart.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,ap,bmi
count,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0
mean,49975.611003,53.285945,1.350046,164.471941,73.71811,126.489946,81.208205,1.362142,1.223521,0.088132,0.053297,0.803422,0.49293,45.281741,27.276698
std,28846.843008,6.76488,0.476987,7.838807,13.552824,16.401575,9.303021,0.676979,0.569252,0.283489,0.224627,0.397414,0.499954,11.537754,4.856586
min,0.0,30.0,1.0,120.0,11.0,60.0,30.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.471784
25%,25000.5,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,40.0,23.875115
50%,50016.5,54.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,40.0,26.291724
75%,74869.75,58.0,2.0,170.0,82.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,50.0,30.083829
max,99999.0,65.0,2.0,250.0,149.0,200.0,125.0,3.0,3.0,1.0,1.0,1.0,1.0,135.0,44.997166


In [21]:
# to make it simliar to sleep data will catagorize into three different BMI levels
# Define a function to categorize BMI levels
def categorize_bmi(bmi):
    if bmi <= 24.9:
        return 1  # Underweight to Normal
    elif bmi <= 29.9:
        return 2  # Overweight
    else:
        return 3  # Obesity

In [22]:
heart['bmi_level'] = heart['bmi'].apply(categorize_bmi)


In [23]:
heart.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,ap,bmi,bmi_level
count,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0,68034.0
mean,49975.611003,53.285945,1.350046,164.471941,73.71811,126.489946,81.208205,1.362142,1.223521,0.088132,0.053297,0.803422,0.49293,45.281741,27.276698,1.890334
std,28846.843008,6.76488,0.476987,7.838807,13.552824,16.401575,9.303021,0.676979,0.569252,0.283489,0.224627,0.397414,0.499954,11.537754,4.856586,0.783774
min,0.0,30.0,1.0,120.0,11.0,60.0,30.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.471784,1.0
25%,25000.5,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,40.0,23.875115,1.0
50%,50016.5,54.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,40.0,26.291724,2.0
75%,74869.75,58.0,2.0,170.0,82.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,50.0,30.083829,3.0
max,99999.0,65.0,2.0,250.0,149.0,200.0,125.0,3.0,3.0,1.0,1.0,1.0,1.0,135.0,44.997166,3.0


In [24]:
# Sleep Description
sleep.head()

Unnamed: 0,Person ID,Sleep Duration,Quality of Sleep,Age,Physical Activity Level,Stress Level,Blood Pressure 1,Blood Pressure 2,Heart Rate,Daily Steps,BMI Levels,Occupation,Gender
0,1,10,8.0,41,1,8,126,82,95,7000,2,Software Engineer,Female
1,2,8,7.0,44,1,6,138,74,66,9000,2,Accountant,Female
2,3,8,7.0,31,3,5,123,68,76,4000,3,Salesperson,Female
3,4,10,8.0,36,2,6,131,76,74,6000,2,Sales Representative,Female
4,5,10,8.0,37,3,3,103,67,89,7000,2,Lawyer,Female


In [25]:
# Data Exploration for sleep
sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                1000 non-null   int64  
 1   Sleep Duration           1000 non-null   int64  
 2   Quality of Sleep         1000 non-null   float64
 3   Age                      1000 non-null   int64  
 4   Physical Activity Level  1000 non-null   int64  
 5   Stress Level             1000 non-null   int64  
 6   Blood Pressure 1         1000 non-null   int64  
 7   Blood Pressure 2         1000 non-null   int64  
 8   Heart Rate               1000 non-null   int64  
 9   Daily Steps              1000 non-null   int64  
 10  BMI Levels               1000 non-null   int64  
 11  Occupation               1000 non-null   object 
 12  Gender                   1000 non-null   object 
dtypes: float64(1), int64(10), object(2)
memory usage: 101.7+ KB


### Dataset Description


- Sleep Duration: Participant's sleep duration per hour (integer)
- Quality of Sleep: Participan's subjective rating of quality out of 1-10 (float)
- Age: Age of Participant (integer)
- Physical Activity Level: Participant's Physical Activity subjective rating out of 1-7 (integer)
- Stress level: Participant's stress level Participant rating out of 1-10 (integer)
- Blood Pressure 1: Systolic blood pressure reading taken from participant (integer)
- Blood Pressure 2: Diastolic blood pressure reading taken from participant (integer)
- Heart Rate: Average Heart Rate taken from participant (integer)
- Daily Steps: Average Daily Steps recorded from participant (integer)
- BMI Levels: BMI class
    - Class 1 - BMI from 0 to 24.9, covering underweight to normal weight
    - Class 2 - BMI from 25 to 29.9, indicating overweight.
    - Class 3 - BMI 30 or above, representing obesity.
- Occupation: information about participant's occupation (object)
- Gender: Gender of participant (string)

In [26]:
sleep.describe()

Unnamed: 0,Person ID,Sleep Duration,Quality of Sleep,Age,Physical Activity Level,Stress Level,Blood Pressure 1,Blood Pressure 2,Heart Rate,Daily Steps,BMI Levels
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,6.932,6.466,35.134,2.953,4.947,120.273,79.849,75.14,7933.0,2.019
std,288.819436,1.506534,0.753267,9.676036,1.002894,2.05095,9.974767,7.871478,10.096353,1984.048247,0.831463
min,1.0,2.0,4.0,5.0,0.0,-2.0,90.0,58.0,44.0,2000.0,1.0
25%,250.75,6.0,6.0,28.0,2.0,4.0,114.0,75.0,68.0,7000.0,1.0
50%,500.5,7.0,6.5,35.0,3.0,5.0,120.0,80.0,75.0,8000.0,2.0
75%,750.25,8.0,7.0,41.0,4.0,6.0,127.0,85.0,82.0,9000.0,3.0
max,1000.0,11.0,8.5,67.0,6.0,13.0,150.0,103.0,105.0,15000.0,3.0


In [27]:
sleep.isnull().sum()

Person ID                  0
Sleep Duration             0
Quality of Sleep           0
Age                        0
Physical Activity Level    0
Stress Level               0
Blood Pressure 1           0
Blood Pressure 2           0
Heart Rate                 0
Daily Steps                0
BMI Levels                 0
Occupation                 0
Gender                     0
dtype: int64

In [28]:
# Adjusting Stress level column as there is weird data
sleep['Stress Level'] = sleep['Stress Level'].clip(lower=0, upper=10)

sleep.describe()

Unnamed: 0,Person ID,Sleep Duration,Quality of Sleep,Age,Physical Activity Level,Stress Level,Blood Pressure 1,Blood Pressure 2,Heart Rate,Daily Steps,BMI Levels
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,6.932,6.466,35.134,2.953,4.944,120.273,79.849,75.14,7933.0,2.019
std,288.819436,1.506534,0.753267,9.676036,1.002894,2.006711,9.974767,7.871478,10.096353,1984.048247,0.831463
min,1.0,2.0,4.0,5.0,0.0,0.0,90.0,58.0,44.0,2000.0,1.0
25%,250.75,6.0,6.0,28.0,2.0,4.0,114.0,75.0,68.0,7000.0,1.0
50%,500.5,7.0,6.5,35.0,3.0,5.0,120.0,80.0,75.0,8000.0,2.0
75%,750.25,8.0,7.0,41.0,4.0,6.0,127.0,85.0,82.0,9000.0,3.0
max,1000.0,11.0,8.5,67.0,6.0,10.0,150.0,103.0,105.0,15000.0,3.0


In [29]:
# Change gender same as sleep data 
# 1 as Male 2 as Female
sleep['Gender'] = sleep['Gender'].map({'Male': 1, 'Female': 2})

sleep.head()

Unnamed: 0,Person ID,Sleep Duration,Quality of Sleep,Age,Physical Activity Level,Stress Level,Blood Pressure 1,Blood Pressure 2,Heart Rate,Daily Steps,BMI Levels,Occupation,Gender
0,1,10,8.0,41,1,8,126,82,95,7000,2,Software Engineer,2
1,2,8,7.0,44,1,6,138,74,66,9000,2,Accountant,2
2,3,8,7.0,31,3,5,123,68,76,4000,3,Salesperson,2
3,4,10,8.0,36,2,6,131,76,74,6000,2,Sales Representative,2
4,5,10,8.0,37,3,3,103,67,89,7000,2,Lawyer,2


In [30]:
# To make it consistent with sleep data rename
sleep.rename(columns={'Blood Pressure 1': 'ap_hi', 'Blood Pressure 2': 'ap_lo'}, inplace=True)
sleep['ap'] = sleep['ap_hi'] - sleep['ap_lo']

sleep.head()

Unnamed: 0,Person ID,Sleep Duration,Quality of Sleep,Age,Physical Activity Level,Stress Level,ap_hi,ap_lo,Heart Rate,Daily Steps,BMI Levels,Occupation,Gender,ap
0,1,10,8.0,41,1,8,126,82,95,7000,2,Software Engineer,2,44
1,2,8,7.0,44,1,6,138,74,66,9000,2,Accountant,2,64
2,3,8,7.0,31,3,5,123,68,76,4000,3,Salesperson,2,55
3,4,10,8.0,36,2,6,131,76,74,6000,2,Sales Representative,2,55
4,5,10,8.0,37,3,3,103,67,89,7000,2,Lawyer,2,36


In [31]:
# to compare "active" from the heart dataset create new column name "active" from sleep
# There is two indicator from sleep which can be formulate as "active" which is daily steps and physical activity level
# the average of physical activity level is 2.95 and average of daily steps is 7933
# to make it roughly 0 or 1 active we are going to assume 
# Physical Activity Level great than or equal to 4 OR Daily Steps greater than or equal to 9000 as active (1) and rest are not (0)

sleep['active'] = ((sleep['Physical Activity Level'] >= 4) | (sleep['Daily Steps'] >= 9000)).astype(int)


In [32]:
sleep.describe()

Unnamed: 0,Person ID,Sleep Duration,Quality of Sleep,Age,Physical Activity Level,Stress Level,ap_hi,ap_lo,Heart Rate,Daily Steps,BMI Levels,Gender,ap,active
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,6.932,6.466,35.134,2.953,4.944,120.273,79.849,75.14,7933.0,2.019,1.5,40.424,0.553
std,288.819436,1.506534,0.753267,9.676036,1.002894,2.006711,9.974767,7.871478,10.096353,1984.048247,0.831463,0.50025,12.925219,0.497432
min,1.0,2.0,4.0,5.0,0.0,0.0,90.0,58.0,44.0,2000.0,1.0,1.0,3.0,0.0
25%,250.75,6.0,6.0,28.0,2.0,4.0,114.0,75.0,68.0,7000.0,1.0,1.0,32.0,0.0
50%,500.5,7.0,6.5,35.0,3.0,5.0,120.0,80.0,75.0,8000.0,2.0,1.5,40.5,1.0
75%,750.25,8.0,7.0,41.0,4.0,6.0,127.0,85.0,82.0,9000.0,3.0,2.0,49.25,1.0
max,1000.0,11.0,8.5,67.0,6.0,10.0,150.0,103.0,105.0,15000.0,3.0,2.0,77.0,1.0


In [33]:
sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                1000 non-null   int64  
 1   Sleep Duration           1000 non-null   int64  
 2   Quality of Sleep         1000 non-null   float64
 3   Age                      1000 non-null   int64  
 4   Physical Activity Level  1000 non-null   int64  
 5   Stress Level             1000 non-null   int64  
 6   ap_hi                    1000 non-null   int64  
 7   ap_lo                    1000 non-null   int64  
 8   Heart Rate               1000 non-null   int64  
 9   Daily Steps              1000 non-null   int64  
 10  BMI Levels               1000 non-null   int64  
 11  Occupation               1000 non-null   object 
 12  Gender                   1000 non-null   int64  
 13  ap                       1000 non-null   int64  
 14  active                   

In [34]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68034 entries, 0 to 69999
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           68034 non-null  int64  
 1   age          68034 non-null  int64  
 2   gender       68034 non-null  int64  
 3   height       68034 non-null  int64  
 4   weight       68034 non-null  float64
 5   ap_hi        68034 non-null  int64  
 6   ap_lo        68034 non-null  int64  
 7   cholesterol  68034 non-null  int64  
 8   gluc         68034 non-null  int64  
 9   smoke        68034 non-null  int64  
 10  alco         68034 non-null  int64  
 11  active       68034 non-null  int64  
 12  cardio       68034 non-null  int64  
 13  ap           68034 non-null  int64  
 14  bmi          68034 non-null  float64
 15  bmi_level    68034 non-null  int64  
dtypes: float64(2), int64(14)
memory usage: 8.8 MB


In [35]:
# Reordering to compare better
new_order = [
    'id', 'age', 'gender', 'ap_hi', 'ap_lo', 'ap', 'active', 
    'bmi_level', 'cardio', 'height', 'weight', 'bmi', 
    'cholesterol', 'gluc', 'smoke', 'alco'
]

heart = heart[new_order]

In [36]:
# check
heart.head()

Unnamed: 0,id,age,gender,ap_hi,ap_lo,ap,active,bmi_level,cardio,height,weight,bmi,cholesterol,gluc,smoke,alco
0,0,50,2,110,80,30,1,1,0,168,62.0,21.96712,1,1,0,0
1,1,55,1,140,90,50,1,3,1,156,85.0,34.927679,3,1,0,0
2,2,52,1,130,70,60,0,1,1,165,64.0,23.507805,3,1,0,0
3,3,48,2,150,100,50,1,2,1,169,82.0,28.710479,1,1,0,0
4,4,48,1,100,60,40,0,1,0,156,56.0,23.011177,1,1,0,0


In [37]:
# Define the mapping of old column names to new column names
rename_mapping = {
    'Person ID': 'id',
    'Sleep Duration': 'sleep_duration',
    'Quality of Sleep': 'quality_sleep',
    'Age': 'age',
    'Physical Activity Level': 'p_active_level',
    'Stress Level': 'stress_level',
    'ap_hi': 'ap_hi',
    'ap_lo': 'ap_lo', 
    'Heart Rate': 'heart_rate',
    'Daily Steps': 'steps',
    'BMI Levels': 'bmi_level',
    'Occupation': 'occupation',
    'Gender': 'gender',
    'ap': 'ap',
    'active': 'active'
}

In [38]:
# Rename the columns
sleep.rename(columns=rename_mapping, inplace=True)

In [39]:
sleep.head()

Unnamed: 0,id,sleep_duration,quality_sleep,age,p_active_level,stress_level,ap_hi,ap_lo,heart_rate,steps,bmi_level,occupation,gender,ap,active
0,1,10,8.0,41,1,8,126,82,95,7000,2,Software Engineer,2,44,0
1,2,8,7.0,44,1,6,138,74,66,9000,2,Accountant,2,64,1
2,3,8,7.0,31,3,5,123,68,76,4000,3,Salesperson,2,55,0
3,4,10,8.0,36,2,6,131,76,74,6000,2,Sales Representative,2,55,0
4,5,10,8.0,37,3,3,103,67,89,7000,2,Lawyer,2,36,0


In [40]:
# reordering to compare it better
new_order1 = [
    'id', 'age', 'gender', 'ap_hi', 'ap_lo', 'ap', 'active', 
    'bmi_level', 'sleep_duration', 'quality_sleep', 'p_active_level', 'stress_level', 
    'heart_rate', 'steps', 'occupation'
]

In [41]:
sleep = sleep[new_order1]

In [42]:
# check
sleep.head()

Unnamed: 0,id,age,gender,ap_hi,ap_lo,ap,active,bmi_level,sleep_duration,quality_sleep,p_active_level,stress_level,heart_rate,steps,occupation
0,1,41,2,126,82,44,0,2,10,8.0,1,8,95,7000,Software Engineer
1,2,44,2,138,74,64,1,2,8,7.0,1,6,66,9000,Accountant
2,3,31,2,123,68,55,0,3,8,7.0,3,5,76,4000,Salesperson
3,4,36,2,131,76,55,0,2,10,8.0,2,6,74,6000,Sales Representative
4,5,37,2,103,67,36,0,2,10,8.0,3,3,89,7000,Lawyer


In [43]:
heart.head()

Unnamed: 0,id,age,gender,ap_hi,ap_lo,ap,active,bmi_level,cardio,height,weight,bmi,cholesterol,gluc,smoke,alco
0,0,50,2,110,80,30,1,1,0,168,62.0,21.96712,1,1,0,0
1,1,55,1,140,90,50,1,3,1,156,85.0,34.927679,3,1,0,0
2,2,52,1,130,70,60,0,1,1,165,64.0,23.507805,3,1,0,0
3,3,48,2,150,100,50,1,2,1,169,82.0,28.710479,1,1,0,0
4,4,48,1,100,60,40,0,1,0,156,56.0,23.011177,1,1,0,0


In [44]:
# final check of data
heart.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68034 entries, 0 to 69999
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           68034 non-null  int64  
 1   age          68034 non-null  int64  
 2   gender       68034 non-null  int64  
 3   ap_hi        68034 non-null  int64  
 4   ap_lo        68034 non-null  int64  
 5   ap           68034 non-null  int64  
 6   active       68034 non-null  int64  
 7   bmi_level    68034 non-null  int64  
 8   cardio       68034 non-null  int64  
 9   height       68034 non-null  int64  
 10  weight       68034 non-null  float64
 11  bmi          68034 non-null  float64
 12  cholesterol  68034 non-null  int64  
 13  gluc         68034 non-null  int64  
 14  smoke        68034 non-null  int64  
 15  alco         68034 non-null  int64  
dtypes: float64(2), int64(14)
memory usage: 8.8 MB


In [45]:
sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1000 non-null   int64  
 1   age             1000 non-null   int64  
 2   gender          1000 non-null   int64  
 3   ap_hi           1000 non-null   int64  
 4   ap_lo           1000 non-null   int64  
 5   ap              1000 non-null   int64  
 6   active          1000 non-null   int64  
 7   bmi_level       1000 non-null   int64  
 8   sleep_duration  1000 non-null   int64  
 9   quality_sleep   1000 non-null   float64
 10  p_active_level  1000 non-null   int64  
 11  stress_level    1000 non-null   int64  
 12  heart_rate      1000 non-null   int64  
 13  steps           1000 non-null   int64  
 14  occupation      1000 non-null   object 
dtypes: float64(1), int64(13), object(1)
memory usage: 117.3+ KB


In [46]:
# export
heart.to_csv('heart_data_clean.csv', index=False)

In [47]:
sleep.to_csv('sleep_data_clean.csv', index=False)