Imports and Configuration

In [4]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Uncomment the next line to show every column when displaying wide DataFrames:
# pd.set_option('display.max_columns', None)

# Plot styling: one named constant so the theme is easy to find and change.
SEABORN_STYLE = "whitegrid"
sns.set_theme(style=SEABORN_STYLE)

print("Libraries imported successfully.")

Libraries imported successfully.


Load the Data

In [5]:
# Location of the stroke dataset CSV.
# Set the STROKE_DATA_PATH environment variable to point at your own copy of the
# file; when it is unset (or empty) the original absolute path below is used.
file_path = (
    os.environ.get('STROKE_DATA_PATH')
    or '/home/jul/Proyects/healthcare-dataset-stroke-data.csv'
)

try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
except FileNotFoundError:
    # Report the problem and leave a None sentinel instead of raising, so that
    # later cells can test `df is not None` before touching the data.
    print(f"Error: The file was not found at {file_path}")
    print("Please check the file path and ensure the file is uploaded correctly.")
    df = None # Set df to None if file not found

Dataset loaded successfully!


Basic Inspection (if df is not None)

In [6]:
if df is not None:
    # Each step is (heading, zero-argument callable). The callable is invoked only
    # after its heading has been printed, so headings and tables stay interleaved.
    # display() is used rather than print() to get the rich notebook rendering.
    preview_steps = (
        ("First 5 rows of the dataset:", df.head),
        ("\nLast 5 rows of the dataset:", df.tail),
    )
    for step_title, step_fn in preview_steps:
        print(step_title)
        display(step_fn())

    print(f"\nShape of the dataset (rows, columns): {df.shape}")

    # DataFrame.info() writes its report to stdout itself, so it is not displayed.
    print("\nBasic information about the dataset:")
    df.info()

    summary_steps = (
        ("\nDescriptive statistics for numerical features:", df.describe),
        # 'category' is included in case columns have been converted to that dtype.
        ("\nDescriptive statistics for categorical features:",
         lambda: df.describe(include=['object', 'category'])),
        ("\nNumber of unique values in each column:", df.nunique),
    )
    for step_title, step_fn in summary_steps:
        print(step_title)
        display(step_fn())

First 5 rows of the dataset:


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1



Last 5 rows of the dataset:


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0



Shape of the dataset (rows, columns): (5110, 12)

Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB

Descriptive statistics for numerical features:


Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0



Descriptive statistics for categorical features:


Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,5110,5110,5110,5110,5110
unique,3,2,5,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2994,3353,2925,2596,1892



Number of unique values in each column:


id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

Basic Cleaning (if df is not None):
Identify Missing Values:

In [7]:
if df is not None:
    print("\nMissing values per column:")

    # Null count per column, then the same counts as a share of all rows.
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100

    missing_df = pd.DataFrame(
        {
            'Count': missing_values,
            'Percentage': missing_percentage,
        }
    )

    # Show only columns that actually have gaps, worst offenders first.
    has_missing = missing_df['Count'] > 0
    display(missing_df.loc[has_missing].sort_values(by='Percentage', ascending=False))


Missing values per column:


Unnamed: 0,Count,Percentage
bmi,201,3.933464


Handling Irrelevant Columns (Example):
Sometimes there are columns that don't add value to the analysis (e.g., unique IDs for each row if they are not used for joins). Decide if any can be removed.

In [8]:
# Build the cleaned frame from a copy so the loaded `df` is never modified and
# this cell can be re-run safely. The work is guarded because the loading cell
# sets `df = None` when the CSV cannot be found.
if df is not None:
    df_cleaned = df.copy()

    # 'id' is a per-row identifier rather than a feature; drop it if present.
    if 'id' in df_cleaned.columns:
        df_cleaned = df_cleaned.drop(columns=['id'])
        print("\nDropped 'id' column.")

    # Check the cleaned dataset (display() renders the table without the
    # line-wrapping that print() applies to wide DataFrames).
    display(df_cleaned.head())
else:
    # Mirror the `df = None` convention so later cells can test for it.
    df_cleaned = None
    print("Dataset was not loaded; skipping the cleaning step.")


Dropped 'id' column.
   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   
3  Female  49.0             0              0          Yes        Private   
4  Female  79.0             1              0          Yes  Self-employed   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  
0          Urban             228.69  36.6  formerly smoked       1  
1          Rural             202.21   NaN     never smoked       1  
2          Rural             105.92  32.5     never smoked       1  
3          Urban             171.23  34.4           smokes       1  
4          Rural             174.12  24.0     never smoked       1  


Check Remaining Columns
After removing irrelevant columns, you can inspect the cleaned dataset again:

In [9]:
# List the column labels that are left in the cleaned frame
remaining_columns = df_cleaned.columns
print(remaining_columns)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')
