In [None]:
import pandas as pd


In [None]:
# Absolute path of the heart-failure clinical records CSV to analyse.
file_path = '/content/heart_failure_clinical_records_dataset.csv'
# Parse the CSV into `df`, the working DataFrame used by the following cells.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first rows of the freshly loaded DataFrame.
print("Initial DataFrame:")
print(df.head())

# Example row filter: keep only the records whose 'age' is strictly greater
# than 30. The result is a new frame; `df` itself is left untouched.
filtered_df = df.loc[df['age'].gt(30)]

# Heading and preview are emitted by one call; sep="\n" puts each on its own line.
print("\nFiltered DataFrame (age > 30):", filtered_df.head(), sep="\n")


Initial DataFrame:
    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT  
0        0     4            1  
1        0     6     

In [None]:
# Handle missing values.
# Three alternative strategies are demonstrated below. Options 1 and 2 build
# new DataFrames and leave `df` unchanged; Option 3 updates the 'age' column
# of `df` itself.

# Option 1: Drop rows with any missing values
df_dropped = df.dropna()

print("\nDataFrame after dropping rows with any missing values:")
print(df_dropped.head())

# Option 2: Fill missing values with a specific value (example: fill with 0)
df_filled = df.fillna(0)

print("\nDataFrame after filling missing values with 0:")
print(df_filled.head())

# Option 3: Fill missing values with the mean of the column (example: for 'age' column)
# The filled Series is assigned back to the column instead of calling
# `df['age'].fillna(..., inplace=True)`. That chained in-place form operates on
# an intermediate object: pandas 2.x warns about it, and with Copy-on-Write
# enabled it leaves `df` unmodified, so the fill would be silently lost.
df['age'] = df['age'].fillna(df['age'].mean())

print("\nDataFrame after filling missing values in 'age' column with mean:")
print(df.head())



DataFrame after dropping rows with any missing values:
    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT  
0        0     4

In [None]:
# Compute descriptive statistics for `df` using describe() with its defaults.
summary_stats = df.describe()

# Heading and table are emitted by one call; sep="\n" puts each on its own line.
print("\nSummary Statistics:", summary_stats, sep="\n")


Summary Statistics:
              age     anaemia  creatinine_phosphokinase    diabetes  \
count  299.000000  299.000000                299.000000  299.000000   
mean    60.833893    0.431438                581.839465    0.418060   
std     11.894809    0.496107                970.287881    0.494067   
min     40.000000    0.000000                 23.000000    0.000000   
25%     51.000000    0.000000                116.500000    0.000000   
50%     60.000000    0.000000                250.000000    0.000000   
75%     70.000000    1.000000                582.000000    1.000000   
max     95.000000    1.000000               7861.000000    1.000000   

       ejection_fraction  high_blood_pressure      platelets  \
count         299.000000           299.000000     299.000000   
mean           38.083612             0.351171  263358.029264   
std            11.834841             0.478136   97804.236869   
min            14.000000             0.000000   25100.000000   
25%            30.0

In [None]:
# Additional operations: derive a categorical 'age_group' column from 'age'.
# The four labels map, in order, onto the four intervals delimited by
# consecutive bin edges. pd.cut is called with its default interval closure.
age_bin_edges = [0, 18, 35, 50, 100]
age_bin_labels = ['Child', 'Young Adult', 'Adult', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)

# Heading and preview are emitted by one call; sep="\n" puts each on its own line.
print("\nDataFrame with new 'age_group' column:", df.head(), sep="\n")


DataFrame with new 'age_group' column:
    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT age_group  
0        0     4      