In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Notebook-wide presentation defaults: limit how many columns pandas prints
# and apply seaborn's "whitegrid" style to the figures drawn below.
pd.set_option("display.max_columns", 20)
sns.set(style="whitegrid")

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
dataset_path = 'cardiac arrest dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

In [None]:
# Print a concise summary of the frame (columns, non-null counts and dtypes).
df.info()

In [None]:
# Data-quality check: number of fully duplicated rows and number of missing
# entries in each column.
duplicated_rows = df.duplicated().sum()
missing_values = df.isna().sum()

print("Missing values by column:\n", missing_values)
print("Duplicate lines:", duplicated_rows)

In [None]:
# Summary statistics, transposed so that each variable occupies one row.
df.describe().transpose()

In [None]:
# Class balance of the target column.
# NOTE: this runs before the cell that calls drop_duplicates(), so the counts
# shown here still include any duplicated rows.
#
# seaborn >= 0.13 deprecates passing `palette` without `hue`, so the x variable
# is also assigned to `hue`. `dodge=False` keeps one full-width bar per class
# on older seaborn versions, where hue would otherwise split the bars.
target_ax = sns.countplot(data=df, x='target', hue='target', palette='Set2', dodge=False)

# The hue only colours the bars; a legend would repeat the x-axis labels.
# Newer seaborn may not draw one at all, hence the None guard.
target_legend = target_ax.get_legend()
if target_legend is not None:
    target_legend.remove()

plt.title('Target Variable Distribution (Cardiovascular Disease)')
plt.xlabel('0 = No Illness | 1 = With illness')
plt.ylabel('Number of Patients')
plt.show()

In [None]:
# Discard rows that exactly repeat an earlier row, then report how many remain.
df = df.drop_duplicates()
remaining_rows = len(df)

print(f"Total rows after removing duplicates: {remaining_rows}")

In [None]:
# Contingency table of sex (rows) against target (columns), drawn as stacked
# bars so both the group sizes and the class split are visible.
sexo_vs_doenca = pd.crosstab(index=df['sex'], columns=df['target'])

sexo_vs_doenca.plot.bar(stacked=True, colormap='Set3')
plt.xlabel('Sex (0 = female, 1 = male)')
plt.ylabel('Number of Patients')
plt.title('Distribution of Cardiovascular Disease by Sex')
# Legend entries follow the crosstab column order (target values).
plt.legend(['No Illness', 'With illness'])
plt.tight_layout()
plt.show()

In [None]:
# Columns treated as continuous throughout the rest of the notebook.
variaveis_continuas = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# One histogram per continuous column on a shared figure.
continuous_frame = df[variaveis_continuas]
continuous_frame.hist(
    bins=15,
    figsize=(15, 10),
    color='steelblue',
    edgecolor='black',
)
plt.suptitle('Distribution of Continuous Variables')
# Reserve the top 4% of the figure so the suptitle does not overlap the panels.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# One box plot per continuous variable, split by target class, laid out on a
# 2x3 grid (five panels are used; the sixth slot stays empty).
plt.figure(figsize=(15, 10))
for i, var in enumerate(variaveis_continuas, 1):
    ax = plt.subplot(2, 3, i)
    # seaborn >= 0.13 deprecates `palette` without `hue`, so x is also passed
    # as hue. `dodge=False` keeps each box centred on its class on older
    # seaborn versions, where hue would otherwise offset the boxes.
    sns.boxplot(data=df, x='target', y=var, hue='target', palette='Set2', dodge=False, ax=ax)
    # The hue only colours the boxes; drop the redundant legend if one was drawn.
    box_legend = ax.get_legend()
    if box_legend is not None:
        box_legend.remove()
    ax.set_title(f'{var} vs Cardiovascular Disease')
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlation between the numeric columns, shown as an annotated heatmap.
plt.figure(figsize=(12, 8))
# Restrict to numeric columns: on pandas >= 2.0, DataFrame.corr() raises when
# the frame contains a non-numeric column. That happens if this cell is re-run
# after the categorical 'age_range' column has been added to df. For an
# all-numeric frame the result is unchanged.
correlacao = df.select_dtypes(include='number').corr()
sns.heatmap(correlacao, annot=True, fmt=".2f", cmap="coolwarm", square=True)
plt.title("Correlation Matrix Between Variables")
plt.tight_layout()
plt.show()

In [None]:
# Columns treated as categorical for the per-class count plots.
variaveis_cat = ['cp', 'restecg', 'slope', 'ca', 'thal']

# Count of each category level, coloured by target class, one panel per column
# on a 2x3 grid.
plt.figure(figsize=(15, 10))
for position, column in enumerate(variaveis_cat, start=1):
    plt.subplot(2, 3, position)
    sns.countplot(data=df, x=column, hue='target', palette='Set2')
    plt.title(f'{column} vs target')
plt.tight_layout()
plt.show()

In [None]:
# Count, per continuous column, the observations whose absolute z-score
# exceeds 3.
continuous_values = df[variaveis_continuas]
z_scores = np.abs(zscore(continuous_values))
beyond_three_sigma = z_scores > 3
outliers = beyond_three_sigma.sum(axis=0)
outliers

In [None]:
# Bucket age into labelled ranges and compare class counts per bucket.
# pd.cut uses right-closed intervals by default, which would leave the lowest
# edge (age 25) outside the '25-40' bucket as NaN; include_lowest=True closes
# the first interval on the left so that the label matches the data.
# Ages below 25 or above 80 still fall outside every bin and become NaN.
df['age_range'] = pd.cut(
    df['age'],
    bins=[25, 40, 50, 60, 70, 80],
    labels=['25-40', '41-50', '51-60', '61-70', '71-80'],
    include_lowest=True,
)

plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='age_range', hue='target', palette='Set2')
plt.title('Disease Distribution by Age Group')
plt.xlabel('Age Range (years)')
plt.ylabel('Number of Patients')
plt.tight_layout()
plt.show()
