## 1. Compreensão dos Dados

#### [1] Carregar os dados e imprimir as primeiras linhas. Examinar a estrutura básica e identificar o tipo de dados de cada atributo.

In [None]:
import pandas as pd

# Location of the dataset file, relative to the working directory
file_path_csv = './heart_disease_uci.csv'

# Load the CSV into a DataFrame
heart_data = pd.read_csv(filepath_or_buffer=file_path_csv)

# Preview the first five rows (rendered as the cell output)
heart_data.head(5)

#### [2] Verificar a existência de valores em falta e identificar colunas que possam requerer limpeza ou transformação adicional

In [None]:
# Print each column's dtype and non-null count; a count below the total
# number of rows means that column has missing values
heart_data.info()

#### [3] Calcular estatísticas básicas (média, mediana, etc.) e analisar a distribuição de cada atributo.

In [None]:
# Summary statistics for the numeric columns (the 50% row is the median)
heart_data.describe()

## 2. Exploração dos Dados

#### [1] Explore a distribuição da classe. Qual é a variação entre casos positivos (doença) e casos negativos (sem doença)?

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Shared seaborn theme for the figure
sns.set_theme(style="whitegrid")

plt.figure(figsize=(8, 6))
ax = sns.countplot(data=heart_data, x='num', hue='num', palette="viridis")
ax.set_title('Distribution of Target Variable: Heart Disease Presence (num)')
ax.set_xlabel('Heart Disease (0=No, 1+=Yes)')
ax.set_ylabel('Count')

# Print each bar's count just above its top edge, centred horizontally
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{int(bar_height)}', (bar_center, bar_height), ha='center', va='bottom')

plt.show()

#### Conversão da classe em binário

In [None]:
# Collapse the target to binary: 1 when num > 0, otherwise 0.
# NOTE(review): a missing num would compare False and be coded 0 — confirm
# the column has no gaps before relying on this.
heart_data['num'] = (heart_data['num'] > 0).astype(int)

In [None]:
sns.set_theme(style="whitegrid")

plt.figure(figsize=(8, 6))
# 'num' drives both x and hue so each class gets its own palette colour;
# the legend would only repeat the x tick labels, so it is switched off
ax = sns.countplot(data=heart_data, x='num', hue='num', palette="viridis", legend=False)
plt.title('Distribution of Target Variable: Heart Disease Presence (num)')
# The target is binary at this point, so the only codes left are 0 and 1
plt.xlabel('Heart Disease (0=No, 1=Yes)')
plt.ylabel('Count')

# Write the count above each bar, centred horizontally
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}',
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom')

plt.show()

#### [2] Compare diferentes atributos (age, ejection fraction, serum sodium, oldpeak) com a classe.

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (column, panel title, y-axis label) for each box plot, left to right
boxplot_specs = [
    ('age', 'Age vs Heart Disease', 'Age'),
    ('thalch', 'Thalch (Max Heart Rate) vs Heart Disease', 'Thalch'),
    ('oldpeak', 'Oldpeak vs Heart Disease', 'Oldpeak'),
]

# One panel per feature, all sharing the same colours and label styling
for panel, (feature, panel_title, y_label) in zip(axes, boxplot_specs):
    sns.boxplot(data=heart_data, x='num', y=feature, hue='num', ax=panel, palette="viridis")
    panel.set_title(panel_title, color='darkblue')
    panel.set_xlabel('Heart Disease (num)', color='darkgreen')
    panel.set_ylabel(y_label, color='darkred')
    for axis_name in ('x', 'y'):
        panel.tick_params(axis=axis_name, colors='purple')

plt.tight_layout()
plt.show()

#### (a) Use o histplot para comparar thalach e ca com a possibilidade de doença.

In [None]:
# (column, figure title, x-axis label) for each stacked histogram;
# 'thalch' is the max heart rate column, 'ca' the number of major vessels
histogram_specs = [
    ('thalch', "Distribution of Thalach (Max Heart Rate) vs Heart Disease", "Thalach (Max Heart Rate)"),
    ('ca', "Distribution of CA (Major Vessels) vs Heart Disease", "CA (Number of Major Vessels)"),
]

# Each entry gets its own figure, split by the target class
for feature, figure_title, x_label in histogram_specs:
    plt.figure(figsize=(10, 5))
    sns.histplot(data=heart_data, x=feature, hue='num', multiple="stack", kde=True, palette="viridis")
    plt.title(figure_title)
    plt.xlabel(x_label)
    plt.ylabel("Frequency")

plt.show()

#### [3] Identifique padrões ou correlações entre os atributos, utilizando matrizes de correlação. Não utilize os atributos ‘id’ e ‘num’.

In [None]:
# Numeric features only, leaving out the identifier and the target
numeric_columns = heart_data.select_dtypes(include='number')
numeric_columns = numeric_columns.drop(columns=['id', 'num'])
correlation_matrix = numeric_columns.corr()

# Annotated heatmap of the pairwise correlations
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    annot_kws={'color': 'black'},
)
plt.title('Correlation Matrix of Numerical Features', color='darkblue')
plt.yticks(color='purple')
plt.xticks(color='purple')
plt.show()

## 3. Processamento de Dados

#### [1] Tratar os valores em falta identificados na fase de compreensão dos dados.

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Columns missing more than this fraction of their values are discarded
missing_value_threshold = 0.3

# heart_data.columns is read once when the loop starts, so rebinding
# heart_data inside the body does not disturb the iteration
for column in heart_data.columns:
    # Fraction of rows where this column is null
    missing_percentage = heart_data[column].isnull().mean()

    # Too sparse to impute: remove the column and move on
    if missing_percentage > missing_value_threshold:
        heart_data = heart_data.drop(columns=[column])
        print(f"Column '{column}' dropped due to more than 30% missing values.")
        continue

    # float64/int64 columns are filled with their median
    if heart_data[column].dtype in ['float64', 'int64']:
        heart_data[column] = heart_data[column].fillna(heart_data[column].median())
        continue

    # Everything else is filled with its most frequent value, then pandas
    # is asked to re-infer a better dtype for the filled column
    heart_data[column] = heart_data[column].fillna(heart_data[column].mode()[0])
    heart_data[column] = heart_data[column].infer_objects()


In [None]:
# Validate the imputation: every remaining column should now report a
# non-null count equal to the number of rows
heart_data.info()

#### [2] Escalar ou normalizar características numéricas para garantir consistência nos dados para a fase de modelação.

In [None]:
# Standardise every numeric feature column; the target 'num' is left as is.
# Columns are selected by the generic 'number' kind instead of the exact
# 'float64'/'int64' names, and 'num' is dropped with errors='ignore': the
# binarised target comes from astype(int), whose width is platform-dependent
# on older NumPy, and an int32 target previously made .drop('num') raise
# a KeyError because it was never in the selection.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test rows influence the scaling statistics —
# consider fitting on the training rows only.
scaler = StandardScaler()
numerical_features = heart_data.select_dtypes(include='number').columns.drop('num', errors='ignore')
heart_data[numerical_features] = scaler.fit_transform(heart_data[numerical_features])

# Display the preprocessed data (first few rows)
heart_data.head()

#### [3] Codificar variáveis categóricas, se existirem, em valores numéricos.

In [None]:
# Fitted encoder per column, kept so the integer codes can be mapped back later
label_encoders = {}

# Columns still holding object or boolean values are the ones to encode
categorical_columns = heart_data.select_dtypes(include=['object', 'bool']).columns

for column in categorical_columns:
    # Fit a fresh encoder on this column and replace the values by their codes
    le = LabelEncoder()
    encoded_values = le.fit_transform(heart_data[column])
    heart_data[column] = encoded_values
    label_encoders[column] = le

    # Report which original value was assigned to which integer code
    print(f"Coluna: {column}")
    for code, class_name in enumerate(le.classes_):
        print(f"  {class_name} -> {code}")

In [None]:
# Preview the first rows after encoding
heart_data.head()

In [None]:
# Re-check column dtypes and non-null counts after encoding
heart_data.info()

#### [4] Remover atributos desnecessários.

In [None]:
# Remove the 'id' column from the DataFrame before modelling
heart_data = heart_data.drop(columns=['id'])

In [None]:
# Preview the first rows of the final feature table
heart_data.head()

## 4. Modelação

#### [1] Dividir o conjunto de dados em conjuntos de treino e teste (80/20).

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target
X = heart_data.drop(columns=['num'])

# Target vector
y_binary = heart_data['num']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=42
)

# Print a structural summary of each resulting subset
for subset in (X_train, X_test, y_train, y_test):
    subset.info()

#### [2] Experimentar com Random Forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Build and fit the random forest with a fixed seed (fit() returns the estimator)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred_rf = rf_model.predict(X_test)

print('[INFO] - Random Forest predictions complete!!')

## 5. Avaliação

#### [1] Avaliar o desempenho do modelo utilizando métricas como exatidão (accuracy), precisão (precision), recall e F1-score

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Text report of the per-class metrics for the random forest predictions
rf_report = classification_report(y_test, y_pred_rf)

# The predictions come from the random forest model, so label the output
# accordingly (it was previously mislabelled as "DecisionTrees")
print("Random Forest Report:\n", rf_report)

#### [2] Gerar uma matriz de confusão para o modelo e interpretar os resultados

In [None]:
def plot_confusion_matrix(y_true, y_pred, title):
    """Draw the confusion matrix of a binary classifier as an annotated heatmap.

    Args:
        y_true: True class labels, coded 0 (no heart disease) / 1 (heart disease).
        y_pred: Predicted class labels, using the same coding.
        title: Title shown above the figure.
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded tick labels, even when one class is absent from the inputs
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(6, 4))
    # Rows are true labels and columns are predicted labels; cells show raw counts
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=['No Heart Disease', 'Heart Disease'],
                yticklabels=['No Heart Disease', 'Heart Disease'])
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(title)
    plt.show()

# Show the confusion matrix for the random forest predictions on the test set
plot_confusion_matrix(y_test, y_pred_rf, "Confusion Matrix: Random Forest")