<a href="https://colab.research.google.com/github/ivan0054/TP-FINAL--ClaudioPavon/blob/Augusto-4/TP_FINAL_Pavon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Read the stroke CSV into `df` and show it as the cell output.
csv_path = "/content/healthcare-dataset-stroke-data_TF.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df

In [None]:
# Box plot of `age` grouped by the `stroke` column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='stroke', y='age', ax=ax)
ax.set_title('Box plot: Age (Edad) vs Stroke (Ataque)')
plt.show()

In [None]:
# Violin plot of `bmi` grouped by the `stroke` column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(data=df, x='stroke', y='bmi', ax=ax)
ax.set_title('Violin: BMI (Índice de masa corporal) vs Stroke (Ataque)')
plt.show()

In [None]:
# `df.info()` prints its summary and returns None, so it is called as a
# statement of its own. Pairing it with `df.describe()` in a tuple made the
# cell output a plain-text `(None, <table>)` repr; leaving `describe()` as the
# last expression lets the notebook render the statistics as a rich table.
df.info()
df.describe()

In [None]:
# Remove the `id` column from the working frame.
# The previous in-place drop raised KeyError when this cell was executed a
# second time (the column was already gone). Rebinding `df` together with
# errors='ignore' makes the cell safe to re-run.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Count the null entries in every column and show the result.
null_mask = df.isnull()
missing_values = null_mask.sum()
missing_values

In [None]:
# Fill the missing values with one statistic per column:
#   age          -> median
#   hypertension -> most frequent value (first mode)
#   bmi          -> mean
#
# The fill is applied on the DataFrame and assigned back to `df`. The earlier
# form, `df[col].fillna(..., inplace=True)`, operates on a temporary Series:
# pandas 2.x warns about it and, with Copy-on-Write enabled, `df` is left
# unchanged, so the NaNs would silently survive.
#
# NOTE(review): these statistics are computed on the full frame, i.e. before
# the train/test split done further down, so test rows influence the imputed
# values. Consider imputing after the split.
fill_values = {
    'age': df['age'].median(),
    'hypertension': df['hypertension'].mode()[0],
    'bmi': df['bmi'].mean(),
}
df = df.fillna(value=fill_values)

print("\nValores nulos después de la verificación:")
print(df.isnull().sum())

In [None]:
# One-hot encode the listed columns; drop_first=True omits the first level of
# each so every column contributes (levels - 1) indicator columns.
categorical_cols = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

In [None]:
# Annotated heatmap of the pairwise correlations between all encoded columns.
correlations = df_encoded.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Mapa de calor de correlación')
plt.show()

In [None]:
# Target vector is the `stroke` column; the feature matrix is everything else.
y = df_encoded["stroke"]
X = df_encoded.drop(columns="stroke")

In [None]:
# Split first, then scale.
# The scaler used to be fit on the complete feature matrix before splitting,
# so the mean/variance it learned included the test rows (data leakage).
# Here the stratified 70/30 split is done on the unscaled features, the scaler
# is fit on the training rows only, and the same transform is then applied to
# the test rows. The split arguments are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any cell still referring to `X_scaled` keeps working. It uses
# the train-fitted scaler and must not be used to fit a model.
X_scaled = scaler.transform(X)

In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split; the fitted estimator is shown as the cell output.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
rf

In [None]:
# Class predictions of the fitted random forest on the test split.
y_pred = rf.predict(X=X_test)

In [None]:
# Print the per-class metrics and the overall accuracy of the random forest.
rf_report = classification_report(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Reporte de clasificación:", rf_report, sep="\n")
print("Exactitud:", rf_accuracy)

In [None]:
# Confusion matrix of the random forest, drawn as an annotated heatmap
# (x axis labelled as prediction, y axis as true label).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Matriz de Confusión - Random Forest")
ax.set_xlabel("Predicción")
ax.set_ylabel("Real")
plt.show()

In [None]:
# Fit a logistic regression (up to 1000 solver iterations) on the training
# split; the fitted estimator is shown as the cell output.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
logreg


In [None]:
# Class predictions of the fitted logistic regression on the test split.
y_pred_logreg = logreg.predict(X=X_test)

In [None]:
# Print the per-class metrics and the overall accuracy of the logistic regression.
logreg_report = classification_report(y_test, y_pred_logreg)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
print("Regresión Logística", logreg_report, sep="\n")
print("Exactitud:", logreg_accuracy)

In [None]:
# Confusion matrix of the logistic regression, drawn as an annotated heatmap
# (x axis labelled as prediction, y axis as true label).
cm_logreg = confusion_matrix(y_test, y_pred_logreg)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_logreg, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Matriz de Confusión - Regresión Logística")
ax.set_xlabel("Predicción")
ax.set_ylabel("Real")
plt.show()

In [None]:
# Show both confusion matrices next to each other on one figure
# (1 row, 2 columns): random forest on the left, logistic regression on the right.
cm_rf = confusion_matrix(y_test, y_pred)
cm_logreg = confusion_matrix(y_test, y_pred_logreg)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
panels = (
    (axes[0], cm_rf, "Matriz de Confusión - Random Forest"),
    (axes[1], cm_logreg, "Matriz de Confusión - Regresión Logística"),
)
for ax, matrix, title in panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Predicción")
    ax.set_ylabel("Real")

plt.show()

In [None]:
# Store the random forest test accuracy and print it with four decimals.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Random Forest - Exactitud: {accuracy_rf:.4f}")

In [None]:
# Store the logistic regression test accuracy and print it with four decimals.
accuracy_log = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)
print(f"Regresión Logística - Exactitud: {accuracy_log:.4f}")

In [None]:
# Print the two test accuracies one under the other for a direct comparison.
print(
    "\n COMPARACIÓN DE MODELOS:",
    f"Random Forest - Exactitud: {accuracy_rf:.4f}",
    f"Regresión Logística - Exactitud: {accuracy_log:.4f}",
    sep="\n",
)