In [4]:
import pandas as pd

# Load the dataset
file_path = 'healthcare-dataset-stroke-data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Handle missing values in 'bmi' by imputing with the mean
imputer = SimpleImputer(strategy='mean')
data['bmi'] = imputer.fit_transform(data[['bmi']])

# Encode categorical variables
label_encoders = {}
for column in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Split the data into features and target
X = data.drop(columns=['id', 'stroke'])
y = data['stroke']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the preprocessed data
X_train.head(), y_train.head()

(      gender   age  hypertension  heart_disease  ever_married  work_type  \
 845        0  48.0             0              0             1          2   
 3744       1  15.0             0              0             0          2   
 4183       0  67.0             0              0             1          3   
 3409       1  44.0             0              0             1          2   
 284        1  14.0             0              0             0          0   
 
       Residence_type  avg_glucose_level   bmi  smoking_status  
 845                1              69.21  33.1               2  
 3744               0             122.25  21.0               2  
 4183               0             110.42  24.9               2  
 3409               1              65.41  24.8               3  
 284                1              82.34  31.6               0  ,
 845     0
 3744    0
 4183    0
 3409    0
 284     0
 Name: stroke, dtype: int64)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Create a logistic regression model with class weights
log_reg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the test set

y_pred = log_reg.predict(X_test)

# Evaluate the model
classification_report_output = classification_report(y_test, y_pred)
confusion_matrix_output = confusion_matrix(y_test, y_pred)

print(classification_report_output)

              precision    recall  f1-score   support

           0       0.99      0.75      0.85       972
           1       0.14      0.80      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.77      0.55      1022
weighted avg       0.95      0.75      0.82      1022



In [13]:
print(confusion_matrix_output)

[[728 244]
 [ 10  40]]


In [15]:
from sklearn.svm import SVC

# Create an SVM model with class weights
svm_model = SVC(class_weight='balanced', random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
classification_report_svm = classification_report(y_test, y_pred_svm)
confusion_matrix_svm = confusion_matrix(y_test, y_pred_svm)

print(classification_report_svm)

              precision    recall  f1-score   support

           0       0.98      0.70      0.82       972
           1       0.12      0.78      0.21        50

    accuracy                           0.71      1022
   macro avg       0.55      0.74      0.51      1022
weighted avg       0.94      0.71      0.79      1022



In [16]:
print(confusion_matrix_svm)

[[682 290]
 [ 11  39]]


In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

# Build the neural network model
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, class_weight=class_weight_dict, validation_split=0.2)

# Evaluate the model
y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")

# Generate classification report and confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
classification_report_nn = classification_report(y_test, y_pred_nn)
confusion_matrix_nn = confusion_matrix(y_test, y_pred_nn)

print(classification_report_nn)
print(confusion_matrix_nn)

NameError: name 'y_train' is not defined