In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df=pd.read_csv("heart_patient_data.csv")

In [3]:
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [4]:
df['bmi'].fillna(df['bmi'].mean(), inplace=True)
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [5]:
df = df.drop(["id"],axis="columns")
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


In [6]:
X = df.iloc[:,:-1] #feature
Y = df.iloc[:,10].values #classlabel
Y = df[['stroke']]
Y

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1
...,...
5105,0
5106,0
5107,0
5108,0


In [7]:
# converting object data to integer
labelencoder_X=LabelEncoder()
X = X.apply(LabelEncoder().fit_transform)
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,88,0,1,1,2,1,3850,240,1
1,0,82,0,0,1,3,0,3588,162,2
2,1,101,0,1,1,2,0,2483,199,2
3,0,70,0,0,1,2,1,3385,218,3
4,0,100,1,0,1,3,0,3394,113,2
...,...,...,...,...,...,...,...,...,...,...
5105,0,101,1,0,1,2,1,1360,162,2
5106,0,102,0,0,1,3,1,3030,274,2
5107,0,56,0,0,1,3,0,1314,180,2
5108,1,72,0,0,1,2,0,3363,129,1


In [8]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.25,random_state = 42)
len(x_train),len(x_test),len(y_train),len(y_test)

(3832, 1278, 3832, 1278)

In [9]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.optimizers import Adam

# Separate features and target variable
X = df.drop(columns=['stroke'])  # Features
y = df['stroke']  # Target variable

# Handle missing values in 'bmi'
X['bmi'].fillna(X['bmi'].mean(), inplace=True)

# Define categorical and numeric features
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numeric_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

# Encode categorical features only
labelencoder_X = LabelEncoder()
for feature in categorical_features:
    X[feature] = labelencoder_X.fit_transform(X[feature])

# Split data into numeric and categorical features
X_numeric = X[numeric_features].values
X_categorical = X[categorical_features].values

# Train-test split
X_numeric_train, X_numeric_test, X_categorical_train, X_categorical_test, y_train, y_test = train_test_split(
    X_numeric, X_categorical, y.values, test_size=0.2, random_state=42
)

# Input layers for numeric and categorical features
num_numeric_features = X_numeric_train.shape[1]
numeric_input = Input(shape=(num_numeric_features,))

cat_inputs = []
embedding_layers = []

for i, cat_feature in enumerate(categorical_features):
    # Determine the number of unique categories dynamically
    num_categories = df[cat_feature].nunique()
    embedding_dim = min(50, int(np.ceil(np.sqrt(num_categories))))  # Dynamic embedding dimension

    # Create input and embedding layers
    cat_input = Input(shape=(1,), name=f'{cat_feature}_input')
    cat_inputs.append(cat_input)

    embedding_layer = Embedding(input_dim=num_categories, output_dim=embedding_dim, name=f'{cat_feature}_embedding')(cat_input)
    flatten_layer = Flatten(name=f'{cat_feature}_flatten')(embedding_layer)
    embedding_layers.append(flatten_layer)

# Concatenate all embeddings
concatenated_embeddings = Concatenate()(embedding_layers)

# Combine numeric and categorical features
concatenated_input = Concatenate()([numeric_input, concatenated_embeddings])

# Neural network layers
hidden_layer1 = Dense(64, activation='relu')(concatenated_input)
hidden_layer2 = Dense(32, activation='relu')(hidden_layer1)
output_layer = Dense(1, activation='sigmoid')(hidden_layer2)

# Define and compile the model
model = Model(inputs=[numeric_input] + cat_inputs, outputs=output_layer)
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(
    [X_numeric_train] + [X_categorical_train[:, i] for i in range(X_categorical_train.shape[1])],
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(
    [X_numeric_test] + [X_categorical_test[:, i] for i in range(X_categorical_test.shape[1])],
    y_test
)
print('Test Accuracy: ', test_accuracy * 100)



model.save('stroke_prediction_model.h5')



Epoch 1/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.8135 - loss: 0.9619 - val_accuracy: 0.9548 - val_loss: 0.1797
Epoch 2/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9500 - loss: 0.1899 - val_accuracy: 0.9548 - val_loss: 0.1793
Epoch 3/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9532 - loss: 0.1706 - val_accuracy: 0.9438 - val_loss: 0.1845
Epoch 4/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9480 - loss: 0.1863 - val_accuracy: 0.9535 - val_loss: 0.1757
Epoch 5/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9452 - loss: 0.2048 - val_accuracy: 0.9535 - val_loss: 0.1757
Epoch 6/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9551 - loss: 0.1724 - val_accuracy: 0.9535 - val_loss: 0.1731
Epoch 7/10
[1m103/103[0m 



Test Accuracy:  93.93346309661865


In [10]:
from tensorflow.keras.models import load_model

# Load the saved model
loaded_model = load_model('stroke_prediction_model.h5')

# Print a summary of the model to confirm
loaded_model.summary()


