# Deep Neural Network

In [40]:
# Imports
import tensorflow as tf
from tensorflow.keras import layers, models, preprocessing, Input
import numpy as np
import pandas as pd
import math
import statistics
import matplotlib.pyplot as plt
import seaborn as sn
from IPython.display import display
from sklearn.model_selection import train_test_split


In [36]:
(train_labels.sum() + test_labels.sum()) / (train_labels.shape[0] + train_labels.shape[0])

0.03041327124563446

In [28]:
test_labels.sum() / test_labels.shape[0]


0.04276985743380855

In [41]:
# Load data into pandas dataframe
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Create dummies objects
gender = pd.get_dummies(data['gender'])
ever_married = pd.get_dummies(data['ever_married'])
work_type = pd.get_dummies(data['work_type'])
residence_type = pd.get_dummies(data['Residence_type'])
smoking_status = pd.get_dummies(data['smoking_status'])

# Drop old and not usefull columns
data = data.drop(['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'id'], axis=1)

# Create new dataframe
data = pd.concat([data, gender, ever_married, work_type, residence_type, smoking_status], axis=1)

# Rename column names
data = data.rename(columns={'Yes':'ever_married', 'No':'never_married', 'Unknown':'unknown_smoking_status', 'Other':'other_gender'})

# Clean column names
data.columns = data.columns.str.lower().str.replace(' ','_')

# Remove rows with N\A values
data.dropna(axis=0, inplace=True)


# Split the data into target "y" and input "X"
y = data['stroke']
X = data.drop('stroke', axis=1)

#Split the data into 70% training and 30% testing
train_data, test_data, train_labels, test_labels = train_test_split(X, y, train_size=0.7, random_state=1265599650)

# print shapes
print(f'shapes:\nTrain data: {train_data.shape}\nTest data: {test_data.shape}\nTrain labels: {train_labels.shape}\nTest labels: {test_labels.shape}')

print(train_data.info())


shapes:
Train data: (3436, 21)
Test data: (1473, 21)
Train labels: (3436,)
Test labels: (1473,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3436 entries, 2535 to 1116
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     3436 non-null   float64
 1   hypertension            3436 non-null   int64  
 2   heart_disease           3436 non-null   int64  
 3   avg_glucose_level       3436 non-null   float64
 4   bmi                     3436 non-null   float64
 5   female                  3436 non-null   uint8  
 6   male                    3436 non-null   uint8  
 7   other_gender            3436 non-null   uint8  
 8   never_married           3436 non-null   uint8  
 9   ever_married            3436 non-null   uint8  
 10  govt_job                3436 non-null   uint8  
 11  never_worked            3436 non-null   uint8  
 12  private                 3436 non-null   uint8  

In [60]:
model = models.Sequential()

model.add(Input(shape=(21,)))

model.add(layers.Dense(21, activation='relu'))
model.add(layers.Dense(2, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(train_data, train_labels)
predictions = model.predict(test_data)
predictions = np.argmax(predictions, axis=1)

Train on 3436 samples
[0 0 0 ... 0 0 0]


In [61]:

from sklearn.metrics import confusion_matrix
confusion_matrix(test_labels, predictions)

array([[1400,   10],
       [  62,    1]], dtype=int64)