# ACTL3143/5111 Week 3 StoryWall Notebook

## Load Packages

In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.metrics import AUC, Accuracy

## Import Data

In [22]:
# Read the stroke dataset from the CSV file next to the notebook and display it
freq = pd.read_csv(filepath_or_buffer="stroke.csv")
freq

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


## Pre-process Data

Splitting target and features

In [23]:
# Drop the "id" column from the frame. errors="ignore" keeps this cell
# re-runnable: a second execution (when "id" is already gone) is a no-op
# instead of a KeyError.
freq = freq.drop(columns="id", errors="ignore")

In [24]:
# Target: the "stroke" column
target = freq["stroke"]

# Features: every remaining column of the frame
features = freq.drop(columns=["stroke"])


Exploratory analysis

In [25]:
# Count the feature columns and the distinct values taken by the target
NUM_FEATURES = features.shape[1]
NUM_CATS = np.unique(target).size
print("Number of features:", NUM_FEATURES)
print("Number of categories:", NUM_CATS)

Number of features: 10
Number of categories: 2


In [26]:
# Column names, non-null counts and dtypes of the frame
freq.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [27]:
# Summary statistics for every column of the frame (target included);
# include="all" also reports on the non-numeric columns
freq.describe(include="all")


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,5110,5110.0,5110.0,5110.0,5110,5110,5110,5110.0,4909.0,5110,5110.0
unique,3,,,,2,5,2,,,4,
top,Female,,,,Yes,Private,Urban,,,never smoked,
freq,2994,,,,3353,2925,2596,,,1892,
mean,,43.226614,0.097456,0.054012,,,,106.147677,28.893237,,0.048728
std,,22.612647,0.296607,0.226063,,,,45.28356,7.854067,,0.21532
min,,0.08,0.0,0.0,,,,55.12,10.3,,0.0
25%,,25.0,0.0,0.0,,,,77.245,23.5,,0.0
50%,,45.0,0.0,0.0,,,,91.885,28.1,,0.0
75%,,61.0,0.0,0.0,,,,114.09,33.1,,0.0


In [28]:
# Number of missing values in each column
freq.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [29]:
# Class balance of the target: bincount gives the number of 0s and of 1s
class_counts = np.bincount(freq['stroke'])
neg, pos = class_counts
total = neg + pos
summary_template = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
print(summary_template.format(total, pos, 100 * pos / total))

Examples:
    Total: 5110
    Positive: 249 (4.87% of total)



### Convert strings to boolean

In [30]:
# Encode ever_married as 1 ("Yes") / 0 ("No").
# The explicit astype pins the column to an integer dtype: the later
# dtype-based split into categorical and numerical columns must not depend on
# pandas silently downcasting the object column after replace().
# Re-running the cell is a no-op because no "Yes"/"No" values remain.
features['ever_married'] = (
    features['ever_married'].replace({'Yes': 1, 'No': 0}).astype('int64')
)

### Split data into train, validation and test sets

In [31]:
# Two-stage split: hold out 20% of the rows as the test set, then take 25% of
# the remainder as the validation set (0.8 * 0.25 = 20% of all rows).
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=17
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=17
)

### Apply transformation

In [32]:
# Object-dtype columns hold the string categories that get one-hot encoded;
# every other column is treated as numeric.
categorical_features_mask = (features.dtypes == object)
numerical_features_mask = ~categorical_features_mask

# Names of the categorical columns, as a plain list
categorical_cols = list(features.columns[categorical_features_mask])

# One-hot encoding step: drop="first" and handle_unknown="ignore" are the
# encoder settings used for every categorical column
ohe_step = ("ohe", OneHotEncoder(handle_unknown="ignore", drop="first"))
categorical_transformer = Pipeline(steps=[ohe_step])

categorical_cols

['gender', 'work_type', 'Residence_type', 'smoking_status']

In [33]:
# Boolean mask of the non-object (numeric) feature columns
numerical_features_mask = (features.dtypes != object)

# Names of the numeric columns, minus the three 0/1 indicator columns, which
# are kept out of imputation and scaling
numerical_cols = list(features.columns[numerical_features_mask])
for indicator_col in ("ever_married", "hypertension", "heart_disease"):
    numerical_cols.remove(indicator_col)

numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),  # fill missing values with the mean
    ("scaler", StandardScaler()),                 # standardize features
])

numerical_cols

['age', 'avg_glucose_level', 'bmi']

In [34]:
# Route each column group through its own pipeline; columns named in neither
# group are handled by remainder="passthrough".
transformer = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="passthrough",
)

In [35]:
# Fit the preprocessing on the training split only and reuse the fitted
# transformer for validation and test. Re-fitting on each split would impute
# and scale that split with its own statistics and could produce a different
# set of one-hot columns per split.
X_train_transformer = transformer.fit_transform(X_train)
X_val_transformer = transformer.transform(X_val)
X_test_transformer = transformer.transform(X_test)

# One shared set of output column names, taken from the fitted transformer
transformed_cols = transformer.get_feature_names_out()

# Removed from every split (not just train) so the model sees identical
# columns at fit, validation and test time. errors="ignore" covers a training
# split that produced no such column.
DROPPED_COLS = ["cat__gender_Other"]

X_train_transform = pd.DataFrame(
    X_train_transformer, columns=transformed_cols
).drop(columns=DROPPED_COLS, errors="ignore")

X_val_transform = pd.DataFrame(
    X_val_transformer, columns=transformed_cols
).drop(columns=DROPPED_COLS, errors="ignore")

X_test_transform = pd.DataFrame(
    X_test_transformer, columns=transformed_cols
).drop(columns=DROPPED_COLS, errors="ignore")

## Modelling

In [36]:
from sklearn.utils import class_weight

# Per-class weights from scikit-learn's "balanced" setting, computed on the
# training labels only
balanced_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train,
)

# Keyed by position in the sorted class array, for use as the class_weight
# argument of model.fit
class_weights = {position: weight for position, weight in enumerate(balanced_weights)}
class_weights

{0: 0.5266231535554792, 1: 9.890322580645162}

In [37]:
# Display the transformed training features that are fed to the model
X_train_transform

Unnamed: 0,num__age,num__avg_glucose_level,num__bmi,cat__gender_Male,cat__work_type_Never_worked,cat__work_type_Private,cat__work_type_Self-employed,cat__work_type_children,cat__Residence_type_Urban,cat__smoking_status_formerly smoked,cat__smoking_status_never smoked,cat__smoking_status_smokes,remainder__hypertension,remainder__heart_disease,remainder__ever_married
0,0.040119,-0.015149,0.445493,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-1.856075,1.120923,-1.266298,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.139489,3.248563,-0.942095,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.303968,-0.059881,0.834537,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
4,-1.323100,1.443259,-1.603469,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3061,1.535262,-0.623203,-0.656797,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3062,1.271413,0.793324,0.510334,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
3063,-1.498999,0.345780,-1.460820,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3064,1.227439,2.012062,-0.215881,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [40]:
tf.random.set_seed(2022)

model = Sequential([
    Dense(30, activation="relu"),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = [Accuracy(), AUC(curve="ROC"), AUC(curve="PR")])

es = EarlyStopping(restore_best_weights=True, patience=50, monitor="val_accuracy")

%time hist = model.fit(X_train_transform, y_train, class_weight=class_weights, epochs=500, \
        callbacks=[es], validation_data=(X_val_transform, y_val), verbose=False);

print(f"Stopped after {len(hist.history['loss'])} epochs.")

CPU times: total: 7.95 s
Wall time: 5.59 s
Stopped after 51 epochs.


## Model Evaluation

In [None]:
# Loss and compiled metrics on the training split
model.evaluate(x=X_train_transform, y=y_train)



[0.6518418192863464, 0.05055446922779083, 0.5, 0.05055446922779083]

In [None]:
# Loss and compiled metrics on the validation split
model.evaluate(x=X_val_transform, y=y_val)



[0.6572802066802979, 0.04305283725261688, 0.5, 0.04305283725261688]

In [None]:
# Loss and compiled metrics on the held-out test split
model.evaluate(x=X_test_transform, y=y_test)



[0.6523124575614929, 0.04892367869615555, 0.5, 0.04892367869615555]

In [None]:
# The model outputs probabilities, but a confusion matrix needs class labels.
# Passing the raw sigmoid outputs makes every prediction count as class 0, so
# threshold at 0.5 first and flatten the (n, 1) prediction array to (n,).
test_probabilities = model.predict(X_test_transform)
test_pred_labels = (test_probabilities > 0.5).astype("int32").ravel()

# Rows are the true classes, columns the predicted classes
matrix = tf.math.confusion_matrix(labels=y_test, predictions=test_pred_labels, num_classes=2)
matrix



<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[972,   0],
       [ 50,   0]])>