# ACTL3143/5111 Week 3 StoryWall Notebook

## Load Packages

In [260]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense



## Import Data

In [261]:
freq = pd.read_csv("stroke.csv")
freq

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


## Pre-process Data

Splitting target and features

In [262]:
#drop id column
freq = freq.drop("id", axis=1)

In [263]:
#set target to stroke
target = freq['stroke']

#set features to all columns except stroke
features = freq.drop("stroke", axis=1)


Exploratory analysis

In [264]:
#information about data
freq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [265]:
#list of values in features
freq.describe(include="all")


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,5110,5110.0,5110.0,5110.0,5110,5110,5110,5110.0,4909.0,5110,5110.0
unique,3,,,,2,5,2,,,4,
top,Female,,,,Yes,Private,Urban,,,never smoked,
freq,2994,,,,3353,2925,2596,,,1892,
mean,,43.226614,0.097456,0.054012,,,,106.147677,28.893237,,0.048728
std,,22.612647,0.296607,0.226063,,,,45.28356,7.854067,,0.21532
min,,0.08,0.0,0.0,,,,55.12,10.3,,0.0
25%,,25.0,0.0,0.0,,,,77.245,23.5,,0.0
50%,,45.0,0.0,0.0,,,,91.885,28.1,,0.0
75%,,61.0,0.0,0.0,,,,114.09,33.1,,0.0


In [266]:
#check for null values
freq.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

### One Hot Encoding

In [267]:
#yes/no to 1/0 for ever_married column
features['ever_married'] = features['ever_married'].replace(['Yes', 'No'], [1, 0])
features

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,1,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,0,0,1,Self-employed,Rural,202.21,,never smoked
2,Male,80.0,0,1,1,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,1,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,1,Self-employed,Rural,174.12,24.0,never smoked
...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,1,Private,Urban,83.75,,never smoked
5106,Female,81.0,0,0,1,Self-employed,Urban,125.20,40.0,never smoked
5107,Female,35.0,0,0,1,Self-employed,Rural,82.99,30.6,never smoked
5108,Male,51.0,0,0,1,Private,Rural,166.29,25.6,formerly smoked


In [268]:
# Categorical boolean mask to be hot-encoded
categorical_features_mask = (features.dtypes==object)
numerical_features_mask = (features.dtypes!=object)

#filter categorical columns using mask and turn into a list
categorical_cols = features.columns[categorical_features_mask].tolist()

categorical_transformer = Pipeline(steps = [
    ("ohe",OneHotEncoder(handle_unknown="ignore", drop="first"))
])

categorical_cols

['gender', 'work_type', 'Residence_type', 'smoking_status']

In [271]:
# Numerical boolean mask to be hot-encoded
numerical_features_mask = (features.dtypes!=object)

#filter categorical columns using mask and turn into a list
numerical_cols = features.columns[numerical_features_mask].tolist()
numerical_cols.remove("ever_married")
numerical_cols.remove("hypertension")
numerical_cols.remove("heart_disease")


numerical_transformer = Pipeline(steps =[
    ("imputer", SimpleImputer(strategy="mean")), 
    ("scaler", StandardScaler())
])

numerical_cols

['age', 'avg_glucose_level', 'bmi']

### Split data into train and test set

In [273]:
#split data into train and test
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [275]:
X_train

Unnamed: 0,onehotencoder__x0_Male,onehotencoder__x0_Other,onehotencoder__x1_Never_worked,onehotencoder__x1_Private,onehotencoder__x1_Self-employed,onehotencoder__x1_children,onehotencoder__x2_Urban,onehotencoder__x3_formerly smoked,onehotencoder__x3_never smoked,onehotencoder__x3_smokes,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi
802,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,79.00,0.0,0.0,1.0,112.64,28.5
3927,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,62.00,0.0,0.0,1.0,88.32,36.3
2337,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,21.00,0.0,0.0,0.0,59.52,33.7
3910,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,31.00,0.0,0.0,1.0,65.70,30.4
1886,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,31.00,0.0,0.0,0.0,59.63,19.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,43.00,0.0,0.0,1.0,88.00,30.6
466,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,61.00,1.0,0.0,1.0,170.05,60.2
3092,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.16,0.0,0.0,0.0,97.28,17.8
3772,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,80.00,0.0,0.0,1.0,196.08,31.0


In [274]:
transformer = ColumnTransformer(transformers = [
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
    ], remainder = "passthrough"
    )

x_train_transform = transformer.fit_transform(X_train)
x_train_transform = pd.DataFrame(x_train_transform, columns=transformer.get_feature_names_out())
x_train_transform

ValueError: A given column is not a column of the dataframe