# Stroke Data 

## Importing our Dataset

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from stroke_pipeline import train_test_split_stroke,\
    fill_missing,\
    build_encoders,\
    encode_categorical,\
    build_target_encoder,\
    encode_target

df_original= pd.read_csv("healthcare-dataset-stroke-data.csv")
df_original

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


## Exploring Data

In [2]:
display(df_original.shape)
display(df_original.info())
display(df_original.describe())
display(df_original.columns)

(5110, 12)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


None

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [3]:
#Lets go ahead and loop through all of our columns and see what data they reveal

def describe_df(df_original: pd.DataFrame):
    print(f"The dataset contains {df_original.shape[1]} columns and {len(df_original)} rows")
    for col in df_original.columns:
        col_dtype = df_original[col].dtype
        print(f"\nColumn: {col} ({col_dtype})")
        if col_dtype == 'object':
            print(f"--- Percentage of NaNs: {df_original[col].isna().sum() / len(df_original[col]) * 100}")
            print(f"--- Unique values:\n {df_original[col].unique()}")
        else:
            print(f"--- Summary statistics:\n {df_original[col].describe()}")
describe_df(df_original)

The dataset contains 12 columns and 5110 rows

Column: id (int64)
--- Summary statistics:
 count     5110.000000
mean     36517.829354
std      21161.721625
min         67.000000
25%      17741.250000
50%      36932.000000
75%      54682.000000
max      72940.000000
Name: id, dtype: float64

Column: gender (object)
--- Percentage of NaNs: 0.0
--- Unique values:
 ['Male' 'Female' 'Other']

Column: age (float64)
--- Summary statistics:
 count    5110.000000
mean       43.226614
std        22.612647
min         0.080000
25%        25.000000
50%        45.000000
75%        61.000000
max        82.000000
Name: age, dtype: float64

Column: hypertension (int64)
--- Summary statistics:
 count    5110.000000
mean        0.097456
std         0.296607
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: hypertension, dtype: float64

Column: heart_disease (int64)
--- Summary statistics:
 count    5110.000000
mean        0.054012
std        

## Organizing & Sorting the Dataset

In [4]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# drop ID column
df = df_original.drop(columns='id')
# # convert data to dummies
# df = OneHotEncoder(df['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], handle_unknown='infrequent_if_exist')
# drop NA
#df = df.dropna
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
y = df['stroke'].values.reshape(-1,1)
df_2 = df.copy()
X = df_2.drop(columns='stroke', axis=1)

In [6]:
df_bmi = pd.DataFrame(X['bmi'])
# df_bmi.reset_index()
# df_bmi.columns = ['0', rename='bmi']


scaler = StandardScaler()
scale = scaler.fit(df_bmi)

df_bmi = scale.transform(df_bmi)

converted_bmi_df = pd.DataFrame(df_bmi)
converted_bmi_df=converted_bmi_df.rename(columns={'0':'bmi'}, axis=0)
print(converted_bmi_df)

TypeError: Cannot specify both 'axis' and any of 'index' or 'columns'

In [None]:
df_age = pd.DataFrame(X['age'])

scaler = StandardScaler()
scale = scaler.fit(df_age)

df_age = scale.transform(df_age)

converted_age_df = pd.DataFrame(df_age)
converted_age_df

In [None]:
df_level = pd.DataFrame(X['avg_glucose_level'])

scaler = StandardScaler()
scale = scaler.fit(df_level)

df_level = scale.transform(df_level)

converted_level_df = pd.DataFrame(df_level)
converted_level_df

In [None]:
age_level_df = pd.concat([converted_age_df, converted_level_df], axis=1)
age_level_df

In [None]:
X_converted_df = pd.concat([X, age_level_df], axis=1)
X_converted_df.head(3)

In [None]:
X_converted_df = X_converted_df.drop(columns='age')
X_converted_df = X_converted_df.drop(columns='avg_glucose_level')
X_converted_df = X_converted_df.drop(columns='bmi')
X_converted_df

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.describe()

In [None]:
# # Create an encoder for the age column
# age_ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# # Train the encoder
# age_ohe.fit(X_train['age'].values.reshape(-1,1))

# age_encoded = age_ohe.transform(X_train['age'].values.reshape(-1,1))
# #   avg_glucose_level_encoded = avg_glucose_level_enc.transform(X_data['avg_glucose_level'].values.reshape(-1,1))

#     # Reorganize the numpy arrays into a DataFrame
# age_df = pd.DataFrame(age_encoded, columns = age_ohe.get_feature_names_out())
# #    avg_glucose_level_df = pd.DataFrame(avg_glucose_level_encoded, columns= avg_glucose_level_ohe.get_feature_names_out())
# #     out_df = pd.concat([age_df, avg_glucose_level_df], axis = 1)
# #     out_df['grade'] = grade_encoded

#     # Return the DataFrame
# #     return out_df
# age_df

In [None]:
# def X_preprocess(X_data):
#     # Transform each column into numpy arrays
#     age_encoded = age_ohe.transform(X_data['age'].values.reshape(-1,1))
# #   avg_glucose_level_encoded = avg_glucose_level_enc.transform(X_data['avg_glucose_level'].values.reshape(-1,1))

#     # Reorganize the numpy arrays into a DataFrame
#     age_df = pd.DataFrame(age_encoded, columns = age_ohe.get_feature_names_out())
# #    avg_glucose_level_df = pd.DataFrame(avg_glucose_level_encoded, columns= avg_glucose_level_ohe.get_feature_names_out())
# #     out_df = pd.concat([age_df, avg_glucose_level_df], axis = 1)
# #     out_df['grade'] = grade_encoded

#     # Return the DataFrame
#     return out_df

In [None]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train[['age', 'avg_glucose_level']])
X_train_scaled

In [None]:
# Create the encoders for categorical variables (use X_train_filled)
encoders = build_encoders(X_train)
encoders

In [None]:
# Fill the missing values using the imported function
X_train_filled = fill_missing(X_train)
X_test_filled = fill_missing(X_test)
X_train_filled.head()

In [None]:
# Encode X_train_filled and X_test_filled
X_train_encoded = encode_categorical(X_train_filled, encoders)
X_test_encoded = encode_categorical(X_test_filled, encoders)

X_train_encoded.head()

In [None]:
# Encode y_train and y_test
y_encoder = build_target_encoder(y_train)
y_train_encoded = encode_target(y_train, y_encoder)
y_test_encoded = encode_target(y_test, y_encoder)

In [None]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components = 10)
pca_model.fit(X_train_encoded)

X_train_pca = pd.DataFrame(pca_model.transform(X_train_encoded))
X_test_pca = pd.DataFrame(pca_model.transform(X_test_encoded))
X_train_pca

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_pca, y_train_encoded)

In [None]:
y_test_pred = model.predict(X_test_pca)
print(balanced_accuracy_score(y_test_encoded, y_test_pred))

In [None]:
y_train_pred = model.predict(X_train_pca)
print(balanced_accuracy_score(y_train_encoded, y_train_pred))

In [None]:
models = {'train_score': [], 'test_score': [], 'max_depth': []}

for depth in range(1,10):
    models['max_depth'].append(depth)
    model = RandomForestClassifier(n_estimators=100, max_depth=depth)
    model.fit(X_train_pca, y_train_encoded)
    y_test_pred = model.predict(X_test_pca)
    y_train_pred = model.predict(X_train_pca)

    models['train_score'].append(balanced_accuracy_score(y_train_encoded, y_train_pred))
    models['test_score'].append(balanced_accuracy_score(y_test_encoded, y_test_pred))

models_df = pd.DataFrame(models)

In [None]:
models_df.plot(x='max_depth')

In [None]:
models = {'train_score': [], 'test_score': [], 'n_estimators': []}

for n in [50, 100, 500, 1000]:
    models['n_estimators'].append(n)
    model = RandomForestClassifier(n_estimators=n, max_depth=7)
    model.fit(X_train_pca, y_train_encoded)
    y_test_pred = model.predict(X_test_pca)
    y_train_pred = model.predict(X_train_pca)

    models['train_score'].append(balanced_accuracy_score(y_train_encoded, y_train_pred))
    models['test_score'].append(balanced_accuracy_score(y_test_encoded, y_test_pred))

models_df = pd.DataFrame(models)

In [None]:
models_df.plot(x='n_estimators')

In [None]:
from sklearn.neighbors import KNeighborsClassifier


from sklearn.model_selection import RandomizedSearchCV
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    'weights': ['uniform', 'distance'],
    'leaf_size': [10, 50, 100, 500]
}
random_knn = RandomizedSearchCV(KNeighborsClassifier(), param_grid, verbose=3)

random_knn.fit(X_train_pca, y_train_encoded)

In [None]:
y_pred = random_knn.predict(X_test_pca)
print(balanced_accuracy_score(y_test_encoded, y_pred))

In [None]:
model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=13)
model.fit(X_train_encoded, y_train_encoded)
y_pred = model.predict(X_test_encoded)
print(balanced_accuracy_score(y_test_encoded, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
model.fit(X_train_encoded, y_train_encoded)
y_pred = model.predict(X_test_encoded)
print(balanced_accuracy_score(y_test_encoded, y_pred))