In [1]:
!pip install matplotlib numpy pandas scikit_learn



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the raw stroke dataset (expected next to the notebook) and display its
# (rows, columns) dimensions as the cell output.
stroke_csv_path = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(stroke_csv_path)
df.shape

(5110, 12)

Now we want to separate out the features (inputs / explanatory variables) from the labels (the output, i.e. stroke or not stroke).

In [4]:
# Every column except the last one is treated as a feature; the last column
# is kept as a one-column frame holding the label.
features_pd = df.iloc[:, :-1]
labels_pd = df.iloc[:, -1:]

# Flatten the one-column label frame into a 1-D array.
labels = np.ravel(labels_pd.values)

Now we want to split the data into training and testing sets. Here we use `stratify` to make sure the proportion of stroke / not stroke cases is the same in our training and test data sets.

In [5]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed for reproducibility; stratifying on the labels
# keeps the label distribution the same in both partitions.
split_options = dict(
    train_size=0.8,
    test_size=0.2,
    random_state=1,
    stratify=labels,
)
X_train, X_test, y_train, y_test = train_test_split(features_pd, labels, **split_options)

# Report the dimensions of each partition.
print("X_train, y_train:", X_train.shape, y_train.shape)
print("X_test, y_test:", X_test.shape, y_test.shape)

# Sum of the labels divided by the number of rows in each partition.
train_stroke_rate = np.sum(y_train) / len(y_train)
test_stroke_rate = np.sum(y_test) / len(y_test)
print("Proportion stroke in train: ", train_stroke_rate)
print("Proportion stroke in test: ", test_stroke_rate)

X_train, y_train: (4088, 11) (4088,)
X_test, y_test: (1022, 11) (1022,)
Proportion stroke in train:  0.04867906066536203
Proportion stroke in test:  0.04892367906066536


Let's put the features and labels back together so we can export them as CSV files, allowing the whole group to use the same data split in R/Python. We also do a quick check that the X and y have joined up correctly.

In [6]:
# Re-attach the labels to their feature rows. The label arrays are in the same
# row order as the feature frames, so a positional column assignment on a copy
# lines each label up with its row.
training_dataset = X_train.copy()
training_dataset['stroke'] = y_train

testing_dataset = X_test.copy()
testing_dataset['stroke'] = y_test


In [7]:
# Order both splits by their original row index so the exported files follow
# the row order of the source data.
testing_dataset = testing_dataset.sort_index()
training_dataset = training_dataset.sort_index()

# Check the join explicitly rather than by eye: every label must equal the
# source frame's label at the same row index, and the two splits together
# must cover the source rows exactly once.
assert np.array_equal(
    training_dataset['stroke'].to_numpy(),
    df.loc[training_dataset.index, 'stroke'].to_numpy(),
), "training labels do not match the source rows"
assert np.array_equal(
    testing_dataset['stroke'].to_numpy(),
    df.loc[testing_dataset.index, 'stroke'].to_numpy(),
), "testing labels do not match the source rows"
assert training_dataset.index.intersection(testing_dataset.index).empty, \
    "training and testing splits share rows"
assert len(training_dataset) + len(testing_dataset) == len(df), \
    "splits do not cover every source row"

print("Combined Training Data Shape:", training_dataset.shape)
print("Combined Testing Data Shape:", testing_dataset.shape)

# Preview the first rows of the recombined training data.
training_dataset.head(20)


Combined Training Data Shape: (4088, 12)
Combined Testing Data Shape: (1022, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
10,12109,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1


In [None]:
# Write each split to its own CSV file; index=False leaves the row index out
# of the exported files.
for split_frame, csv_name in (
    (training_dataset, 'stroke_training_dataset.csv'),
    (testing_dataset, 'stroke_testing_dataset.csv'),
):
    split_frame.to_csv(csv_name, index=False)

"\ntraining_dataset.to_csv('stroke_training_dataset.csv', index=False)\ntesting_dataset.to_csv('stroke_testing_dataset.csv', index=False)\n"