In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from google.colab import files
uploaded = files.upload()

Saving healthcare-dataset-stroke-data.csv to healthcare-dataset-stroke-data.csv


In [3]:
dataset = pd.read_csv("healthcare-dataset-stroke-data.csv")

In [4]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [6]:
print(X)

[[9046 'Male' 67.0 ... 228.69 36.6 'formerly smoked']
 [51676 'Female' 61.0 ... 202.21 nan 'never smoked']
 [31112 'Male' 80.0 ... 105.92 32.5 'never smoked']
 ...
 [19723 'Female' 35.0 ... 82.99 30.6 'never smoked']
 [37544 'Male' 51.0 ... 166.29 25.6 'formerly smoked']
 [44679 'Female' 44.0 ... 85.28 26.2 'Unknown']]


In [7]:
print(y)

[1 1 1 ... 0 0 0]


## **Taking Care of Missing Data**

In [8]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 3:5])
X[:, 3:5] = imputer.transform(X[:, 3:5])

In [9]:
print(X)

[[9046 'Male' 67.0 ... 228.69 36.6 'formerly smoked']
 [51676 'Female' 61.0 ... 202.21 nan 'never smoked']
 [31112 'Male' 80.0 ... 105.92 32.5 'never smoked']
 ...
 [19723 'Female' 35.0 ... 82.99 30.6 'never smoked']
 [37544 'Male' 51.0 ... 166.29 25.6 'formerly smoked']
 [44679 'Female' 44.0 ... 85.28 26.2 'Unknown']]


# Encoding Categorical data

# Encoding the Independent Variable

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [11]:
print(X)

[[0.0 1.0 0.0 ... 228.69 36.6 'formerly smoked']
 [1.0 0.0 0.0 ... 202.21 nan 'never smoked']
 [0.0 1.0 0.0 ... 105.92 32.5 'never smoked']
 ...
 [1.0 0.0 0.0 ... 82.99 30.6 'never smoked']
 [0.0 1.0 0.0 ... 166.29 25.6 'formerly smoked']
 [1.0 0.0 0.0 ... 85.28 26.2 'Unknown']]


# Encoding the Independent Variable

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [13]:
print(y)

[1 1 1 ... 0 0 0]


# Splitting the dataset into the Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [15]:
print(X_train)

[[0.0 1.0 0.0 ... 73.57 28.0 'smokes']
 [0.0 1.0 0.0 ... 231.15 22.3 'never smoked']
 [1.0 0.0 0.0 ... 174.37 23.0 'never smoked']
 ...
 [1.0 0.0 0.0 ... 76.26 35.6 'never smoked']
 [1.0 0.0 0.0 ... 218.1 55.0 'smokes']
 [1.0 0.0 0.0 ... 211.06 39.3 'Unknown']]


In [16]:
print(X_test)

[[1.0 0.0 0.0 ... 112.98 37.2 'formerly smoked']
 [1.0 0.0 0.0 ... 78.29 30.1 'formerly smoked']
 [0.0 1.0 0.0 ... 73.27 25.4 'smokes']
 ...
 [0.0 1.0 0.0 ... 97.84 23.3 'Unknown']
 [1.0 0.0 0.0 ... 94.63 24.9 'never smoked']
 [1.0 0.0 0.0 ... 56.94 45.3 'Unknown']]


In [17]:
print(y_train)

[0 0 0 ... 0 0 1]


In [18]:
print(y_test)

[0 0 0 ... 0 0 0]


# Feature Scaling

In [19]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:5] = sc.fit_transform(X_train[:, 3:5])
X_test[:, 3:5] = sc.transform(X_test[:, 3:5])

In [20]:
print(X_train)

[[0.0 1.0 0.0 ... 73.57 28.0 'smokes']
 [0.0 1.0 0.0 ... 231.15 22.3 'never smoked']
 [1.0 0.0 0.0 ... 174.37 23.0 'never smoked']
 ...
 [1.0 0.0 0.0 ... 76.26 35.6 'never smoked']
 [1.0 0.0 0.0 ... 218.1 55.0 'smokes']
 [1.0 0.0 0.0 ... 211.06 39.3 'Unknown']]


In [21]:
print(X_test)

[[1.0 0.0 0.0 ... 112.98 37.2 'formerly smoked']
 [1.0 0.0 0.0 ... 78.29 30.1 'formerly smoked']
 [0.0 1.0 0.0 ... 73.27 25.4 'smokes']
 ...
 [0.0 1.0 0.0 ... 97.84 23.3 'Unknown']
 [1.0 0.0 0.0 ... 94.63 24.9 'never smoked']
 [1.0 0.0 0.0 ... 56.94 45.3 'Unknown']]


In [22]:
dataset.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [23]:
dataset.to_csv(r'new_dataset.csv', index=False, header=True)