In [1]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit

In [2]:
data = pd.read_csv('stroke.csv')

In [3]:
data.shape

(5110, 12)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [6]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
data['bmi'] = data['bmi'].fillna(data['bmi'].median())

In [8]:
df = data[['gender','age','hypertension','heart_disease','avg_glucose_level','bmi','smoking_status', 'stroke']]

In [9]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,202.21,28.1,never smoked,1
2,Male,80.0,0,1,105.92,32.5,never smoked,1
3,Female,49.0,0,0,171.23,34.4,smokes,1
4,Female,79.0,1,0,174.12,24.0,never smoked,1


In [10]:
smoking_status_le = LabelEncoder()
df['smoking_status_le'] = smoking_status_le.fit_transform(df['smoking_status'])

In [11]:
gender_le = LabelEncoder()
df['gender_le'] = gender_le.fit_transform(df['gender'])

In [12]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke,smoking_status_le,gender_le
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1,1,1
1,Female,61.0,0,0,202.21,28.1,never smoked,1,2,0
2,Male,80.0,0,1,105.92,32.5,never smoked,1,2,1
3,Female,49.0,0,0,171.23,34.4,smokes,1,3,0
4,Female,79.0,1,0,174.12,24.0,never smoked,1,2,0


In [13]:
x = df[['age','hypertension','heart_disease','avg_glucose_level','bmi','smoking_status_le', 'gender_le']]

In [14]:
y = df['stroke']

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [16]:
forest = RandomForestClassifier()
forest.fit(x_train, y_train)

RandomForestClassifier()

In [17]:
myTree = DecisionTreeClassifier()
myTree.fit(x_train, y_train)

DecisionTreeClassifier()

In [18]:
nb = GaussianNB()
nb.fit(x_train, y_train)

GaussianNB()

In [19]:
def find_best_score_using_gridsearchcv(x, y):
    algorithms = {
        'DecisionTreeClassifier': {
            'model': DecisionTreeClassifier(),
            'params': {
                'splitter': ['best', 'random'],
            }
        },
        'RandomForestClassifier': {
            'model': RandomForestClassifier(),
            'params': {
                'n_estimators': [20,30,50],
            }
        },
        'GaussianNB': {
            'model': GaussianNB(),
            'params': {
                'var_smoothing': [1e-09]
            }
        }
    }
    
    scores = []
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    
    for algo_name, config in algorithms.items():
        grid = GridSearchCV(config['model'], config['params'], cv=cv)
        grid.fit(x, y)
        scores.append({
            'Algorithm': algo_name,
            'Best score': grid.best_score_,
            'Best param': grid.best_params_
        })
    return pd.DataFrame(scores)

find_best_score_using_gridsearchcv(x, y)

Unnamed: 0,Algorithm,Best score,Best param
0,DecisionTreeClassifier,0.911057,{'splitter': 'random'}
1,RandomForestClassifier,0.950391,{'n_estimators': 30}
2,GaussianNB,0.878082,{'var_smoothing': 1e-09}


In [20]:
forest = RandomForestClassifier(n_estimators=30)
forest.fit(x_train, y_train)

RandomForestClassifier(n_estimators=30)

In [21]:
predict = forest.predict([[67.0,0,1,228.69,36.6,1,1]])
predict

array([1], dtype=int64)

In [22]:
def get_diagnose(age,gender,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status):
    prediction = forest.predict([[age,gender,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status]])
    if predict.item() == 0:
        return f"The patient probably won't have a stroke"
    elif predict.item() == 1:
        return f"The patient probably will have a stroke"

In [23]:
age = 25
gender = 1
hypertension = 0
heart_disease = 0
avg_glucose_level = 228.69
bmi = 26.6
smoking_status = 1
get_diagnose(age,gender,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status)

'The patient probably will have a stroke'