<a href="https://colab.research.google.com/github/katusabevictoria/katusabevictoria/blob/main/Classification_Metrics_(Core)_Victoria_Katusabe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score
import missingno
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
# Set pandas as the default output for sklearn
from sklearn import set_config
set_config(transform_output='pandas')


Stroke_dataset ='/content/drive/MyDrive/Colab data uploads/stroke.csv'
df = pd.read_csv(Stroke_dataset)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1192,Female,31,0,0,No,Govt_job,Rural,70.66,27.2,never smoked,0
1,77,Female,13,0,0,No,children,Rural,85.81,18.6,Unknown,0
2,59200,Male,18,0,0,No,Private,Urban,60.56,33.0,never smoked,0
3,24905,Female,65,0,0,Yes,Private,Urban,205.77,46.0,formerly smoked,1
4,24257,Male,4,0,0,No,children,Rural,90.42,16.2,Unknown,0


In [12]:
df.drop(columns=['id'])

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Female,31,0,0,No,Govt_job,Rural,70.66,27.2,never smoked,0
1,Female,13,0,0,No,children,Rural,85.81,18.6,Unknown,0
2,Male,18,0,0,No,Private,Urban,60.56,33.0,never smoked,0
3,Female,65,0,0,Yes,Private,Urban,205.77,46.0,formerly smoked,1
4,Male,4,0,0,No,children,Rural,90.42,16.2,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
1132,Female,1,0,0,No,children,Rural,110.17,20.3,Unknown,0
1133,Male,6,0,0,No,children,Rural,92.98,18.9,Unknown,0
1134,Male,81,0,0,Yes,Private,Urban,234.35,25.3,formerly smoked,0
1135,Female,81,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1


In [13]:
# by using normalize = True with value_counts, our output is the percentage of each class (written as a decimal)
df['stroke'].value_counts(normalize = True)

stroke
0    0.879507
1    0.120493
Name: proportion, dtype: float64

In [14]:
# Define X and y
target = 'stroke'
X = df.drop(columns=target)
y = df[target]


In [15]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
72,23427,Female,81,0,0,Yes,Private,Rural,91.82,36.9,Unknown
1091,68171,Male,61,0,0,Yes,Self-employed,Urban,116.78,39.8,formerly smoked
381,50536,Female,62,0,1,Yes,Govt_job,Urban,124.37,28.3,never smoked
760,35999,Female,52,0,0,Yes,Private,Urban,86.85,23.8,formerly smoked
433,47427,Male,49,0,0,Yes,Self-employed,Urban,70.73,27.3,formerly smoked
...,...,...,...,...,...,...,...,...,...,...,...
1044,71143,Male,65,0,0,Yes,Self-employed,Urban,179.67,30.7,formerly smoked
1095,39769,Female,59,0,0,Yes,Self-employed,Urban,82.14,35.6,smokes
1130,70447,Male,50,0,0,Yes,Private,Rural,122.48,35.9,smokes
860,63864,Male,62,0,0,Yes,Private,Rural,107.61,31.3,Unknown


**Prepare a column transformer for preprocessing​**

In [16]:
#list of categorical columns
cat_cols = X_train.select_dtypes('object').columns
# Constructing categorical preprocessing objects
cat_imputer = SimpleImputer(strategy='constant', fill_value='MISSING')
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipe = make_pipeline(cat_imputer,ohe_encoder)
cat_tuple = ('cat',cat_pipe, cat_cols)
cat_tuple

('cat',
 Pipeline(steps=[('simpleimputer',
                  SimpleImputer(fill_value='MISSING', strategy='constant')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
 Index(['gender', 'age', 'ever_married', 'work_type', 'Residence_type',
        'smoking_status'],
       dtype='object'))

In [17]:
# Save list of numeric columns
num_cols = X_train.select_dtypes('number').columns
# Constructing numeric preprocesssing objects
num_imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
num_pipe = make_pipeline(num_imputer, scaler)
num_tuple = ('num',num_pipe, num_cols)
num_tuple


('num',
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                 ('standardscaler', StandardScaler())]),
 Index(['id', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi'], dtype='object'))

In [18]:
# Create the preprocessing ColumnTransformer
# Define a column transformer
preprocessor  = ColumnTransformer([num_tuple, cat_tuple],
                                  verbose_feature_names_out=False)



In [19]:
X_train

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
72,23427,Female,81,0,0,Yes,Private,Rural,91.82,36.9,Unknown
1091,68171,Male,61,0,0,Yes,Self-employed,Urban,116.78,39.8,formerly smoked
381,50536,Female,62,0,1,Yes,Govt_job,Urban,124.37,28.3,never smoked
760,35999,Female,52,0,0,Yes,Private,Urban,86.85,23.8,formerly smoked
433,47427,Male,49,0,0,Yes,Self-employed,Urban,70.73,27.3,formerly smoked
...,...,...,...,...,...,...,...,...,...,...,...
1044,71143,Male,65,0,0,Yes,Self-employed,Urban,179.67,30.7,formerly smoked
1095,39769,Female,59,0,0,Yes,Self-employed,Urban,82.14,35.6,smokes
1130,70447,Male,50,0,0,Yes,Private,Rural,122.48,35.9,smokes
860,63864,Male,62,0,0,Yes,Private,Rural,107.61,31.3,Unknown
