# Table of Contents #

*   **Installing important Libraries**
* **Inspecting the data set for minor corrections (Data Cleaning)**
* **EDA**
* **Exploratory data analysis**
* **Feature Engineering**
* **Modelling**



**Installing important libraries**

In [None]:
!pip install pandas --quiet
!pip install numpy --quiet
!pip install -U scikit-learn --quiet
!pip install xgboost --quiet
!pip install matplotlib --quiet
!pip install plotly --quiet
!pip install seaborn --quiet
!pip install opendatasets --quiet

In [None]:
import os
import opendatasets as od
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.subplots as make_subplots
import plotly.graph_objects as go
import warnings


In [None]:
# Load the stroke dataset CSV into a DataFrame; the path is relative to the
# notebook's working directory.
health_care_df=pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

**Inspecting the data set for minor corrections (Data Cleaning)**

In [None]:
health_care_df

**Converting age to Integer data type**

In [None]:
# Cast the age column to 64-bit integers, overwriting it in place.
# NOTE(review): astype truncates any fractional part, so fractional ages (if the
# raw data has them) lose precision — confirm this is intended.
health_care_df['age']=health_care_df['age'].astype(np.int64)

In [None]:
health_care_df.info()

In [None]:
health_care_df


In [None]:
# Per-category row counts for the categorical columns plotted below.
evermarried, worktype, residence_type, smoking_status = (
    health_care_df[column_name].value_counts()
    for column_name in ('ever_married', 'work_type', 'Residence_type', 'smoking_status')
)

# **Exploratory data analysis**

**General distribution of categorical data**

In [None]:
# Pie chart of marital status.
# Slice names are taken from the value_counts index so each label stays tied
# to its own count instead of relying on a hard-coded list matching the sort
# order. `labels=` is dropped: in plotly express it is a column-rename dict,
# not the slice labels.
# Assumes the raw categories are 'Yes'/'No' — unmapped values keep their raw label.
married_counts = evermarried.rename(index={'Yes': 'Married', 'No': 'Single'})
px.pie(names=married_counts.index.tolist(), values=married_counts.tolist(),
       title='Married vs single')

In [None]:
# Pie chart of work type.
# Names and values both come from the same value_counts Series, so a label can
# never be paired with the wrong count (the previous hard-coded list depended
# on the categories happening to sort in that exact order).
px.pie(names=worktype.index.tolist(), values=worktype.tolist(),
       title='Distribution of work type')

In [None]:

# Pie chart of residence type; slice names are read from the value_counts
# index rather than a hard-coded list, so labels always match their counts.
px.pie(names=residence_type.index.tolist(), values=residence_type.tolist(),
       title='Distribution of Residence type')

In [None]:
# Pie chart of smoking status; slice names are read from the value_counts
# index rather than a hard-coded list, so labels always match their counts.
px.pie(names=smoking_status.index.tolist(), values=smoking_status.tolist(),
       title='Distribution of smoking status')

In [None]:
# Pie chart of the target variable.
# The 0/1 codes are mapped to readable names via the index, so the labels
# follow the data instead of assuming that 0 is always the most frequent class.
stroke_counts = health_care_df.stroke.value_counts().rename(index={0: 'NO', 1: 'Yes'})
px.pie(names=stroke_counts.index.tolist(), values=stroke_counts.tolist(),
       title=' Distribution of strokes')

In [None]:
# Age histogram split by stroke outcome.
# Passing the DataFrame with column names (long form) makes plotly label the
# x-axis "age"; a bare Series is treated as wide-form data and labelled "value".
px.histogram(health_care_df, x='age', color='stroke', nbins=39,
             title='Distribution of Age vs stroke')

In [None]:
# Correlation heatmap of the numeric columns.
# Only numeric columns are kept before calling corr(): on pandas >= 2.0,
# DataFrame.corr() raises when the frame still contains string columns.
plt.figure(figsize=(15,6))
cormat = health_care_df.select_dtypes(include='number').corr()
sns.heatmap(cormat, annot=True, cmap="twilight");

In [None]:
# Average glucose level histogram split by stroke outcome.
# Long-form call (DataFrame + column names) so the x-axis is labelled with the
# column name rather than the generic wide-form "value".
px.histogram(health_care_df, x='avg_glucose_level', color='stroke',
             title='Distribution of avg-glucose level with respect to stroke')

**Relationship between age and stroke**

# Count of patients at each age, split by stroke outcome.
# seaborn >= 0.12 treats the first positional argument as `data`, so the
# column to count is passed explicitly through data=/x=/hue=.
sns.set_style('darkgrid')
plt.figure(figsize=(20,6))
plt.title('Age with respect to stroke')
sns.countplot(data=health_care_df, x='age', hue='stroke');
plt.xticks(rotation=90);

# Feature Engineering

In [None]:
# Feature columns: every column except the first and the last
# (the last column is taken as the target further down).
input_cols=health_care_df.columns[1:-1]

In [None]:
input_cols

In [None]:
# Name of the last column, used as the prediction target.
# Note this is a single label, not a list of columns.
target_cols=health_care_df.columns[-1]

In [None]:
target_cols

In [None]:
# Feature frame used for all preprocessing below.
# .copy() makes it independent of health_care_df: later cells assign into and
# drop columns from input_df, which on a plain slice triggers
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
input_df = health_care_df[input_cols].copy()
input_df

In [None]:
# Names of the feature columns stored as 64-bit integers or floats.
numeric_dtypes = ['int64', 'float64']
numeric_cols = input_df.select_dtypes(include=numeric_dtypes).columns
numeric_cols

In [None]:
# Names of the feature columns stored with the generic object dtype.
categorical_dtypes = ['object']
categorical_cols = input_df.select_dtypes(include=categorical_dtypes).columns
categorical_cols

In [None]:
# Number of missing values per column, largest first.
missing_per_column = health_care_df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
from sklearn.impute import SimpleImputer


In [None]:
# Mean-impute missing values in the numeric feature columns.
# NOTE(review): the imputer is fitted on all rows, before the train/validation
# split made later in the notebook, so validation rows contribute to the means.
imputer=SimpleImputer(strategy='mean')
imputer.fit(input_df[numeric_cols])


In [None]:
# Overwrite the numeric columns with the values returned by the fitted imputer.
input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])

In [None]:
input_df.isna().sum()

In [None]:
from sklearn.preprocessing import OneHotEncoder


In [None]:
# One-hot encoder for the categorical columns, returning a dense array.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new keyword
# first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder = encoder.fit(input_df[categorical_cols])

In [None]:
# Names of the generated one-hot columns, then add those columns to input_df.
# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out` and only fall back to the old name when the
# replacement is missing (scikit-learn < 1.0).
feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_cols = list(feature_namer(categorical_cols))
input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
Scaler= MinMaxScaler()

In [None]:
# Fit the min-max scaler on the numeric feature columns.
# NOTE(review): this uses all rows, before the train/validation split made
# later in the notebook.
Scaler.fit(input_df[numeric_cols])

In [None]:
# Overwrite the numeric columns with their scaled values from the fitted scaler.
input_df[numeric_cols]=Scaler.transform(input_df[numeric_cols])

In [None]:
input_df.isna().sum().sort_values(ascending=False)


In [None]:
# Remove the raw categorical columns now that their one-hot versions exist.
# Rebinding instead of inplace=True, plus errors='ignore', keeps the cell safe
# to re-run: a second execution no longer raises KeyError for columns that
# were already dropped.
input_df = input_df.drop(columns=categorical_cols, errors='ignore')


In [None]:
input_df

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for validation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): no `stratify=` is passed, so class proportions may differ
# between the two splits — confirm whether that matters for this target.
train_inputs, val_inputs, train_targets, val_targets= train_test_split(input_df,health_care_df[target_cols],
                                                                     test_size=0.25,random_state=42)

In [None]:
train_inputs

In [None]:
train_targets

# Modeling 
**Trying out different algorithms to minimise the loss value and increase the accuracy score**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

* **Decision Tree**

In [None]:
# Baseline decision tree: only random_state is set (for a reproducible fit),
# every other hyper-parameter is left at its default.
model=DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the baseline tree on the training split.
model.fit(train_inputs,train_targets)

In [None]:
# Predictions of the baseline tree on the rows it was trained on.
train_predictions=model.predict(train_inputs)

In [None]:
# Root-mean-squared error of the training predictions.
# The square root is taken explicitly because the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6. Arguments follow the
# documented (y_true, y_pred) order.
train_rmse_error = np.sqrt(mean_squared_error(train_targets, train_predictions))

In [None]:
# Report the training RMSE computed above.
print(f'The RMSE error for train preds {train_rmse_error}')

In [None]:
# Predictions of the baseline tree on the held-out validation split.
val_pred=model.predict(val_inputs)

In [None]:
# Root-mean-squared error of the validation predictions.
# Previously this stored the plain MSE (no square root) under an "rmse" name,
# so it was not comparable with the training RMSE. Arguments follow the
# documented (y_true, y_pred) order.
val_rmse_error = np.sqrt(mean_squared_error(val_targets, val_pred))

In [None]:
# Report the validation error value computed above.
print(f' The RMSE error for val preds{val_rmse_error}')

In [None]:
# Second decision tree with a depth cap and a limit on the number of leaves.
model1 = DecisionTreeClassifier(random_state=42, max_depth=50, max_leaf_nodes=140)
model1.fit(train_inputs, train_targets)

# Predict with model1 itself. The earlier version called `model.predict`, so
# every "model1" metric was really a repeat of the baseline tree's result.
train1_predictions = model1.predict(train_inputs)
val1_pred = model1.predict(val_inputs)

# Take the square root so the reported value really is an RMSE
# (mean_squared_error alone returns the MSE).
val1_rmse_error = np.sqrt(mean_squared_error(val_targets, val1_pred))
print(' The RMSE error for val1 preds{}'.format(val1_rmse_error))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Validation accuracy of the baseline decision tree.
accuracy_score(y_true=val_targets, y_pred=val_pred)

In [None]:
# Validation accuracy for the predictions stored in val1_pred.
accuracy_score(y_true=val_targets, y_pred=val1_pred)

* **RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
%%time
# Random forest of 20 trees; each split considers sqrt(n_features) candidate
# features, n_jobs=-1 uses all available cores, random_state fixes the seed.
RF1_model=RandomForestClassifier(n_jobs=-1,random_state=42,n_estimators=20,max_features='sqrt')

In [None]:
%%time
# Fit the random forest on the training split (the cell is timed by %%time).
RF1_model.fit(train_inputs,train_targets)

In [None]:
# Random-forest predictions on the training rows and on the validation rows.
RF1_train_preds=RF1_model.predict(train_inputs)
RF1_preds=RF1_model.predict(val_inputs)

In [None]:
# Validation accuracy of the random forest.
rf1_accuracy = accuracy_score(y_true=val_targets, y_pred=RF1_preds)
print(f'The accuracy score :{rf1_accuracy}')

In [None]:

# Validation accuracy and RMSE of the random forest.
# The second line is labelled "rmse", so the square root of the MSE is taken;
# previously it printed the plain MSE. Arguments follow the documented
# (y_true, y_pred) order.
print('The accuracy score :{}'.format(accuracy_score(val_targets, RF1_preds)))
print('The rmse loss is {}'.format(np.sqrt(mean_squared_error(val_targets, RF1_preds))))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the random forest on the validation split.
# classification_report expects (y_true, y_pred); with the arguments swapped,
# the precision and recall columns are exchanged for every class.
print(classification_report(val_targets, RF1_preds))