In [1]:
!wget https://raw.githubusercontent.com/JoseCaliz/dotfiles/main/css/gruvbox.css 2>/dev/null 1>&2
!pip install feature_engine 2>/dev/null 1>&2
    
from IPython.core.display import HTML
with open('./gruvbox.css', 'r') as file:
    custom_css = file.read()

HTML(custom_css)

## According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
#### We will be predicting whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.

<div style = 'border: 2.5px solid #D9C10B;'>

## Attribute Information

* `id`: unique identifier

* `gender`: "Male", "Female" or "Other"

* `age`: age of the patient

* `hypertension`: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

* `heart_disease`: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

* `ever_married`: "No" or "Yes"

* `work_type`: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

* `Residence_type`: "Rural" or "Urban"

* `avg_glucose_level`: average glucose level in blood

* `bmi`: body mass index

* `smoking_status`: "formerly smoked", "never smoked", "smokes" or "Unknown"

* `stroke`: 1 if the patient had a stroke or 0 if not



<div style = 'border: 3px solid #D9C10B;'>

## Importing the required libraries

In [2]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense,Dropout, BatchNormalization
from keras import regularizers
import numpy as np 
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

## Reading the original dataset (as dataframe) available on this link: 
https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset . 
#### Update : Didn't use it for training as it wasn't improving performance for the architecture I'm using

In [3]:
# Load the original stroke dataset from the Kaggle input directory and
# print its column dtypes and non-null counts.
original_df = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


#### The `bmi` column has null values, so they are imputed with the mean of the column

In [4]:
# Mean-value imputer for the missing `bmi` entries.
imputer = SimpleImputer(strategy='mean')

# Learn the column mean and fill the gaps in a single step, then write the
# completed column back into the frame.
bmi_frame = original_df[['bmi']]
original_df['bmi'] = imputer.fit_transform(bmi_frame)

In [5]:
# Re-inspect the frame to confirm `bmi` has no missing values after imputation.
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


### The dataset provided for the competition is imported in `train_df` and `test_df` dataframes

In [6]:
# Load the competition training split.
train_df = pd.read_csv('/kaggle/input/playground-series-s3e2/train.csv')
# Disabled experiment: append the original dataset, shuffle the rows and
# rebuild the index. Left commented out, so only competition data is used.
# train_df = pd.concat([train_df,original_df])
# train_df = train_df.sample(frac=1)
# train_df = train_df.reset_index(drop=True)
# Target labels, kept as a separate Series for model.fit.
y_train = train_df['stroke']
train_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.00,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15299,15299,Female,22.0,0,0,No,Govt_job,Urban,72.63,19.5,never smoked,0
15300,15300,Female,46.0,1,0,Yes,Private,Urban,101.19,32.1,never smoked,0
15301,15301,Female,75.0,0,0,Yes,Self-employed,Urban,87.69,26.2,never smoked,0
15302,15302,Male,46.0,0,0,Yes,Private,Rural,101.13,22.5,Unknown,0


In [7]:
# Load the competition test split (no `stroke` column) and preview the first rows.
test_df = pd.read_csv('/kaggle/input/playground-series-s3e2/test.csv')
test_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,15304,Female,57.0,0,0,Yes,Private,Rural,82.54,33.4,Unknown
1,15305,Male,70.0,1,0,Yes,Private,Urban,72.06,28.5,Unknown
2,15306,Female,5.0,0,0,No,children,Urban,103.72,19.5,Unknown
3,15307,Female,56.0,0,0,Yes,Govt_job,Urban,69.24,41.4,smokes
4,15308,Male,32.0,0,0,Yes,Private,Rural,111.15,30.1,smokes


# Checking the unique values in each of the categorical columns

In [8]:
# Distinct categories present in the training `gender` column.
train_df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [9]:
# Distinct categories present in the training `ever_married` column.
train_df['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [10]:
# Distinct categories present in the training `work_type` column.
train_df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [11]:
# Distinct categories present in the training `Residence_type` column.
train_df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [12]:
# Distinct categories present in the training `smoking_status` column.
train_df['smoking_status'].unique()

array(['never smoked', 'formerly smoked', 'Unknown', 'smokes'],
      dtype=object)

In [13]:
# Keep the test ids aside; they are joined with the predictions when the
# submission frame is built.
test_df_id = test_df['id']

In [14]:
# Column dtypes and non-null counts of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15304 entries, 0 to 15303
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 15304 non-null  int64  
 1   gender             15304 non-null  object 
 2   age                15304 non-null  float64
 3   hypertension       15304 non-null  int64  
 4   heart_disease      15304 non-null  int64  
 5   ever_married       15304 non-null  object 
 6   work_type          15304 non-null  object 
 7   Residence_type     15304 non-null  object 
 8   avg_glucose_level  15304 non-null  float64
 9   bmi                15304 non-null  float64
 10  smoking_status     15304 non-null  object 
 11  stroke             15304 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 1.4+ MB


In [15]:
# Summary statistics of the numeric training columns.
train_df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,15304.0,15304.0,15304.0,15304.0,15304.0,15304.0,15304.0
mean,7651.5,41.417708,0.049726,0.023327,89.039853,28.112721,0.041296
std,4418.028595,21.444673,0.217384,0.150946,25.476102,6.722315,0.198981
min,0.0,0.08,0.0,0.0,55.22,10.3,0.0
25%,3825.75,26.0,0.0,0.0,74.9,23.5,0.0
50%,7651.5,43.0,0.0,0.0,85.12,27.6,0.0
75%,11477.25,57.0,0.0,0.0,96.98,32.0,0.0
max,15303.0,82.0,1.0,1.0,267.6,80.1,1.0


# Storing each of the categorical columns for One-Hot encoding

In [16]:
# One single-column frame per categorical feature, taken from the train and
# test splits side by side (double brackets keep each slice a DataFrame).
cat_df_1, cat_test_df_1 = train_df[['gender']], test_df[['gender']]
cat_df_2, cat_test_df_2 = train_df[['work_type']], test_df[['work_type']]
cat_df_3, cat_test_df_3 = train_df[['Residence_type']], test_df[['Residence_type']]
cat_df_4, cat_test_df_4 = train_df[['smoking_status']], test_df[['smoking_status']]
cat_df_5, cat_test_df_5 = train_df[['ever_married']], test_df[['ever_married']]

cat_df_1.head()

Unnamed: 0,gender
0,Male
1,Male
2,Female
3,Male
4,Female


# Applying One-Hot encoding on each of the categorical columns

In [17]:
def _encode_pair(encoder, train_col, test_col):
    """Fit `encoder` on the training column and one-hot encode both splits.

    Returns a (train, test) pair of dense DataFrames whose columns are the
    feature names reported by the fitted encoder.
    """
    train_encoded = pd.DataFrame(
        encoder.fit_transform(train_col).toarray(),
        columns=encoder.get_feature_names_out(train_col.columns),
    )
    test_encoded = pd.DataFrame(
        encoder.transform(test_col).toarray(),
        columns=encoder.get_feature_names_out(test_col.columns),
    )
    return train_encoded, test_encoded


# One independent encoder per categorical column.
encoder1, encoder2, encoder3, encoder4, encoder5 = (OneHotEncoder() for _ in range(5))

dumm_df_1, dumm_test_df_1 = _encode_pair(encoder1, cat_df_1, cat_test_df_1)
dumm_df_2, dumm_test_df_2 = _encode_pair(encoder2, cat_df_2, cat_test_df_2)
dumm_df_3, dumm_test_df_3 = _encode_pair(encoder3, cat_df_3, cat_test_df_3)
dumm_df_4, dumm_test_df_4 = _encode_pair(encoder4, cat_df_4, cat_test_df_4)
dumm_df_5, dumm_test_df_5 = _encode_pair(encoder5, cat_df_5, cat_test_df_5)

# NOTE(review): the `ever_married` encoding (dumm_df_5 / dumm_test_df_5) is
# computed but not part of the concatenation below, so that feature never
# reaches the model. Confirm whether this omission is intentional.
train_parts = [dumm_df_1, dumm_df_2, dumm_df_3, dumm_df_4]
test_parts = [dumm_test_df_1, dumm_test_df_2, dumm_test_df_3, dumm_test_df_4]
concat_dumm = pd.concat(train_parts, axis=1)
concat_dumm_test = pd.concat(test_parts, axis=1)
concat_dumm

Unnamed: 0,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15299,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
15300,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
15301,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
15302,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [18]:
# Display the one-hot encoded test-set frame built in the previous cell.
concat_dumm_test

Unnamed: 0,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10199,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
10200,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
10201,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
10202,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


# Storing the numerical columns for scaling the respective values

In [19]:
# Single-column frames for each numeric feature, train and test side by side.
num_df_1, num_test_df_1 = train_df[['age']], test_df[['age']]
print(type(num_df_1))
num_df_2, num_test_df_2 = train_df[['avg_glucose_level']], test_df[['avg_glucose_level']]
num_df_3, num_test_df_3 = train_df[['bmi']], test_df[['bmi']]

num_df_1.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age
0,28.0
1,33.0
2,42.0
3,56.0
4,24.0


In [20]:
def _scale_pair(scaler, train_col, test_col):
    """Fit `scaler` on the training column, then transform both splits.

    Returns a (train, test) pair of DataFrames that keep the original
    column name.
    """
    train_scaled = pd.DataFrame(scaler.fit_transform(train_col), columns=train_col.columns)
    test_scaled = pd.DataFrame(scaler.transform(test_col), columns=test_col.columns)
    return train_scaled, test_scaled


# One independent scaler per numeric column.
scaler1, scaler2, scaler3 = RobustScaler(), RobustScaler(), RobustScaler()

scaled_df_1, scaled_test_df_1 = _scale_pair(scaler1, num_df_1, num_test_df_1)
scaled_df_2, scaled_test_df_2 = _scale_pair(scaler2, num_df_2, num_test_df_2)
scaled_df_3, scaled_test_df_3 = _scale_pair(scaler3, num_df_3, num_test_df_3)

concat_scaled_df = pd.concat([scaled_df_1, scaled_df_2, scaled_df_3], axis=1)
concat_test_scaled_df = pd.concat(
    [scaled_test_df_1, scaled_test_df_2, scaled_test_df_3], axis=1
)
concat_test_scaled_df


Unnamed: 0,age,avg_glucose_level,bmi
0,0.451613,-0.116848,0.682353
1,0.870968,-0.591486,0.105882
2,-1.225806,0.842391,-0.952941
3,0.419355,-0.719203,1.623529
4,-0.354839,1.178895,0.294118
...,...,...,...
10199,-0.516129,-0.423460,-1.176471
10200,0.193548,0.805707,-0.105882
10201,-1.290323,0.856884,-1.094118
10202,-0.387097,-0.122736,0.129412


In [21]:
# The two 0/1 indicator columns are used as-is, without encoding or scaling.
passthrough_cols = ['hypertension', 'heart_disease']
rem_two_cols = train_df[passthrough_cols]
rem_two_cols_test = test_df[passthrough_cols]
rem_two_cols_test.head()

Unnamed: 0,hypertension,heart_disease
0,0,0
1,1,0
2,0,0
3,0,0
4,0,0


# Joining the scaled, one-hot encoded and remaining columns to be used for training

In [22]:
# Column-wise join of the encoded, scaled and passthrough training features.
# The three frames all carry the default 0..n-1 index, so rows line up by position.
X_train = pd.concat([concat_dumm,concat_scaled_df,rem_two_cols],axis =1)
X_train

Unnamed: 0,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,age,avg_glucose_level,bmi,hypertension,heart_disease
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.483871,-0.253170,0.411765,0,0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.322581,-0.302536,-0.435294,0,0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.032258,0.809783,1.494118,0,0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.419355,-0.917120,0.141176,0,0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.612903,-0.532609,0.141176,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15299,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.677419,-0.565670,-0.952941,0,0
15300,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.096774,0.727808,0.529412,1,0
15301,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.032258,0.116395,-0.164706,0,0
15302,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.096774,0.725091,-0.600000,0,0


# Joining the scaled, one-hot encoded and remaining columns to be used for testing

In [23]:
# Column-wise join of the encoded, scaled and passthrough test features,
# in the same column order as X_train.
X_test = pd.concat([concat_dumm_test,concat_test_scaled_df,rem_two_cols_test],axis =1)
X_test

Unnamed: 0,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,age,avg_glucose_level,bmi,hypertension,heart_disease
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.451613,-0.116848,0.682353,0,0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.870968,-0.591486,0.105882,1,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,-1.225806,0.842391,-0.952941,0,0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.419355,-0.719203,1.623529,0,0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.354839,1.178895,0.294118,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10199,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.516129,-0.423460,-1.176471,0,0
10200,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.193548,0.805707,-0.105882,0,0
10201,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.290323,0.856884,-1.094118,0,0
10202,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.387097,-0.122736,0.129412,0,0


# Defining the Model Architecture and appropriate loss function and metrics

In [24]:

# Fully-connected binary classifier: a stack of Dense layers with dropout and
# batch normalisation, ending in a single sigmoid unit.
model = Sequential()

# Add layers to the model
# First hidden layer (8116 units). The input width is read from X_train so the
# network stays in sync with the feature matrix instead of a hard-coded count.
model.add(Dense(8116, input_dim=X_train.shape[1], activation='selu'))
model.add(Dropout(0.5))
model.add(Dense(2048,activation= 'relu'))
model.add(Dropout(0.5))
model.add(Dense(2048,activation= 'relu'))
model.add(Dropout(0.5))
model.add(Dense(1024,activation= 'selu'))
model.add(BatchNormalization())
model.add(Dense(512,activation = 'relu'))
model.add(Dropout(0.4))
model.add(Dense(512,activation = 'relu'))
model.add(Dropout(0.4))
model.add(BatchNormalization())
model.add(Dense(256,activation = 'selu'))
model.add(Dropout(0.2))
model.add(Dense(256,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation = 'selu'))
model.add(Dense(8,activation= 'relu'))
model.add(Dense(1, activation='sigmoid')) #output layer with 1 neuron 

# The model has a single output, so exactly one loss is supplied; a list of
# losses is meant to hold one entry per output.
# The AUC metric is named explicitly so its log keys are always 'auc' and
# 'val_auc', even if this cell is executed more than once in the same kernel.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(4e-5),
    metrics=[tf.keras.metrics.AUC(num_thresholds=700000, curve='ROC', name='auc')],
)
model.summary()

2023-01-15 09:33:54.864417: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-15 09:33:54.961731: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-15 09:33:54.962611: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-15 09:33:54.964879: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 8116)              162320    
_________________________________________________________________
dropout (Dropout)            (None, 8116)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 2048)              16623616  
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 2048)              4196352   
_________________________________________________________________
dropout_2 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              2

In [25]:
class myCallback(tf.keras.callbacks.Callback):
  """Stop training once both training and validation AUC reach 0.90."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's AUC values and request a stop when both are >= 0.90.

    Args:
      epoch: index of the epoch that just finished (unused).
      logs: metric dict for the epoch; expected to hold 'auc' and 'val_auc'.
    """
    # Avoid a shared mutable default and tolerate a missing logs dict.
    logs = logs or {}
    train_auc = logs.get('auc')
    val_auc = logs.get('val_auc')

    # Either metric can be absent (e.g. no validation data); comparing None
    # with a float would raise, so skip the check for this epoch instead.
    if train_auc is None or val_auc is None:
      return

    if train_auc >= 0.90 and val_auc >= 0.90:
      # Stop if threshold is met
      print("\nAUC and val_AUC reached 0.90 so cancelling training!")
      self.model.stop_training = True

# Instantiate class
callbacks = myCallback()

# Training the model with validation_split = 0.25 

In [26]:
# Train with 25% of the rows held out for validation. The early-stopping
# callback defined above is left disabled, so all 200 epochs are run.
model.fit(
    X_train,
    y_train,
    validation_split=0.25,
    batch_size=256,
    epochs=200,
)  # ,callbacks=[callbacks])



2023-01-15 09:33:58.547515: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7f8c2be9f8d0>

# Getting the feature importances

In [27]:
# Kernel matrix of the first Dense layer (the bias, at index 1, is ignored).
weights = model.layers[0].get_weights()[0]

# Absolute weights, rescaled so that every column sums to 1.
importances = np.abs(weights)
importances = importances / importances.sum(axis=0)

# For each row (one per input feature), report the median normalised weight.
for i in range(importances.shape[0]):
    print("Feature", i, "Importance", np.median(importances[i]))


Feature 0 Importance 0.04994481
Feature 1 Importance 0.049996264
Feature 2 Importance 0.051287215
Feature 3 Importance 0.052700005
Feature 4 Importance 0.0516539
Feature 5 Importance 0.05058777
Feature 6 Importance 0.052276473
Feature 7 Importance 0.052200876
Feature 8 Importance 0.05112683
Feature 9 Importance 0.04957308
Feature 10 Importance 0.052007943
Feature 11 Importance 0.050893314
Feature 12 Importance 0.050806034
Feature 13 Importance 0.053029668
Feature 14 Importance 0.059475914
Feature 15 Importance 0.04549481
Feature 16 Importance 0.05150235
Feature 17 Importance 0.05490274
Feature 18 Importance 0.057150766


# Prediction on X_test

In [28]:
# Sigmoid outputs of the trained model for every test row.
y_pred = model.predict(X_test)
print(y_pred)

[[3.2281987e-02]
 [1.2069977e-01]
 [8.7947410e-06]
 ...
 [1.9675424e-06]
 [1.0418543e-06]
 [2.7811784e-06]]


# Conversion of DataFrame in required format for submission to the competition

In [29]:
# Wrap the raw predictions in a one-column frame named after the target.
y_pred = pd.DataFrame(y_pred, columns=['stroke'])

# Pair every test id with its prediction and normalise the index.
submissions_df = pd.concat([test_df_id, y_pred], axis=1).reset_index(drop=True)

submissions_df.to_csv('submission549.csv', index=False)
submissions_df.head()

Unnamed: 0,id,stroke
0,15304,0.032282
1,15305,0.1207
2,15306,9e-06
3,15307,0.038178
4,15308,0.004369


### Please upvote if you find it useful. Thanks