In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

### Load dataframe, encode categorical features, and separate it into train and test

In [2]:
# Read the participants table; the first CSV column is used as the row index.
all_df = pd.read_csv(filepath_or_buffer="participants_dataset.csv", index_col=0)
# Preview the first five rows.
all_df.head(n=5)

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
55407,Female,47.0,0,0,Yes,Private,Urban,93.18,42.6,formerly smoked,0.0
56075,Female,58.0,0,0,Yes,Private,Rural,196.5,37.7,never smoked,0.0
22939,Female,22.0,0,0,No,Private,Rural,80.72,29.3,Unknown,0.0
62681,Female,38.0,1,0,Yes,Private,Urban,137.94,41.8,never smoked,0.0
57539,Female,68.0,0,0,Yes,Private,Rural,233.59,43.9,never smoked,0.0


In [3]:
# Frequency of each raw `gender` category.
all_df.gender.value_counts()

Female    3627
Male      2627
Other        1
Name: gender, dtype: int64

In [4]:
# Replace each gender string with an integer code, one category at a time.
# 'Other' is assigned the same code (1) as 'Female'.
gender_codes = {'Male': 0, 'Female': 1, 'Other': 1}
for category, code in gender_codes.items():
    all_df.loc[all_df.gender == category, 'gender'] = code

In [5]:
# Frequency of each raw `ever_married` category.
all_df.ever_married.value_counts()

Yes    4414
No     1841
Name: ever_married, dtype: int64

In [6]:
# Replace each marital-status string with an integer code.
# Note the direction: 'Yes' becomes 0 and 'No' becomes 1.
ever_married_codes = {'Yes': 0, 'No': 1}
for category, code in ever_married_codes.items():
    all_df.loc[all_df.ever_married == category, 'ever_married'] = code

In [7]:
# Frequency of each raw `work_type` category.
all_df.work_type.value_counts()

Private          3561
Self-employed    1232
Govt_job          753
children          687
Never_worked       22
Name: work_type, dtype: int64

In [8]:
# Replace each work-type string with an integer code, one category at a time.
work_type_codes = {
    'Private': 0,
    'Self-employed': 1,
    'Govt_job': 2,
    'children': 3,
    'Never_worked': 4,
}
for category, code in work_type_codes.items():
    all_df.loc[all_df.work_type == category, 'work_type'] = code

In [9]:
# Frequency of each raw `Residence_type` category.
all_df.Residence_type.value_counts()

Urban    3163
Rural    3092
Name: Residence_type, dtype: int64

In [10]:
# Replace each residence-type string with an integer code.
residence_codes = {'Urban': 0, 'Rural': 1}
for category, code in residence_codes.items():
    all_df.loc[all_df.Residence_type == category, 'Residence_type'] = code

In [11]:
# Frequency of each raw `smoking_status` category.
all_df.smoking_status.value_counts()

never smoked       2358
Unknown            1735
formerly smoked    1199
smokes              963
Name: smoking_status, dtype: int64

In [12]:
# Replace each smoking-status string with an integer code, one category at a
# time. 'Unknown' is kept as its own category (code 1).
smoking_codes = {
    'never smoked': 0,
    'Unknown': 1,
    'formerly smoked': 2,
    'smokes': 3,
}
for category, code in smoking_codes.items():
    all_df.loc[all_df.smoking_status == category, 'smoking_status'] = code

In [13]:
# Preview the first five rows again to check the recoded categorical columns.
all_df.head(n=5)

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
55407,1,47.0,0,0,0,0,0,93.18,42.6,2,0.0
56075,1,58.0,0,0,0,0,1,196.5,37.7,0,0.0
22939,1,22.0,0,0,1,0,1,80.72,29.3,1,0.0
62681,1,38.0,1,0,0,0,0,137.94,41.8,0,0.0
57539,1,68.0,0,0,0,0,1,233.59,43.9,0,0.0


In [14]:
# Split on label availability: rows with a known label form the training set,
# rows whose label is missing (NaN) are the ones to be predicted.
# The mask is computed once and negated, instead of comparing `np.isnan(...)`
# to False and evaluating it twice.
has_label = all_df["label"].notna()
train_df = all_df[has_label]
test_df = all_df[~has_label]

In [15]:
# Report the (rows, columns) shape of both splits.
shape_summary = f"train df has shape {train_df.shape} and test df has shape {test_df.shape}"
print(shape_summary)

train df has shape (5004, 11) and test df has shape (1251, 11)


### Define and train model on train_df

In [16]:
# Features are every column except the last one; the label is taken by name.
train_features = train_df.iloc[:, :-1]

# Fit the scaler on the training features only and standardize them; the same
# fitted scaler object is kept so it can be applied to other rows later.
standard_scaler = StandardScaler()
x_train = standard_scaler.fit_transform(train_features)

# Labels as a standalone NumPy array.
y_train = train_df["label"].to_numpy(copy=True)

In [17]:
# Gradient-boosted tree classifier with the library's default hyperparameters,
# fitted on the standardized training features. `XGBClassifier` is imported in
# the notebook's top import cell so all dependencies are visible in one place.
model_xgb = XGBClassifier()
model_xgb.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

### Score model based on training data

In [18]:
# Predict on the same rows the model was fitted on (in-sample predictions).
y_pred = model_xgb.predict(x_train)

In [19]:
# Per-class precision / recall / F1 of the predictions against the training
# labels. NOTE(review): these are in-sample metrics (the model was fitted on
# this same data), so they are likely optimistic as an estimate of
# performance on unseen rows.
cr = classification_report(y_true=y_train, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      3877
         1.0       0.97      0.86      0.91      1127

    accuracy                           0.96      5004
   macro avg       0.97      0.93      0.95      5004
weighted avg       0.96      0.96      0.96      5004



### Apply model on test data

In [20]:
# Standardize the unlabeled rows with the scaler that was fitted on the
# training features (transform only, no refit). Features are every column
# except the last one.
test_features = test_df.iloc[:, :-1]
x_test = standard_scaler.transform(test_features)

In [21]:
# Predict labels for the unlabeled rows. This rebinds `y_pred`, which earlier
# in the notebook held the training-set predictions.
y_pred = model_xgb.predict(x_test)

In [22]:
# Work on an independent deep copy so `test_df` itself is left untouched, and
# replace its (missing) label column with the model's predictions.
new_test_df = test_df.copy(deep=True).assign(label=y_pred)

In [23]:
# Write the predicted rows (index included) to the submission file.
new_test_df.to_csv(path_or_buf="Stroke_submission.csv")