In [1]:
import pandas as pd
import sweetviz as sv
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from pycaret.classification import *

In [2]:
df_train = pd.read_csv(r'../Data/Train/train.csv')
df_test = pd.read_csv(r'../Data/test.csv')

In [3]:
df_first = pd.read_csv(r'../Data/Train/First_Health_Camp_Attended.csv') #importing data on first medical camp
df_second = pd.read_csv(r'../Data/Train/Second_Health_Camp_Attended.csv') #importing data on second medical camp
df_third = pd.read_csv(r'../Data/Train/Third_Health_Camp_Attended.csv') #importing data on third medical camp

In [4]:
#We rename health score column to be representative of the 1st health camp
df_first = df_first.rename(columns={"Health_Score":"Health_Score_firstCamp"})
df_first.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Donation,Health_Score_firstCamp,Unnamed: 4
0,506181,6560,40,0.439024,
1,494977,6560,20,0.097561,
2,518680,6560,10,0.04878,
3,509916,6560,30,0.634146,
4,488006,6560,20,0.02439,


In [5]:
#We rename health score column to be representative of the 2nd health camp
df_second = df_second.rename(columns={"Health Score":"Health_Score_secondCamp"})
df_second.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Health_Score_secondCamp
0,526631,6536,0.875136
1,509122,6536,0.7557
2,498864,6536,0.673181
3,515398,6536,0.722041
4,504624,6536,0.464712


In [6]:
df_third.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Number_of_stall_visited,Last_Stall_Visited_Number
0,517875,6527,3,1
1,504692,6578,1,1
2,504692,6527,3,1
3,493167,6527,4,4
4,510954,6528,2,2


We will import features from 3 csv file to both train and test sets."Patient_ID" will be used as primary key to map the values.

In [7]:
df_train.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,489652,6578,10-Sep-05,4,0,0,0,2
1,507246,6578,18-Aug-05,45,5,0,0,7
2,523729,6534,29-Apr-06,0,0,0,0,0
3,524931,6535,07-Feb-04,0,0,0,0,0
4,521364,6529,28-Feb-06,15,1,0,0,7


In [8]:
#Mapping the values from medical camp cvs to train data set
df_train_main = pd.merge(df_train,df_first,how='left',on=['Patient_ID','Health_Camp_ID'])
df_train_main = pd.merge(df_train_main,df_second,how='left',on=['Patient_ID','Health_Camp_ID'])
df_train_main = pd.merge(df_train_main,df_third,how='left',on=['Patient_ID','Health_Camp_ID'])

In [9]:
df_train_main.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Donation,Health_Score_firstCamp,Unnamed: 4,Health_Score_secondCamp,Number_of_stall_visited,Last_Stall_Visited_Number
0,489652,6578,10-Sep-05,4,0,0,0,2,,,,,2.0,1.0
1,507246,6578,18-Aug-05,45,5,0,0,7,,,,,,
2,523729,6534,29-Apr-06,0,0,0,0,0,,,,0.402054,,
3,524931,6535,07-Feb-04,0,0,0,0,0,,,,,,
4,521364,6529,28-Feb-06,15,1,0,0,7,,,,0.845597,,


In [10]:
df_train_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75278 entries, 0 to 75277
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Patient_ID                 75278 non-null  int64  
 1   Health_Camp_ID             75278 non-null  int64  
 2   Registration_Date          74944 non-null  object 
 3   Var1                       75278 non-null  int64  
 4   Var2                       75278 non-null  int64  
 5   Var3                       75278 non-null  int64  
 6   Var4                       75278 non-null  int64  
 7   Var5                       75278 non-null  int64  
 8   Donation                   6218 non-null   float64
 9   Health_Score_firstCamp     6218 non-null   float64
 10  Unnamed: 4                 0 non-null      float64
 11  Health_Score_secondCamp    7819 non-null   float64
 12  Number_of_stall_visited    6515 non-null   float64
 13  Last_Stall_Visited_Number  6515 non-null   flo

In [11]:
#Dropping the rows with date values as NA and dropping Unnamed 4 column. The date column is changed from string to date type.
df_train_main.dropna(subset=['Registration_Date'],inplace=True)
df_train_main.drop(['Unnamed: 4'],axis=1,inplace=True)
df_train_main.Registration_Date = df_train_main.Registration_Date.astype('datetime64')
df_train_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74944 entries, 0 to 75277
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Patient_ID                 74944 non-null  int64         
 1   Health_Camp_ID             74944 non-null  int64         
 2   Registration_Date          74944 non-null  datetime64[ns]
 3   Var1                       74944 non-null  int64         
 4   Var2                       74944 non-null  int64         
 5   Var3                       74944 non-null  int64         
 6   Var4                       74944 non-null  int64         
 7   Var5                       74944 non-null  int64         
 8   Donation                   6173 non-null   float64       
 9   Health_Score_firstCamp     6173 non-null   float64       
 10  Health_Score_secondCamp    7669 non-null   float64       
 11  Number_of_stall_visited    6514 non-null   float64       
 12  Last

We fill all NA values in health score columns and number of stalls visited as zero. Where by zero indicates the person dint checkin
to take the test.

In [12]:
df_train_main.fillna(0,inplace=True)
df_train_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74944 entries, 0 to 75277
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Patient_ID                 74944 non-null  int64         
 1   Health_Camp_ID             74944 non-null  int64         
 2   Registration_Date          74944 non-null  datetime64[ns]
 3   Var1                       74944 non-null  int64         
 4   Var2                       74944 non-null  int64         
 5   Var3                       74944 non-null  int64         
 6   Var4                       74944 non-null  int64         
 7   Var5                       74944 non-null  int64         
 8   Donation                   74944 non-null  float64       
 9   Health_Score_firstCamp     74944 non-null  float64       
 10  Health_Score_secondCamp    74944 non-null  float64       
 11  Number_of_stall_visited    74944 non-null  float64       
 12  Last

A favorable outcome is defined as that the person has either attended one of the medical camps. In other words the person shall have a non zero value in any of the following columns.Health_Score_firstCamp, Health_Score_secondCamp, Number_of_stall_visited.We create a small function that assigns a value of 1 to outcome column if this condition is satsfied.

In [13]:
def outcome(row):
    if((row['Health_Score_firstCamp'] > 0) or (row['Health_Score_secondCamp'] > 0) or (row['Number_of_stall_visited'] > 0)):
        return 1
    else:
        return 0
df_train_main['outcome'] = df_train_main.apply(outcome,axis=1)
df_train_main.head(10)

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Donation,Health_Score_firstCamp,Health_Score_secondCamp,Number_of_stall_visited,Last_Stall_Visited_Number,outcome
0,489652,6578,2005-09-10,4,0,0,0,2,0.0,0.0,0.0,2.0,1.0,1
1,507246,6578,2005-08-18,45,5,0,0,7,0.0,0.0,0.0,0.0,0.0,0
2,523729,6534,2006-04-29,0,0,0,0,0,0.0,0.0,0.402054,0.0,0.0,1
3,524931,6535,2004-02-07,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0
4,521364,6529,2006-02-28,15,1,0,0,7,0.0,0.0,0.845597,0.0,0.0,1
5,494493,6570,2005-05-20,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0
6,523001,6562,2005-05-22,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0
7,500733,6535,2004-01-31,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0
8,501155,6538,2004-01-31,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0
9,501457,6538,2004-08-12,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0


We will extract feature that indicates if the given day is weekday or weekend from Registration_Date column.Weekend will be assigned 1 while weekday will be zero. The new column is name as is_weekend.

In [14]:
df_train_main['is_weekend'] = df_train_main['Registration_Date'].apply(lambda x:int(x.dayofweek//5==1))
df_train_main.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Donation,Health_Score_firstCamp,Health_Score_secondCamp,Number_of_stall_visited,Last_Stall_Visited_Number,outcome,is_weekend
0,489652,6578,2005-09-10,4,0,0,0,2,0.0,0.0,0.0,2.0,1.0,1,1
1,507246,6578,2005-08-18,45,5,0,0,7,0.0,0.0,0.0,0.0,0.0,0,0
2,523729,6534,2006-04-29,0,0,0,0,0,0.0,0.0,0.402054,0.0,0.0,1,1
3,524931,6535,2004-02-07,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,1
4,521364,6529,2006-02-28,15,1,0,0,7,0.0,0.0,0.845597,0.0,0.0,1,0


Splitting feature and target. We then use ExtraTreesClassifier to give picture of features sorted in descending order for feature importance

In [15]:
X_train = df_train_main.drop(columns=['outcome','Registration_Date','Patient_ID'])
y_train = df_train_main[['outcome']]
etc_model = ExtraTreesClassifier(oob_score=True,random_state=27,bootstrap=True,n_estimators=15)
etc_model.fit(X_train,y_train)
columns = list(X_train.columns)
pd_feature = pd.DataFrame(data=[columns,etc_model.feature_importances_])
pd_feature = pd_feature.T
pd_feature.columns=['features','scores']
pd_feature.sort_values(['scores'],ascending=False)

  after removing the cwd from sys.path.
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Unnamed: 0,features,scores
8,Health_Score_secondCamp,0.356263
7,Health_Score_firstCamp,0.225236
9,Number_of_stall_visited,0.188615
10,Last_Stall_Visited_Number,0.118913
6,Donation,0.0848545
0,Health_Camp_ID,0.0244002
1,Var1,0.000514327
5,Var5,0.000431798
2,Var2,0.000334838
4,Var4,0.000229295


In [17]:
eda = sv.analyze(df_train_main,'outcome')
eda.show_html('eda.html')

:FEATURES DONE:                    |█████████████████████| [100%]   00:05  -> (00:00 left)
:PAIRWISE DONE:                    |█████████████████████| [100%]   00:03  -> (00:00 left)


Creating Associations graph... DONE!
Report eda.html was generated! NOTEBOOK/COLAB USERS: no browser will pop up, the report is saved in your notebook/colab files.


In [22]:
df_temp = df_train_main.drop(columns=['Registration_Date','Patient_ID'])
model_compare = setup(data=df_temp,target='outcome')                                      
best = compare_models(sort='AUC')                

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0255
1,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.1155
2,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.093
3,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.3809
4,Extra Trees Classifier,0.9999,1.0,0.9999,0.9999,0.9999,0.9999,0.9999,0.5569
5,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0578
6,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.166
7,CatBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.2916
8,Linear Discriminant Analysis,0.9403,0.9999,0.78,1.0,0.8763,0.8378,0.8491,0.1266
9,K Neighbors Classifier,0.9923,0.9988,0.9806,0.9911,0.9858,0.9806,0.9806,3.2853


In [16]:
df_train_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74944 entries, 0 to 75277
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Patient_ID                 74944 non-null  int64         
 1   Health_Camp_ID             74944 non-null  int64         
 2   Registration_Date          74944 non-null  datetime64[ns]
 3   Var1                       74944 non-null  int64         
 4   Var2                       74944 non-null  int64         
 5   Var3                       74944 non-null  int64         
 6   Var4                       74944 non-null  int64         
 7   Var5                       74944 non-null  int64         
 8   Donation                   74944 non-null  float64       
 9   Health_Score_firstCamp     74944 non-null  float64       
 10  Health_Score_secondCamp    74944 non-null  float64       
 11  Number_of_stall_visited    74944 non-null  float64       
 12  Last

In [24]:
df_train_main.Health_Camp_ID.value_counts()

6543    6543
6527    4144
6538    3954
6537    3859
6529    3823
6526    3809
6534    3597
6570    3562
6580    3515
6578    2837
6586    2622
6542    2368
6562    2334
6554    2301
6523    2084
6571    2076
6532    1993
6539    1992
6536    1902
6535    1882
6549    1832
6528    1743
6555    1725
6541    1547
6581    1467
6540    1410
6585    1396
6564     514
6546     403
6530     259
6561     200
6569     177
6563     171
6524     148
6544     128
6531     120
6553      94
6575      90
6552      82
6587      79
6565      66
6557      52
6558      44
Name: Health_Camp_ID, dtype: int64

In [25]:
df_temp = df_train_main.drop(columns=['Registration_Date','Patient_ID','Health_Camp_ID'])
model_compare = setup(data=df_temp,target='outcome')                                      
best = compare_models(sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Logistic Regression,0.9938,1.0,0.9772,1.0,0.9885,0.9843,0.9844,0.3078
1,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0224
2,Random Forest Classifier,0.9999,1.0,0.9998,1.0,0.9999,0.9999,0.9999,0.1162
3,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9874
4,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0141
5,Linear Discriminant Analysis,0.9362,1.0,0.7649,1.0,0.8667,0.8258,0.8386,0.1126
6,Extra Trees Classifier,1.0,1.0,0.9999,1.0,0.9999,0.9999,0.9999,0.5457
7,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6637
8,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.1538
9,CatBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.4921
