In [1]:

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score,roc_auc_score,roc_curve,precision_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler,power_transform,OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier


In [2]:
train=pd.read_csv("Train (3).csv")
test=pd.read_csv("Test (3).csv")
patient=pd.read_csv("Patient_Details.csv")
first_camp=pd.read_csv("First_Health_Camp.csv")
second_camp=pd.read_csv("Second_Health_Camp.csv")
third_camp=pd.read_csv("Third_Health_Camp.csv")
camp=pd.read_csv("Health_Camp_Detail.csv")




In [3]:
df=pd.read_csv("merged.csv",index_col=0)

In [4]:
np.where(df["Category1"].str.contains("First"),0,1)

array([0, 1, 0, ..., 1, 1, 0])

In [5]:
df.isnull().sum()

Health_Camp_ID                   0
Registration_Date              231
Var1                             0
Var2                             0
Var3                             0
Var4                             0
Var5                             0
outcome                          0
Online_Follower                  0
LinkedIn_Shared                  0
Twitter_Shared                   0
Facebook_Shared                  0
Income                           0
Education_Score                  0
Age                              0
First_Interaction                0
City_Type                    23236
Employer_Category            42095
Camp_Start_Date                  0
Camp_End_Date                    0
Category1                        0
Category2                        0
Category3                        0
Donation                     48337
Health_Score                 48337
Health Score                 47214
Number_of_stall_visited      48179
Last_Stall_Visited_Number    48179
dtype: int64

import pandas_profiling
pfr = pandas_profiling.ProfileReport(df)
pfr

In [6]:
df.drop(['Number_of_stall_visited','Health_Score','Var3','Var1','Var5','Camp_End_Date','Camp_Start_Date','Registration_Date','First_Interaction','Category3','Var4','Online_Follower','LinkedIn_Shared','Twitter_Shared','Var2','Facebook_Shared'],axis=1,inplace=True)

In [7]:
X=df.drop("outcome",axis=1)

In [8]:
y=df["outcome"]

In [9]:
X.replace("None", np.nan,inplace=True)

In [10]:
X=pd.get_dummies(X.drop(["Category2",'Health_Camp_ID','Employer_Category','City_Type',"Age",'Education_Score','Income','Category1'],axis=1),drop_first=True)

In [11]:
X.shape

(52694, 3)

In [12]:
X.isnull().sum()

Donation                     48337
Health Score                 47214
Last_Stall_Visited_Number    48179
dtype: int64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [14]:
X_train.shape

(42155, 3)

In [15]:
X_train.isnull().sum()

Donation                     38639
Health Score                 37733
Last_Stall_Visited_Number    38564
dtype: int64

In [16]:
X_train.shape

(42155, 3)

In [17]:
X_train.tail(20)

Unnamed: 0_level_0,Donation,Health Score,Last_Stall_Visited_Number
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
496681,,,
489692,,0.90499,
513343,,,
497258,,,
504601,,,
512538,,,
526697,,,
487119,,,
509415,,,
526157,,,


In [18]:
knn=KNNImputer().fit(X_train)

x_train=pd.DataFrame(knn.transform(X_train),columns=X_train.columns)
x_test=pd.DataFrame(knn.transform(X_test),columns=X_test.columns)

In [19]:
x_train

Unnamed: 0,Donation,Health Score,Last_Stall_Visited_Number
0,32.394767,0.557359,2.379003
1,32.394767,0.557359,2.379003
2,32.394767,0.373206,2.379003
3,32.394767,0.557359,2.379003
4,30.000000,0.557359,2.379003
...,...,...,...
42150,32.394767,0.402054,2.379003
42151,32.394767,0.557359,1.000000
42152,40.000000,0.557359,2.379003
42153,32.394767,0.557359,2.379003


In [20]:
clfs={"Logreg":LogisticRegression(),
      "KNN":KNeighborsClassifier(),
      "Naive Bayes":GaussianNB(),
      "NN":MLPClassifier(),
      "DecisionTreeClassifier":DecisionTreeClassifier(),
      "RandomForestClassifier":RandomForestClassifier(),
      "Ada Boos":AdaBoostClassifier(),
      "Gradient Boos":GradientBoostingClassifier(),
      "XgBoost":XGBClassifier()}
models_report=pd.DataFrame(columns=["model name","accuracy","recall","precision","roc auc","f1 score"])
for clf,clf_name in list(zip(clfs.values(),clfs.keys())):
    clf.fit(x_train,y_train)
    y_pred=clf.predict(x_test)
    print("Fitting the model......",clf_name)
    t={"model name":clf_name
       ,"accuracy":accuracy_score(y_test,y_pred)
       ,"recall":recall_score(y_test,y_pred),
       "precision":precision_score(y_test,y_pred),
       "roc auc":roc_auc_score(y_test,y_pred),
       "f1 score":f1_score(y_test,y_pred)}
    models_report=models_report.append(t,ignore_index=True)
    
models_report=models_report.sort_values(by="f1 score",ascending=False)
models_report

Fitting the model...... Logreg


  _warn_prf(average, modifier, msg_start, len(result))
  models_report=models_report.append(t,ignore_index=True)


Fitting the model...... KNN
Fitting the model...... Naive Bayes


  models_report=models_report.append(t,ignore_index=True)
  models_report=models_report.append(t,ignore_index=True)


Fitting the model...... NN
Fitting the model...... DecisionTreeClassifier


  models_report=models_report.append(t,ignore_index=True)
  models_report=models_report.append(t,ignore_index=True)


Fitting the model...... RandomForestClassifier


  models_report=models_report.append(t,ignore_index=True)


Fitting the model...... Ada Boos


  models_report=models_report.append(t,ignore_index=True)


Fitting the model...... Gradient Boos


  models_report=models_report.append(t,ignore_index=True)


Fitting the model...... XgBoost


  models_report=models_report.append(t,ignore_index=True)


Unnamed: 0,model name,accuracy,recall,precision,roc auc,f1 score
1,KNN,1.0,1.0,1.0,1.0,1.0
4,DecisionTreeClassifier,1.0,1.0,1.0,1.0,1.0
5,RandomForestClassifier,1.0,1.0,1.0,1.0,1.0
6,Ada Boos,1.0,1.0,1.0,1.0,1.0
7,Gradient Boos,1.0,1.0,1.0,1.0,1.0
8,XgBoost,1.0,1.0,1.0,1.0,1.0
2,Naive Bayes,0.999051,0.997164,0.99929,0.998452,0.998226
3,NN,0.993358,0.975186,1.0,0.987593,0.987437
0,Logreg,0.732328,0.0,0.0,0.5,0.0


In [21]:
rf=RandomForestClassifier()


In [22]:
rf.fit(x_train,y_train)

In [23]:
pd.DataFrame(rf.feature_importances_,index=x_train.columns).sort_values(by=0,ascending=False)

Unnamed: 0,0
Health Score,0.393722
Last_Stall_Visited_Number,0.315058
Donation,0.291219


In [24]:
df3=pd.merge(test,patient,how="left",on=["Patient_ID"])
df3.shape

(22584, 18)

In [25]:
df3=pd.merge(df3,camp,how="left",on='Health_Camp_ID')
df3.shape

(22584, 23)

In [26]:
df3=pd.merge(df3,first_camp,how="left",on=["Patient_ID",'Health_Camp_ID']).drop(["Unnamed: 4"],axis=1)
df3.shape

(22584, 25)

In [27]:
df3=pd.merge(df3,second_camp,how="left",on=["Patient_ID",'Health_Camp_ID'])
df3.shape

(22584, 26)

In [28]:




df3=pd.merge(df3,third_camp,on=["Patient_ID",'Health_Camp_ID'],how="left")
df3.shape

(22584, 28)

In [29]:
df3.drop(['Number_of_stall_visited','Health_Score','Var3','Var1','Var5','Camp_End_Date','Camp_Start_Date','Registration_Date','First_Interaction','Category3','Var4','Online_Follower','LinkedIn_Shared','Twitter_Shared','Var2','Facebook_Shared'],axis=1,inplace=True)

In [30]:
X=df3

In [31]:
X.replace("None", np.nan,inplace=True)

In [32]:
X=pd.get_dummies(X.drop(["Category2",'Patient_ID','Health_Camp_ID','Employer_Category','City_Type',"Age",'Education_Score','Income','Category1'],axis=1),drop_first=True)

In [33]:
x_test=pd.DataFrame(knn.transform(X),columns=x_train.columns)

x_test=pd.DataFrame(sc.transform(x_test),columns=x_test.columns)


In [34]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)


In [35]:
sub=pd.concat([test["Patient_ID"],pd.Series(y_pred)],axis=1,ignore_index=True)
sub.columns=["Patient_ID","outcome"]
sub.to_csv("sub.csv",index=False)
pd.read_csv("sub.csv")

Unnamed: 0,Patient_ID,outcome
0,507518,1
1,516180,0
2,507204,0
3,505815,1
4,501809,1
...,...,...
22579,521837,0
22580,490592,0
22581,499464,0
22582,525833,0
