In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
datafile_train="train.csv"
datafile_test="test_share.csv"
bd_train=pd.read_csv(datafile_train)
bd_test=pd.read_csv(datafile_test)

In [3]:
# We have been given additional data files which contain information about patient's 
# medical and demographic details . we'll merge those details with train and test 
# files using a common unique identifier 



In [4]:
medical_hist='medical_history.csv'
demo_det='demographic_details.csv'

In [5]:
med=pd.read_csv(medical_hist)

In [6]:
dem=pd.read_csv(demo_det)

In [7]:
# both the datasets have patient id as unique identifier 

In [8]:
med.nunique()

PatientId       62299
Hipertension        2
Diabetes            2
Alcoholism          2
Handcap             5
dtype: int64

In [9]:
dem.nunique()

PatientId        62299
Gender               2
Age                104
Neighbourhood       81
Scholarship          2
dtype: int64

In [10]:
# note that this patient info is for both train and test

In [11]:
comb=pd.merge(med,dem,how='outer',on='PatientId')

In [12]:
comb.head()

Unnamed: 0,PatientId,Hipertension,Diabetes,Alcoholism,Handcap,Gender,Age,Neighbourhood,Scholarship
0,29872500000000.0,1,0,0,0,F,62,JARDIM DA PENHA,0
1,558997800000000.0,0,0,0,0,M,56,JARDIM DA PENHA,0
2,4262962000000.0,0,0,0,0,F,62,MATA DA PRAIA,0
3,867951200000.0,0,0,0,0,F,8,PONTAL DE CAMBURI,0
4,8841186000000.0,1,1,0,0,F,56,JARDIM DA PENHA,0


In [13]:
bd_train.head()

Unnamed: 0,PatientId,AppointmentID,ScheduledDay,AppointmentDay,SMS_received,No-show
0,29872500000000.0,5642903,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,0,No
1,558997800000000.0,5642503,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,0,No
2,867951200000.0,5642828,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,0,No
3,8841186000000.0,5642494,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,0,No
4,95985130000000.0,5626772,2016-04-27T08:36:51Z,2016-04-29T00:00:00Z,0,No


In [26]:
train=pd.merge(bd_train,comb,how='left',on='PatientId')
test=pd.merge(bd_test,comb,how='left',on='PatientId')


<class 'pandas.core.frame.DataFrame'>


In [15]:
train.head()

Unnamed: 0,PatientId,AppointmentID,ScheduledDay,AppointmentDay,SMS_received,No-show,Hipertension,Diabetes,Alcoholism,Handcap,Gender,Age,Neighbourhood,Scholarship
0,29872500000000.0,5642903,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,0,No,1,0,0,0,F,62,JARDIM DA PENHA,0
1,558997800000000.0,5642503,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,0,No,0,0,0,0,M,56,JARDIM DA PENHA,0
2,867951200000.0,5642828,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,0,No,0,0,0,0,F,8,PONTAL DE CAMBURI,0
3,8841186000000.0,5642494,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,0,No,1,1,0,0,F,56,JARDIM DA PENHA,0
4,95985130000000.0,5626772,2016-04-27T08:36:51Z,2016-04-29T00:00:00Z,0,No,1,0,0,0,F,76,REPÚBLICA,0


In [16]:
# we are going to drop columns 'PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay'

# you should try making features out of date columns to improve performance of your model
# try the difference in dates , cyclic features from date components 

In [17]:
drop_cols=['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay']

In [25]:
print(type(test))

<class 'NoneType'>


In [27]:
train.drop(drop_cols,axis=1,inplace=True)
test.drop(drop_cols,axis=1,inplace=True)

In [29]:
train['data']='train'
test['data']='test'

all_data=pd.concat([train,test],axis=0,sort=False)

In [30]:
all_data['gender_f']=(all_data['Gender']=='F').astype(int)
del all_data['Gender']

In [31]:
all_data.shape

(110344, 11)

In [32]:
k=all_data['Neighbourhood'].value_counts()
cats=k[k>2000].index

# you can try slightly loweer cutoff to include dummy vars for more neighbourhoods 
# check if that improves your model 

In [33]:
for cat in cats:
    name='Neighbourhood_'+cat
    all_data[name]=(all_data['Neighbourhood']==cat).astype(int)
del all_data['Neighbourhood']

In [34]:
all_data.shape

(110344, 31)

In [35]:
all_data.head()

Unnamed: 0,SMS_received,No-show,Hipertension,Diabetes,Alcoholism,Handcap,Age,Scholarship,data,gender_f,...,Neighbourhood_SANTO ANDRÉ,Neighbourhood_CARATOÍRA,Neighbourhood_JABOUR,Neighbourhood_SÃO PEDRO,Neighbourhood_ILHA DO PRÍNCIPE,Neighbourhood_NOVA PALESTINA,Neighbourhood_DA PENHA,Neighbourhood_ANDORINHAS,Neighbourhood_ROMÃO,Neighbourhood_GURIGICA
0,0,No,1,0,0,0,62,0,train,1,...,0,0,0,0,0,0,0,0,0,0
1,0,No,0,0,0,0,56,0,train,0,...,0,0,0,0,0,0,0,0,0,0
2,0,No,0,0,0,0,8,0,train,1,...,0,0,0,0,0,0,0,0,0,0
3,0,No,1,1,0,0,56,0,train,1,...,0,0,0,0,0,0,0,0,0,0
4,0,No,1,0,0,0,76,0,train,1,...,0,0,0,0,0,0,0,0,0,0


In [36]:
target='No-show'

In [38]:
x_train=all_data.drop([target,'data'],axis=1)[all_data['data']=='train']
y_train=all_data[target][all_data['data']=='train']
x_test=all_data.drop([target,'data'],axis=1)[all_data['data']=='test']

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

In [40]:
params={'penalty':['l1','l2'],'class_weight':['balanced',None],
        'C':np.linspace(0.01,100,100)}

In [41]:
model=LogisticRegression()


In [42]:
rs=RandomizedSearchCV(model,param_distributions=params,n_iter=10,
                      scoring='roc_auc',cv=10,n_jobs=-1,verbose=20)

In [43]:
rs.fit(x_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [35]:
submissions=pd.DataFrame({'Junk':rs.predict_proba(x_test)[:,1]})
submissions.to_csv('submission.csv',index=False)