In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
import warnings
warnings.filterwarnings('ignore')


In [2]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [3]:
train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397.0,7.0,Emergency,Extreme,2.0,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397.0,7.0,Trauma,Extreme,2.0,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397.0,7.0,Trauma,Extreme,2.0,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397.0,7.0,Trauma,Extreme,2.0,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397.0,7.0,Trauma,Extreme,2.0,51-60,5558.0,41-50


In [4]:
train.info()
train.Stay.unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50057 entries, 0 to 50056
Data columns (total 18 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   case_id                            50057 non-null  int64  
 1   Hospital_code                      50057 non-null  int64  
 2   Hospital_type_code                 50057 non-null  object 
 3   City_Code_Hospital                 50057 non-null  int64  
 4   Hospital_region_code               50057 non-null  object 
 5   Available Extra Rooms in Hospital  50057 non-null  int64  
 6   Department                         50057 non-null  object 
 7   Ward_Type                          50057 non-null  object 
 8   Ward_Facility_Code                 50057 non-null  object 
 9   Bed Grade                          50047 non-null  float64
 10  patientid                          50056 non-null  float64
 11  City_Code_Patient                  49601 non-null  flo

array(['0-10', '41-50', '31-40', '11-20', '51-60', '21-30', '71-80',
       'More than 100 Days', '81-90', '61-70', '91-100', nan],
      dtype=object)

In [5]:
train.isnull().sum().sort_values(ascending = False)

City_Code_Patient                    456
Bed Grade                             10
patientid                              1
Admission_Deposit                      1
Age                                    1
Visitors with Patient                  1
Severity of Illness                    1
Type of Admission                      1
Stay                                   1
Hospital_code                          0
Ward_Facility_Code                     0
Ward_Type                              0
Department                             0
Available Extra Rooms in Hospital      0
Hospital_region_code                   0
City_Code_Hospital                     0
Hospital_type_code                     0
case_id                                0
dtype: int64

In [6]:
test.isnull().sum().sort_values(ascending = False)

City_Code_Patient                    1590
Bed Grade                              35
Admission_Deposit                       1
Age                                     1
Visitors with Patient                   1
Severity of Illness                     1
Type of Admission                       0
patientid                               0
case_id                                 0
Hospital_code                           0
Ward_Type                               0
Department                              0
Available Extra Rooms in Hospital       0
Hospital_region_code                    0
City_Code_Hospital                      0
Hospital_type_code                      0
Ward_Facility_Code                      0
dtype: int64

In [7]:
train.shape

(50057, 18)

In [8]:
test.shape


(106511, 17)

In [9]:
for i in train.columns:
    print(i, ':', train[i].nunique())

case_id : 50057
Hospital_code : 32
Hospital_type_code : 7
City_Code_Hospital : 11
Hospital_region_code : 3
Available Extra Rooms in Hospital : 13
Department : 5
Ward_Type : 6
Ward_Facility_Code : 6
Bed Grade : 4
patientid : 10029
City_Code_Patient : 35
Type of Admission : 3
Severity of Illness : 3
Visitors with Patient : 25
Age : 10
Admission_Deposit : 5617
Stay : 11


In [10]:
for i in test.columns:
    print(i, ':', test[i].nunique())

case_id : 106511
Hospital_code : 32
Hospital_type_code : 7
City_Code_Hospital : 11
Hospital_region_code : 3
Available Extra Rooms in Hospital : 14
Department : 5
Ward_Type : 6
Ward_Facility_Code : 6
Bed Grade : 4
patientid : 25850
City_Code_Patient : 37
Type of Admission : 4
Severity of Illness : 3
Visitors with Patient : 27
Age : 10
Admission_Deposit : 6349


In [11]:
train['Bed Grade'].fillna(train['Bed Grade'].mode()[0], inplace = True)
test['Bed Grade'].fillna(test['Bed Grade'].mode()[0], inplace = True)

In [12]:
train['City_Code_Patient'].fillna(train['City_Code_Patient'].mode()[0], inplace = True)
test['City_Code_Patient'].fillna(test['City_Code_Patient'].mode()[0], inplace = True)

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['Stay'] = le.fit_transform(train['Stay'].astype('str'))

In [14]:
test['Stay'] = -1
df = pd.concat([train, test])
df.shape

(156568, 18)

In [15]:
for i in ['Hospital_type_code', 'Hospital_region_code', 'Department',
          'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age']:
    le = LabelEncoder()
    df[i] = le.fit_transform(df[i].astype(str))

In [16]:
train = df[df['Stay']!=-1]
test = df[df['Stay']==-1]


In [17]:
train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,2,3,2,3,3,2,5,2.0,31397.0,7.0,0,0,2.0,5,4911.0,0
1,2,2,2,5,2,2,3,3,5,2.0,31397.0,7.0,2,0,2.0,5,5954.0,4
2,3,10,4,1,0,2,1,3,4,2.0,31397.0,7.0,2,0,2.0,5,4745.0,3
3,4,26,1,2,1,2,3,2,3,2.0,31397.0,7.0,2,0,2.0,5,7272.0,4
4,5,26,1,2,1,2,3,3,3,2.0,31397.0,7.0,2,0,2.0,5,5558.0,4


In [18]:
test.head()


Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,318439,21,2,3,2,3,2,3,0,2.0,17006.0,2.0,0,2,2.0,7,3095.0,-1
1,318440,29,0,4,0,2,2,3,5,2.0,17006.0,2.0,2,2,4.0,7,4018.0,-1
2,318441,26,1,2,1,3,2,1,3,4.0,17006.0,2.0,0,2,3.0,7,4492.0,-1
3,318442,6,0,6,0,3,2,1,5,2.0,17006.0,2.0,2,2,3.0,7,4173.0,-1
4,318443,28,1,11,0,2,2,2,5,2.0,17006.0,2.0,2,2,4.0,7,4161.0,-1


In [19]:
train.shape


(50057, 18)

In [20]:
test.shape


(106511, 18)

In [21]:
train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 50057 entries, 0 to 50056
Data columns (total 18 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   case_id                            50057 non-null  int64  
 1   Hospital_code                      50057 non-null  int64  
 2   Hospital_type_code                 50057 non-null  int64  
 3   City_Code_Hospital                 50057 non-null  int64  
 4   Hospital_region_code               50057 non-null  int64  
 5   Available Extra Rooms in Hospital  50057 non-null  int64  
 6   Department                         50057 non-null  int64  
 7   Ward_Type                          50057 non-null  int64  
 8   Ward_Facility_Code                 50057 non-null  int64  
 9   Bed Grade                          50057 non-null  float64
 10  patientid                          50056 non-null  float64
 11  City_Code_Patient                  50057 non-null  flo

In [22]:
test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 106511 entries, 0 to 106510
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            106511 non-null  int64  
 1   Hospital_code                      106511 non-null  int64  
 2   Hospital_type_code                 106511 non-null  int64  
 3   City_Code_Hospital                 106511 non-null  int64  
 4   Hospital_region_code               106511 non-null  int64  
 5   Available Extra Rooms in Hospital  106511 non-null  int64  
 6   Department                         106511 non-null  int64  
 7   Ward_Type                          106511 non-null  int64  
 8   Ward_Facility_Code                 106511 non-null  int64  
 9   Bed Grade                          106511 non-null  float64
 10  patientid                          106511 non-null  float64
 11  City_Code_Patient                  1065

In [23]:
def get_countid_enocde(train, test, cols, name):
  temp = train.groupby(cols)['case_id'].count().reset_index().rename(columns = {'case_id': name})
  temp2 = test.groupby(cols)['case_id'].count().reset_index().rename(columns = {'case_id': name})
  train = pd.merge(train, temp, how='left', on= cols)
  test = pd.merge(test,temp2, how='left', on= cols)
  train[name] = train[name].astype('float')
  test[name] = test[name].astype('float')
  train[name].fillna(np.median(temp[name]), inplace = True)
  test[name].fillna(np.median(temp2[name]), inplace = True)
  return train, test

In [25]:
train, test = get_countid_enocde(train, test, ['patientid'], name = 'count_id_patient')
train, test = get_countid_enocde(train, test, 
                                 ['patientid', 'Hospital_region_code'], name = 'count_id_patient_hospitalCode')
train, test = get_countid_enocde(train, test, 
                                 ['patientid', 'Ward_Facility_Code'], name = 'count_id_patient_wardfacilityCode')

In [26]:
test1 = test.drop(['Stay', 'patientid', 'Hospital_region_code', 'Ward_Facility_Code'], axis =1)
train1 = train.drop(['case_id', 'patientid', 'Hospital_region_code', 'Ward_Facility_Code'], axis =1)

In [27]:
X1 = train1.drop('Stay', axis =1)
y1 = train1['Stay']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size =0.20, random_state =100)