In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#importing ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#ignoring the warnings
import warnings
warnings.filterwarnings('ignore')

# Reading and Exploring the data

In [3]:
pharma_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_begs.csv')
np.random.seed(7)
pharma_data.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,DX6,56,18.479385,YES,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,DX2,36,22.945566,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,DX6,48,27.510027,YES,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,DX1,5,19.130976,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,33012,0,12513,,128,1.3484,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1


## Data Description:

- ID_Patient_Care_Situation: Care situation of a patient during treatment
- Diagnosed_Condition: The diagnosed condition of the patient
- ID_Patient: Patient identifier number
- Treatment_with_drugs: Class of drugs used during treatment
- Survived_1_year: If the patient survived after one year (0 means did not survive; 1 means survived)
- Patient_Age: Age of the patient
- Patient_Body_Mass_Index: A calculated value based on the patient’s weight, height, etc.
- Patient_Smoker: If the patient was a smoker or not
- Patient_Rural_Urban: If the patient stayed in Rural or Urban part of the country
- Previous_Condition: Condition of the patient before the start of the treatment 
> ( This variable is splitted into 8 columns - A, B, C, D, E, F, Z and Number_of_prev_cond. A, B, C, D, E, F and Z are the previous conditions of the patient. Suppose for one patient, if the entry in column A is 1, it means that the previous condition of the patient was A. If the patient didn't have that condition, it is 0 and same for other conditions. If a patient has previous condition as A and C , columns A and C will have entries as 1 and 1 respectively while the other column B, D, E, F, Z will have entries 0, 0, 0, 0, 0 respectively. The column Number_of_prev_cond will have entry as 2 i.e. 1 + 0 + 1 + 0 + 0 + 0 + 0 + 0 = 2 in this case. )


In [3]:
pharma_data.describe()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
count,23097.0,23097.0,23097.0,23097.0,23097.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,21862.0,23097.0
mean,16545.712041,26.413127,6261.280772,33.209768,23.45482,0.897905,0.136355,0.18507,0.083615,0.393239,0.0537,0.000595,1.75048,0.632247
std,9532.263503,15.030865,3595.99062,19.549882,3.807661,0.30278,0.343173,0.388363,0.276817,0.48848,0.225431,0.024379,0.770311,0.482204
min,2.0,0.0,1.0,0.0,1.0893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,8280.0,13.0,3181.0,16.0,20.20555,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,16597.0,26.0,6242.0,33.0,23.386199,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
75%,24825.0,39.0,9363.0,50.0,26.788154,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0
max,33014.0,52.0,12515.0,149.0,29.999579,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0


In [4]:
pharma_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23097 entries, 0 to 23096
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  23097 non-null  int64  
 1   Diagnosed_Condition        23097 non-null  int64  
 2   Patient_ID                 23097 non-null  int64  
 3   Treated_with_drugs         23084 non-null  object 
 4   Patient_Age                23097 non-null  int64  
 5   Patient_Body_Mass_Index    23097 non-null  float64
 6   Patient_Smoker             23097 non-null  object 
 7   Patient_Rural_Urban        23097 non-null  object 
 8   Patient_mental_condition   23097 non-null  object 
 9   A                          21862 non-null  float64
 10  B                          21862 non-null  float64
 11  C                          21862 non-null  float64
 12  D                          21862 non-null  float64
 13  E                          21862 non-null  flo

In [5]:
pharma_data.isnull().sum()

ID_Patient_Care_Situation       0
Diagnosed_Condition             0
Patient_ID                      0
Treated_with_drugs             13
Patient_Age                     0
Patient_Body_Mass_Index         0
Patient_Smoker                  0
Patient_Rural_Urban             0
Patient_mental_condition        0
A                            1235
B                            1235
C                            1235
D                            1235
E                            1235
F                            1235
Z                            1235
Number_of_prev_cond          1235
Survived_1_year                 0
dtype: int64

In [15]:
print("Percentage of Null Rows = ", "{:.2f}".format((1248/23097)*100), "%")

Percentage of Null Rows =  5.40 %


Since the above percentage is below 20%, it seems safe to drop the null rows

In [11]:
pharma_data[pharma_data['Treated_with_drugs'].isnull()]

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
4,33012,0,12513,,128,1.3484,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
218,33008,0,12509,,112,1.2925,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2540,33014,0,12515,,149,1.7784,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2711,33004,0,12505,,123,1.929,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
4308,33001,0,12502,,114,1.6239,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
4887,33011,0,12512,,132,1.9896,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
6308,33010,0,12511,,142,1.5701,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
10388,33006,0,12507,,138,1.3745,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
10663,33009,0,12510,,142,1.145,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
12782,33000,0,12501,,112,1.5465,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1


In [12]:
pharma_data[pharma_data['Patient_Smoker'] == 'Cannot say']

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
4,33012,0,12513,,128,1.3484,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
218,33008,0,12509,,112,1.2925,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2540,33014,0,12515,,149,1.7784,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2711,33004,0,12505,,123,1.929,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
4308,33001,0,12502,,114,1.6239,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
4887,33011,0,12512,,132,1.9896,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
6308,33010,0,12511,,142,1.5701,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
10388,33006,0,12507,,138,1.3745,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
10663,33009,0,12510,,142,1.145,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
12782,33000,0,12501,,112,1.5465,Cannot say,RURAL,Stable,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1


In [20]:
print(pharma_data.Patient_Smoker.value_counts())
print(pharma_data.Patient_Rural_Urban.value_counts())
print(pharma_data.Patient_mental_condition.value_counts())
print("\n")

print(data.Patient_Smoker.value_counts())
print(data.Patient_Rural_Urban.value_counts())
print(data.Patient_mental_condition.value_counts())

NO            13246
YES            9838
Cannot say       13
Name: Patient_Smoker, dtype: int64
RURAL    16134
URBAN     6963
Name: Patient_Rural_Urban, dtype: int64
Stable    23097
Name: Patient_mental_condition, dtype: int64


NO     12519
YES     9330
Name: Patient_Smoker, dtype: int64
RURAL    15279
URBAN     6570
Name: Patient_Rural_Urban, dtype: int64
Stable    21849
Name: Patient_mental_condition, dtype: int64


It looks like the all the pantients have the same value "Stable" for the variable `Patient_mental_condition`, hence that column won't be of much help.

In [11]:
data = data.drop(['Patient_mental_condition'], axis = 1)
data_copy = data.copy()

In [12]:
le = LabelEncoder()
data.Patient_Smoker = le.fit_transform(data.Patient_Smoker)
data.Patient_Rural_Urban = le.fit_transform(data.Patient_Rural_Urban)
data.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,22374,8,3333,DX6,56,18.48,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
1,18164,5,5740,DX2,36,22.95,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,6283,23,10446,DX6,48,27.51,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,5339,51,12011,DX1,5,19.13,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
5,10808,45,7977,DX6,47,26.16,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [42]:
# data.Treated_with_drugs.value_counts()

# data1 = pd.get_dummies(data, columns=['Treated_with_drugs'])
# data1.head()

The above method creates way too many dummy variables for the `data2.head()` column, as there are multiple values for the variable (as seen in the `value_counts`).
So we should create the dummies by combining the values to genetare multiple '1's, by treating the column as a Pandas Series object and performing str operations.

We also don't need the `ID_Patient_Care_Situation` and `Patient_ID` columns, as they offer no valuable information.

In [13]:
data1 = data.copy()
df = data1['Treated_with_drugs'].str.get_dummies(sep=' ')

In [14]:
data2 = pd.concat([data1, df], axis=1)
data2 = data2.drop(['Treated_with_drugs', 'ID_Patient_Care_Situation', 'Patient_ID'], axis=1)
data2.head()

Unnamed: 0,Diagnosed_Condition,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year,DX1,DX2,DX3,DX4,DX5,DX6
0,8,56,18.48,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,0,0,0,0,1
1,5,36,22.95,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0
2,23,48,27.51,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1
3,51,5,19.13,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0,0,0,0
5,45,47,26.16,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1
