## EDA Drug Persistency Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Notebook-wide defaults: seaborn dark grid for plots, and floats rendered
# with thousands separators and two decimals in pandas output.
sns.set_style(style='darkgrid')
pd.set_option('display.float_format', '{:,.2f}'.format)

#### Import Dataset

In [5]:
# Load the workbook (read_excel reads the first sheet by default) into `df`,
# the frame every later cell inspects.
DATASET_PATH = 'Healthcare_dataset_only.xlsx'
df = pd.read_excel(DATASET_PATH)

In [6]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Ptid,Persistency_Flag,Gender,Race,Ethnicity,Region,Age_Bucket,Ntm_Speciality,Ntm_Specialist_Flag,Ntm_Speciality_Bucket,...,Risk_Family_History_Of_Osteoporosis,Risk_Low_Calcium_Intake,Risk_Vitamin_D_Insufficiency,Risk_Poor_Health_Frailty,Risk_Excessive_Thinness,Risk_Hysterectomy_Oophorectomy,Risk_Estrogen_Deficiency,Risk_Immobilization,Risk_Recurring_Falls,Count_Of_Risks
0,P1,Persistent,Male,Caucasian,Not Hispanic,West,>75,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,N,N,N,N,N,N,N,N,0
1,P2,Non-Persistent,Male,Asian,Not Hispanic,West,55-65,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,N,N,N,N,N,N,N,N,0
2,P3,Non-Persistent,Female,Other/Unknown,Hispanic,Midwest,65-75,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,Y,N,N,N,N,N,N,N,2
3,P4,Non-Persistent,Female,Caucasian,Not Hispanic,Midwest,>75,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,N,N,N,N,N,N,N,N,1
4,P5,Non-Persistent,Female,Caucasian,Not Hispanic,Midwest,>75,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,N,N,N,N,N,N,N,N,N,1


#### Observing Data Types

In [37]:
# Raise the row-display cap so the per-column dtype listing prints in full.
# The cap is derived from the column count (never below the previous fixed 70)
# rather than a number that only happens to fit this particular file.
# Note: this changes a global pandas option and affects all later cells.
pd.set_option('display.max_rows', max(70, df.shape[1]))
dtypes = df.dtypes
print(dtypes)

Ptid                                                                  object
Persistency_Flag                                                      object
Gender                                                                object
Race                                                                  object
Ethnicity                                                             object
Region                                                                object
Age_Bucket                                                            object
Ntm_Speciality                                                        object
Ntm_Specialist_Flag                                                   object
Ntm_Speciality_Bucket                                                 object
Gluco_Record_Prior_Ntm                                                object
Gluco_Record_During_Rx                                                object
Dexa_Freq_During_Rx                                                    int64

In [9]:
# Frame dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
shape = (n_rows, n_cols)
print(shape)

(3424, 69)


#### Checking for NaN Values

In [14]:
# True if at least one cell anywhere in the frame is missing.
# Note: this only detects real NaN/None, not placeholder strings such as 'Unknown'.
is_null = df.isna().to_numpy().any()
print(is_null)

False


In [18]:
# Distinct labels of the Persistency_Flag column, in order of first appearance.
df['Persistency_Flag'].unique()

array(['Persistent', 'Non-Persistent'], dtype=object)

In [19]:
# Distinct Gender labels.
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [15]:
# Distinct Race labels.
df['Race'].unique()

array(['Caucasian', 'Asian', 'Other/Unknown', 'African American'],
      dtype=object)

In [17]:
# Row count per Race label, most frequent first.
df['Race'].value_counts()

Caucasian           3148
Other/Unknown         97
African American      95
Asian                 84
Name: Race, dtype: int64

In [25]:
# Percentage of non-null Race entries labelled 'Other/Unknown'.
# Computed from the data instead of a count copied by hand from an earlier output.
df['Race'].value_counts(normalize=True)['Other/Unknown'] * 100

2.832943925233645

In [20]:
# Distinct Ethnicity labels.
df['Ethnicity'].unique()

array(['Not Hispanic', 'Hispanic', 'Unknown'], dtype=object)

In [21]:
# Row count per Ethnicity label, most frequent first.
df['Ethnicity'].value_counts()

Not Hispanic    3235
Hispanic          98
Unknown           91
Name: Ethnicity, dtype: int64

In [23]:
# Percentage of non-null Ethnicity entries labelled 'Unknown'.
# Numerator and denominator both come from the data rather than hand-typed counts.
df['Ethnicity'].value_counts(normalize=True)['Unknown'] * 100

2.6577102803738315

In [26]:
# Distinct Region labels.
df['Region'].unique()

array(['West', 'Midwest', 'South', 'Other/Unknown', 'Northeast'],
      dtype=object)

In [27]:
# Row count per Region label, most frequent first.
df['Region'].value_counts()

Midwest          1383
South            1247
West              502
Northeast         232
Other/Unknown      60
Name: Region, dtype: int64

In [28]:
# Percentage of non-null Region entries labelled 'Other/Unknown'.
# Computed from the data instead of a count copied by hand from an earlier output.
df['Region'].value_counts(normalize=True)['Other/Unknown'] * 100

1.7523364485981308

In [30]:
# Distinct Age_Bucket labels.
df['Age_Bucket'].unique()

array(['>75', '55-65', '65-75', '<55'], dtype=object)

In [32]:
# Distinct Ntm_Speciality labels.
# NOTE(review): the output contains what looks like a garbled label
# ('OBSTETRICS & OBSTETRICS & GYNECOLOGY & OBSTETRICS & GYNECOLOGY') alongside
# 'OBSTETRICS AND GYNECOLOGY' - presumably the same specialty; confirm before grouping.
df['Ntm_Speciality'].unique()

array(['GENERAL PRACTITIONER', 'Unknown', 'ENDOCRINOLOGY', 'RHEUMATOLOGY',
       'ONCOLOGY', 'PATHOLOGY', 'OBSTETRICS AND GYNECOLOGY',
       'PSYCHIATRY AND NEUROLOGY', 'ORTHOPEDIC SURGERY',
       'PHYSICAL MEDICINE AND REHABILITATION',
       'SURGERY AND SURGICAL SPECIALTIES', 'PEDIATRICS',
       'PULMONARY MEDICINE', 'HEMATOLOGY & ONCOLOGY', 'UROLOGY',
       'PAIN MEDICINE', 'NEUROLOGY', 'RADIOLOGY', 'GASTROENTEROLOGY',
       'EMERGENCY MEDICINE', 'PODIATRY', 'OPHTHALMOLOGY',
       'OCCUPATIONAL MEDICINE', 'TRANSPLANT SURGERY', 'PLASTIC SURGERY',
       'CLINICAL NURSE SPECIALIST', 'OTOLARYNGOLOGY', 'HOSPITAL MEDICINE',
       'ORTHOPEDICS', 'NEPHROLOGY', 'GERIATRIC MEDICINE',
       'HOSPICE AND PALLIATIVE MEDICINE',
       'OBSTETRICS & OBSTETRICS & GYNECOLOGY & OBSTETRICS & GYNECOLOGY',
       'VASCULAR SURGERY', 'CARDIOLOGY', 'NUCLEAR MEDICINE'], dtype=object)

In [34]:
# Row count per Ntm_Speciality label, most frequent first.
df['Ntm_Speciality'].value_counts()

GENERAL PRACTITIONER                                              1535
RHEUMATOLOGY                                                       604
ENDOCRINOLOGY                                                      458
Unknown                                                            310
ONCOLOGY                                                           225
OBSTETRICS AND GYNECOLOGY                                           90
UROLOGY                                                             33
ORTHOPEDIC SURGERY                                                  30
CARDIOLOGY                                                          22
PATHOLOGY                                                           16
HEMATOLOGY & ONCOLOGY                                               14
OTOLARYNGOLOGY                                                      14
PEDIATRICS                                                          13
PHYSICAL MEDICINE AND REHABILITATION                                11
PULMON

In [35]:
# Percentage of non-null Ntm_Speciality entries labelled 'Unknown'.
# Computed from the data instead of a count copied by hand from an earlier output.
df['Ntm_Speciality'].value_counts(normalize=True)['Unknown'] * 100

9.05373831775701

In [36]:
# Distinct Ntm_Specialist_Flag labels.
df['Ntm_Specialist_Flag'].unique()

array(['Others', 'Specialist'], dtype=object)

In [38]:
# Distinct Ntm_Speciality_Bucket labels.
df['Ntm_Speciality_Bucket'].unique()

array(['OB/GYN/Others/PCP/Unknown', 'Endo/Onc/Uro', 'Rheum'], dtype=object)

In [47]:
# Distinct Risk_Segment_During_Rx labels.
df['Risk_Segment_During_Rx'].unique()

array(['VLR_LR', 'Unknown', 'HR_VHR'], dtype=object)

In [48]:
# Row count per Risk_Segment_During_Rx label, most frequent first.
df['Risk_Segment_During_Rx'].value_counts()

Unknown    1497
HR_VHR      965
VLR_LR      962
Name: Risk_Segment_During_Rx, dtype: int64

In [49]:
# Percentage of non-null Risk_Segment_During_Rx entries labelled 'Unknown'.
# Computed from the data instead of a count copied by hand from an earlier output.
df['Risk_Segment_During_Rx'].value_counts(normalize=True)['Unknown'] * 100

43.720794392523366

In [50]:
# Distinct Tscore_Bucket_During_Rx labels.
df['Tscore_Bucket_During_Rx'].unique()

array(['<=-2.5', 'Unknown', '>-2.5'], dtype=object)

In [51]:
# Row count per Tscore_Bucket_During_Rx label, most frequent first.
df['Tscore_Bucket_During_Rx'].value_counts()

Unknown    1497
<=-2.5     1017
>-2.5       910
Name: Tscore_Bucket_During_Rx, dtype: int64

In [52]:
# Percentage of non-null Tscore_Bucket_During_Rx entries labelled 'Unknown'.
# Computed from the data instead of a count copied by hand from an earlier output.
df['Tscore_Bucket_During_Rx'].value_counts(normalize=True)['Unknown'] * 100

43.720794392523366

In [53]:
# Distinct Change_T_Score labels.
df['Change_T_Score'].unique()

array(['No change', 'Unknown', 'Worsened', 'Improved'], dtype=object)

In [54]:
# Distinct Change_Risk_Segment labels.
df['Change_Risk_Segment'].unique()

array(['Unknown', 'No change', 'Worsened', 'Improved'], dtype=object)

In [55]:
# Row count per Change_Risk_Segment label, most frequent first.
df['Change_Risk_Segment'].value_counts()

Unknown      2229
No change    1052
Worsened      121
Improved       22
Name: Change_Risk_Segment, dtype: int64

In [56]:
# Percentage of non-null Change_Risk_Segment entries labelled 'Unknown'.
# Computed from the data instead of a count copied by hand from an earlier output.
df['Change_Risk_Segment'].value_counts(normalize=True)['Unknown'] * 100

65.09929906542055

In [57]:
# Distinct Injectable_Experience_During_Rx labels.
df['Injectable_Experience_During_Rx'].unique()

array(['Y', 'N'], dtype=object)

In [58]:
# Distinct Adherent_Flag labels.
df['Adherent_Flag'].unique()

array(['Adherent', 'Non-Adherent'], dtype=object)

In [60]:
# Distinct Idn_Indicator labels.
df['Idn_Indicator'].unique()

array(['N', 'Y'], dtype=object)