In [1]:
import os
import sys
import pandas as pd
import re

In [17]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# QC the custom HCP predictors 

In [3]:
df_HCP = pd.read_excel(r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\Pfizer_mCRPC_HCPHCOS.xlsx')

In [4]:
df_HCP.head()

Unnamed: 0,PATIENT_ID,PROVIDER_ID,PN_FLAG,PRI_SPCL_DESC,IMS_ORG_ID,BUSINESS_NAME,COT_CLASSIFICATION,COT_FACILITY_TYPE,HCOS_MCRPC_DECILE,HCP_ZX_DECILE,HCP_MCRPC_DECILE,HCP_AGONIST_PAT_CNT,HCP_ANTAGONIST_PAT_CNT,HCP_ADT_SCORE,HCP_REGION,HHI
0,839544914,6804099,N,MEDICAL ONCOLOGY,INS00052246,Comprehensive Cancer Centers of Nevada Central...,Outpatient Center,Medical Group,10.0,10.0,10,61.0,20.0,0.506173,WEST,0.528096
1,582866916,6805982,N,UROLOGY,,,,,,,7,6.0,,1.0,WEST,0.78125
2,171011889,6814767,N,UROLOGY,INS00120362,"Michigan Institute of Urology, PC",Outpatient Center,Medical Group,10.0,,3,1.0,,1.0,MIDWEST,1.0
3,645101969,6839361,N,HEMATOLOGY/ONCOLOGY,INS00115109,Saint Lukes Mountain States Tumor Institute,Outpatient Center,Medical Group,4.0,1.0,3,,,,WEST,1.0
4,152961559,6841478,N,HEMATOLOGY/ONCOLOGY,INS00041246,Medical Group of the Carolinas Hematology Onco...,Outpatient Center,Medical Group,7.0,1.0,5,1.0,,1.0,SOUTH,0.52


### Check if numeric columns have negative values

In [47]:
# See if any numeric column has negative values
(df_HCP.loc[:,df_HCP.apply(pd.api.types.is_numeric_dtype, reduce=False)] < 0).any()

PATIENT_ID                False
PROVIDER_ID               False
HCOS_MCRPC_DECILE         False
HCP_ZX_DECILE             False
HCP_MCRPC_DECILE          False
HCP_AGONIST_PAT_CNT       False
HCP_ANTAGONIST_PAT_CNT    False
HCP_ADT_SCORE              True
HHI                       False
dtype: bool

In [52]:
# ADT Preference score is a score ranging from -1.0 to +1.0, indicating preference for 
# ADT agonist or antagonist after initial diagnosis.  It is calculated in the following manner:
(df_HCP['HCP_ADT_SCORE'].unique() >=-1) & (df_HCP['HCP_ADT_SCORE'].unique() <= 1)

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True], dtype=bool)

### Check number of unique values for each categorical column 

In [24]:
# We need to know the number of unique values for the following columns to decide on making dummy variables
ls_object_columns = ['PROVIDER_ID', 'PRI_SPCL_DESC', 'IMS_ORG_ID', 'BUSINESS_NAME', 'COT_CLASSIFICATION', 'COT_FACILITY_TYPE', 'HCP_REGION']
df_HCP[ls_object_columns].nunique()

PROVIDER_ID           27056
PRI_SPCL_DESC           140
IMS_ORG_ID            12389
BUSINESS_NAME         11258
COT_CLASSIFICATION       12
COT_FACILITY_TYPE        27
HCP_REGION                4
dtype: int64

In [18]:
df_HCP['COT_CLASSIFICATION'].unique()
df_HCP['COT_FACILITY_TYPE'].unique()

array(['Outpatient Center', nan, 'Hospital', 'Residential', 'Insurance',
       'Academic', 'Laboratory', 'Research', 'Miscellaneous', 'Pharmacy',
       'Emergency', 'Elder Care', 'Veterinary Medicine'], dtype=object)

array(['Medical Group', nan, 'Independent Physician Practice',
       'Acute Care Hospital', 'Imaging Center', 'Home Health',
       'Managed Care', 'Outpatient Surgi Center', 'Nursing Homes',
       'Medical School', 'Clinic', 'Clinical Laboratory',
       'Other Laboratory', 'Research Institute', 'Rehabilitation Hospital',
       'Physical Medicine and Rehabilitation', 'Non-health Delivery',
       'Psychiatric Hospital', 'Assisted Living', 'Blood Bank',
       'Closed-door', 'Health Department', 'Outpatient Pharmacy', 'Rescue',
       'Adult Day Care', 'Hospital/Clinic', 'Ambulance',
       'Research Laboratory'], dtype=object)

### Check if patient ids map correctly

In [19]:
df_HCP.shape

(70883, 16)

In [20]:
df_patient_id = pd.read_excel(r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\Pfizer_mCRPC_Patients.xlsx')

In [32]:
# Patient IDs map perfectly
df_HCP['PATIENT_ID'].nunique()
set_patient_ids = set(df_patient_id['PATIENT_ID'])
df_HCP['PATIENT_ID'].isin(set_patient_ids).all()

70883

True

In [37]:
# And the labels are consistent
df_merged = df_patient_id[['PATIENT_ID', 'PN_FLAG']].merge(df_HCP[['PATIENT_ID', 'PN_FLAG']], how = 'inner', on = 'PATIENT_ID')
(df_merged['PN_FLAG_x'] == df_merged['PN_FLAG_y']).all()

True

# Create dummy variables

In [53]:
ls_cat_columns = ['PRI_SPCL_DESC', 'COT_CLASSIFICATION', 'COT_FACILITY_TYPE', 'HCP_REGION']
df_cat = df_HCP[ls_cat_columns]

In [59]:
df_dummies = pd.get_dummies(df_cat)

### Clean up column names (no spaces, lower case)

In [61]:
ls_colnames = df_dummies.columns.tolist()

In [62]:
sys.path.append('F:\Frederik\pfizer_mcrpc\paqc')
from paqc.utils import utils

In [66]:
ls_colnames_clean = [utils.clean_string(colname) for colname in ls_colnames]
df_dummies.columns = ls_colnames_clean

### Merge with the original dataframe

In [74]:
# Drop the unused columns (the ones changed into dummies and the ones that we don't use)
ls_deleted_columns = ['PROVIDER_ID', 'PN_FLAG', 'IMS_ORG_ID', 'BUSINESS_NAME', 'PRI_SPCL_DESC', 
                      'COT_CLASSIFICATION', 'COT_FACILITY_TYPE', 'HCP_REGION']
df_HCP = df_HCP.drop(ls_deleted_columns, axis=1)
df_HCP.shape

(70883, 8)

In [81]:
# Done
df_total = pd.concat([df_HCP, df_dummies], axis=1)
df_total.shape

(70883, 191)

# Export to csv

In [82]:
df_total.to_csv(r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\Pfizer_mCRPC_HCPHCOS_WITH_DUMMIES.csv')