In [2]:
import pandas as pd
import re
import numpy as np

In [47]:
# Take the subset of var_config features that need to be mapped to a description
var_config = pd.read_csv('04_with_datediff_and_dummies_no_flags_var_config.csv')
var_config = var_config.iloc[4:3528, :]
var_config.tail()

Unnamed: 0,Column,Type,NumUniqueValues
3523,s62_max_min_dd,numerical,580
3524,s63_claim_cnt,numerical,46
3525,s63_index_max_dd,numerical,682
3526,s63_index_min_dd,numerical,704
3527,s63_max_min_dd,numerical,523


In [48]:
# Take only the code part of the predictors
var_config_code = var_config['Column'].apply(lambda x: x.split('_')[0])

In [28]:
# The series that will step by step be filled for the different groups of features
df_descriptions = pd.DataFrame(np.nan, index=range(4,3528), columns = ['Code', 'Description', 'Class'])

In [50]:
df_descriptions.tail()

Unnamed: 0,Code,Description,Class
3523,,,
3524,,,
3525,,,
3526,,,
3527,,,


# GPI-6 

In [7]:
df_prod = pd.read_csv(r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\NDW Clinical Codes\V_PRODUCT.csv', dtype = {'GPI6_CD': str})

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
df_prod.head()

Unnamed: 0,PRODUCT_ID,NDC,CMF_PPK_NBR,CMF_PROD_NBR,CMF_PACK_NBR,MKTED_PROD_FORMLTN_NM,MKTED_PROD_FORMLTN_SHORT_NM,MKTED_PROD_DETL_NM,MKTED_PROD_TYP_CD,MKTED_PROD_NM,...,GPI_THPTC_CLAS_ID,GPI_THPTC_CLAS_DESC,GENERIC_THPTC_CLAS_ID,GENERIC_THPTC_CLAS_DESC,AHFS_CD,AHFS_DESC,THPTC_CLAS_ID,THPTC_CLAS_DESC,CMF_MKTED_PROD_NM,CMF_USC_CD
0,1093553,53191006301,,,,E-MAX-1000 ORAL CAPSULE 1000 UNIT,E-MAX-1000 ORAL CAPSULE 1000 UNIT,E-MAX-1000 OTC,B,E-MAX-1000,...,14.0,NUTRITIONAL PRODUCTS,95.0,VITAMINS,88200000.0,VITAMIN E,80.0,FAT SOLUBLE VITAMINS,,
1,1248854,55629051204,,,,E-MINIC ORAL,E-MINIC ORAL,E-MINIC OTC,B,E-MINIC,...,6.0,ANTIHISTAMINES/NASAL AGENTS/COUGH & COLD/RESPI...,18.0,ANTIHISTAMINE AND DECONGESTANT COMBINATION,40420000.0,PROPYLAMINE DERIVATIVES,99.0,MISCELLANEOUS,,
2,1248853,55629051104,,,,E-MINIC ORAL,E-MINIC ORAL,E-MINIC OTC,B,E-MINIC,...,6.0,ANTIHISTAMINES/NASAL AGENTS/COUGH & COLD/RESPI...,50.0,COUGH/COLD PREPARATIONS,48160000.0,EXPECTORANTS,16.0,COUGH PREPARATIONS/EXPECTORANTS,,
3,1023989,247124630,,,,E-MYCIN ORAL TABLET ENTERICCOATED 333 MG,E-MYCIN ORAL TABLET ENTERICCOATED 333 MG,E-MYCIN RX,T,E-MYCIN,...,1.0,ANTI-INFECTIVE AGENTS,19.0,ANTIBIOTICS,8121200.0,MACROLIDES,25.0,ERYTHROMYCINS,,
4,1024033,247136130,,,,E-MYCIN ORAL TABLET ENTERICCOATED 250 MG,E-MYCIN ORAL TABLET ENTERICCOATED 250 MG,E-MYCIN RX,T,E-MYCIN,...,1.0,ANTI-INFECTIVE AGENTS,19.0,ANTIBIOTICS,8121200.0,MACROLIDES,25.0,ERYTHROMYCINS,,


In [51]:
gpi6_colnames = ['GPI6_CD', 'GPI6_DESC']

In [52]:
df_prod = df_prod[gpi6_colnames].drop_duplicates()

In [53]:
df_prod.head()

Unnamed: 0,GPI6_CD,GPI6_DESC
0,772030,VITAMIN E***
1,439930,DECONGESTANT & ANTIHISTAMINE***
2,439962,DECONGESTANT W/ EXPECTORANT***
3,31000,ERYTHROMYCINS***
20,12000,AMINOPENICILLINS***


In [54]:
prog = re.compile('^g[0-9]+')
gpi6 = var_config['Column'].apply(prog.search)
ss_matches = gpi6[gpi6.apply(bool)].apply(lambda x: x.group(0)[1:])

In [55]:
df_matches = pd.DataFrame({'GPI6_CD':ss_matches})

In [56]:
df_gpi6 = df_matches.reset_index().merge(df_prod[gpi6_colnames], on = 'GPI6_CD', how = 'left').set_index('index')

In [57]:
df_gpi6 = df_gpi6.rename(columns = {'GPI6_CD': 'Code', 'GPI6_DESC': 'Description'})

In [58]:
df_descriptions.loc[df_gpi6.index,:] = df_gpi6
df_descriptions.loc[df_gpi6.index, 'Class'] = 'GPI-6'

In [59]:
# Seems good, code and descriptions filled together
df_descriptions.apply(pd.isnull).sum()

Code           2948
Description    2948
Class          2948
dtype: int64

#  CPT/HCPCS

In [60]:
df_proc = pd.read_excel(r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\NDW Clinical Codes\V_PROCEDURE.xlsx', 
                        dtype = {'DIAG_STD_CD': str})

In [61]:
df_proc = df_proc[['PRC_STD_CD', 'PRC_DESC']].drop_duplicates()

In [62]:
# HCPCS
codes_hcpcs = pd.DataFrame({'PRC_STD_CD':
                            var_config_code[var_config_code.apply(lambda x: x[0] == 'h')].apply(lambda x: x[1:].upper())})

df_hcpcs = codes_hcpcs.reset_index().merge(df_proc, on = 'PRC_STD_CD', how = 'left').set_index('index')
df_hcpcs = df_hcpcs.rename(columns = {'PRC_STD_CD': 'Code', 'PRC_DESC': 'Description'})

df_descriptions.loc[df_hcpcs.index,:] = df_hcpcs
df_descriptions.loc[df_hcpcs.index, 'Class'] = 'HCPCS'

In [63]:
# CPT
codes_cpt = pd.DataFrame({'PRC_STD_CD':
                            var_config_code[var_config_code.apply(lambda x: x[0] == 'c')].apply(lambda x: x[1:].upper())})

df_cpt = codes_cpt.reset_index().merge(df_proc, on = 'PRC_STD_CD', how = 'left').set_index('index')
df_cpt = df_cpt.rename(columns = {'PRC_STD_CD': 'Code', 'PRC_DESC': 'Description'})

df_descriptions.loc[df_cpt.index,:] = df_cpt
df_descriptions.loc[df_cpt.index,'Class'] = 'CPT'

In [64]:
# Seems good, code and descriptions filled together
df_descriptions.apply(pd.isnull).sum()

Code           1360
Description    1360
Class          1360
dtype: int64

#  ICD-9 level 4

In [65]:
df_diag = pd.read_excel(r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\NDW Clinical Codes\V_DIAG_XREF_ICD9.xlsx', 
                        dtype = {'DIAG_STD_CD': str, 'DIAG_CD': str})

In [66]:
df_diag = df_diag[['DIAG_CD', 'DIAG_DESC']].drop_duplicates()

In [67]:
# ICD
codes_icd = pd.DataFrame({'DIAG_CD':
                            var_config_code[var_config_code.apply(lambda x: x[0] == 'd')].apply(lambda x: x[1:].upper())})

def take_first_4(str_code):
    if len(str_code) < 5:
        return(str_code)
    else:
        return(str_code[:4])
    

df_diag['DIAG_CD'] = df_diag['DIAG_CD'].apply(take_first_4)

df_icd = codes_icd.reset_index().merge(df_diag, on = 'DIAG_CD', how = 'left').set_index('index')
df_icd = df_icd.rename(columns = {'DIAG_CD': 'Code', 'DIAG_DESC': 'Description'})

df_descriptions.loc[df_icd.index,:] = df_icd
df_descriptions.loc[df_icd.index,'Class'] = 'ICD-9'

In [68]:
# Seems good, code and descriptions filled together
df_descriptions.apply(pd.isnull).sum()

Code           252
Description    252
Class          252
dtype: int64

# Speciality

In [69]:
df_spec_ref = pd.read_excel(r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\specialty_ref.xlsx')

In [70]:
codes_spec = pd.DataFrame({'SPEC_PREDICTOR_ID':
                            var_config_code[var_config_code.apply(lambda x: x[0] == 's')].apply(lambda x: int(x[1:].upper()))})

df_spec = codes_spec.reset_index().merge(df_spec_ref, on = 'SPEC_PREDICTOR_ID', how = 'left').set_index('index')
df_spec = df_spec.rename(columns = {'SPEC_PREDICTOR_ID': 'Code', 'SPEC_PREDICTOR_LIST': 'Description'})

df_descriptions.loc[df_spec.index,:] = df_spec
df_descriptions.loc[df_spec.index, 'Class'] = 'Specialty'

In [71]:
# None without description!
df_descriptions.apply(pd.isnull).sum()

Code           0
Description    0
Class          0
dtype: int64

# Merge with var_config

In [72]:
var_config_full = pd.read_csv('04_with_datediff_and_dummies_no_flags_var_config.csv')
var_config_full['Code'] = np.nan
var_config_full['Description'] = np.nan
var_config_full['Class'] = np.nan
var_config_full[['Code', 'Description', 'Class']] = df_descriptions
var_config_full.head()

Unnamed: 0,Column,Type,NumUniqueValues,Code,Description,Class
0,patient_id,numerical,70883,,,
1,pn_flag,numerical,2,,,
2,age_index,numerical,67,,,
3,age_pc,numerical,67,,,
4,c00142_claim_cnt,numerical,8,142.0,ANESTHESIA FOR PROCEDURES ON EYE; LENS SURGERY,CPT


In [73]:
var_config_full.to_csv('04_with_datediff_and_dummies_no_flags_var_config.csv', index=False)

# Merge with bivariate stats

Was only necessary because bivariate stats was run before description columns were added to the var_config file

In [544]:
bivar_stats = pd.read_csv('01_pfizer_total_with_ddbivar_stats_y_flag.csv')
bivar_stats['Code'] = np.nan
bivar_stats['Description'] = np.nan
bivar_stats[['Code', 'Description']] = df_descriptions
bivar_stats.head()

Unnamed: 0,Variable,Type,NumUniqueValues,Variable_Type,Num Unique Values,Non Missing,Missing_N,Missing_P,Above 0,Positive Above 0,...,Positive Max,Negative Min,Negative P10,Negative P25,Negative P50,Negative P75,Negative P90,Negative Max,Code,Description
0,age_index,numerical,67,,67,70883,0,0.0,70883,3106,...,85,19.0,59.0,65.0,73.0,81.0,85.0,85,,
1,age_pc,numerical,67,,67,70883,0,0.0,70883,3106,...,85,19.0,57.0,63.0,70.0,78.0,85.0,85,,
2,c00142_claim_cnt,numerical,8,,8,70883,0,0.0,1575,69,...,4,0.0,0.0,0.0,0.0,0.0,0.0,8,,
3,c00142_flag,numerical,2,,2,70883,0,0.0,1575,69,...,1,0.0,0.0,0.0,0.0,0.0,0.0,1,,
4,c00142_index_max_dd,numerical,631,Date_Difference,631,1575,69308,0.97778,1572,69,...,723,0.0,76.0,166.25,349.0,552.0,664.0,730,142.0,ANESTHESIA FOR PROCEDURES ON EYE; LENS SURGERY


In [557]:
ls_columns = bivar_stats.columns.tolist()
ls_columns_reordered = ls_columns[:4] + ls_columns[-2:] + ls_columns[4:-2]

In [561]:
bivar_stats = bivar_stats[ls_columns_reordered]

In [562]:
bivar_stats.to_csv('01_pfizer_total_with_ddbivar_stats_y_flag_with_descriptions.csv', index=False)