In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
%load_ext autoreload
%autoreload 2
import mmm_transformations
import mmm_preprocessing
import mmm_modeling
import mmm_response_curves
import mmm_optimization
from scipy.optimize import curve_fit
from sklearn.preprocessing import MinMaxScaler

# Preprocessing

In [2]:
df = pd.read_csv('bioxcel_pr_definitive_month_level_all.csv')
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,definitive_id,date,shipped_quantity,speaker_npi_ct,pp_imp_tot,pp_imp_desktop,pp_imp_mobile,pp_imp_set_top_box,pp_imp_tablet,pp_imp_connected_device,...,cash_on_hand,total_overhead_costs,est_#_of_er_visits,territory_name,territory_employee,drg_880_patients_in_cohort,2022_agitation_pts,antipsych_tot_rx_qt,hcp_call_tot,top_10_psych_npis_by antipsych_vol
0,1,2022-05,,,,,,,,,...,,,24206.0,PHOENIX,Kelly Sweeney,,67.0,360060.0,,"1841552056,1497897730,1295741791,1356503510,19..."
1,1,2022-06,,,,,,,,,...,,,24206.0,PHOENIX,Kelly Sweeney,,67.0,360060.0,,"1841552056,1497897730,1295741791,1356503510,19..."
2,1,2022-07,,,,,,,,,...,,,24206.0,PHOENIX,Kelly Sweeney,,67.0,360060.0,,"1841552056,1497897730,1295741791,1356503510,19..."
3,1,2022-08,,,,,,,,,...,,,24206.0,PHOENIX,Kelly Sweeney,,67.0,360060.0,,"1841552056,1497897730,1295741791,1356503510,19..."
4,1,2022-09,,,,,,,,,...,,,24206.0,PHOENIX,Kelly Sweeney,,67.0,360060.0,,"1841552056,1497897730,1295741791,1356503510,19..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142495,999580,2023-03,,,,,,,,,...,,13654541.0,59807.0,SEATTLE NORTH,Lori Akiyama,,,,,
142496,999580,2023-04,,,,,,,,,...,,13654541.0,59807.0,SEATTLE NORTH,Lori Akiyama,,,,,
142497,999580,2023-05,,,,,,,,,...,,13654541.0,59807.0,SEATTLE NORTH,Lori Akiyama,,,,,
142498,999580,2023-06,,,,,,,,,...,,13654541.0,59807.0,SEATTLE NORTH,Lori Akiyama,,,,,


In [3]:
df_pp = df.copy()
calls_only = ['call_inperson',
 'call_phone',
 'call_video',
 'call_inperson_gt1_repl',
 'call_phone_gt1_repl',
 'call_video_gt1_repl',
 'call_tot',
 'call_tot_gt1_repl',
 'ed_in_person',
 'ed_phone',
 'ed_video',
 'other_in_person',
 'other_nan',
 'other_phone',
 'other_video',
 'pharmacy_in_person',
 'pharmacy_phone',
 'pharmacy_video',
 'psychiatry_in_person',
 'psychiatry_nan',
 'psychiatry_phone',
 'psychiatry_video',
 'pharmacy_call_tot',
 'psychiatry_call_tot',
 'ed_call_tot',
 'other_call_tot']
media = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot']
binary = ['2023_targets']
financials = ['net_patient_revenue', 'net_income', 'operating_income', 'cash_on_hand', 'total_overhead_costs', 'total_revenues']
clinical = ['est_#_of_er_visits']
snr = [x for x in df_pp.columns if (x.startswith('anti')) | (x.startswith('benzo')) | (x.startswith('payor'))]
non_media_cat = ['formulary_type', 'segmentation', 'flag']
non_media_num = snr + financials + clinical
cols = ['definitive_idn_id', 'idn', 'definitive_id', 'date', 'shipped_quantity', 'confirmed_date'] + media + calls_only + non_media_cat + binary + non_media_num
df_pp = df_pp[cols]
## filter to only ordering accounts
#ordering_acc = df_pp.groupby('definitive_id')['shipped_quantity'].sum().reset_index()
#ordering_acc = ordering_acc[ordering_acc['shipped_quantity']>0]['definitive_id'].tolist()
#df_pp = df_pp[df_pp['definitive_id'].isin(ordering_acc)]
# replace nulls for media cols and shipped quantity with 0
for i in media + calls_only + ['shipped_quantity'] + binary:
    df_pp[i] = df_pp[i].fillna(0)
# replace nulls for non media categorical columns with unknown
for i in non_media_cat:
    df_pp[i] = df_pp[i].fillna('Unknown')
# convert payor mix cols to float
for i in non_media_num:
    if i.startswith('payor'):
        df_pp[i] = df_pp[i].str.replace('%','').astype(float)
df_pp

Unnamed: 0,definitive_idn_id,idn,definitive_id,date,shipped_quantity,confirmed_date,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,...,payor_mix_medicaid_days,payor_mix_private/self-pay/other_days,antipsych_tot_rx_qt,net_patient_revenue,net_income,operating_income,cash_on_hand,total_overhead_costs,total_revenues,est_#_of_er_visits
0,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-05,0.0,,0.0,0.0,0.0,0.0,...,,,360060.0,,,,,,,24206.0
1,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-06,0.0,,0.0,0.0,0.0,0.0,...,,,360060.0,,,,,,,24206.0
2,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-07,0.0,,0.0,0.0,0.0,0.0,...,,,360060.0,,,,,,,24206.0
3,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-08,0.0,,0.0,0.0,0.0,0.0,...,,,360060.0,,,,,,,24206.0
4,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-09,0.0,,0.0,0.0,0.0,0.0,...,,,360060.0,,,,,,,24206.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142495,999580.0,Kaiser Permanente Washington (FKA Group Health...,999580,2023-03,0.0,,0.0,0.0,0.0,0.0,...,2.3,96.7,,1.0,1.0,1.0,,13654541.0,1.0,59807.0
142496,999580.0,Kaiser Permanente Washington (FKA Group Health...,999580,2023-04,0.0,,0.0,0.0,0.0,0.0,...,2.3,96.7,,1.0,1.0,1.0,,13654541.0,1.0,59807.0
142497,999580.0,Kaiser Permanente Washington (FKA Group Health...,999580,2023-05,0.0,,0.0,0.0,0.0,0.0,...,2.3,96.7,,1.0,1.0,1.0,,13654541.0,1.0,59807.0
142498,999580.0,Kaiser Permanente Washington (FKA Group Health...,999580,2023-06,0.0,,0.0,0.0,0.0,0.0,...,2.3,96.7,,1.0,1.0,1.0,,13654541.0,1.0,59807.0


In [32]:
# imputation/NA statistics
for i in non_media_num:
    print(f"{i}:{len(df_pp[df_pp[i].isna()])}")

antipsychotic_im_iv_pack_units:60450
antipsychotic_im_iv_volume_units:60450
antipsychotic_im_iv_wac_dollars:60435
antipsychotic_oral_pack_units:58905
antipsychotic_oral_volume_units:58875
antipsychotic_oral_wac_dollars:58845
benzodiazepine_im_iv_pack_units:57450
benzodiazepine_im_iv_volume_units:57450
benzodiazepine_im_iv_wac_dollars:57450
benzodiazepine_oral_pack_units:56340
benzodiazepine_oral_volume_units:56355
benzodiazepine_oral_wac_dollars:56325
payor_mix_medicare_days:41160
payor_mix_medicaid_days:53835
payor_mix_private/self-pay/other_days:40410
antipsych_tot_rx_qt:45105
net_patient_revenue:42600
net_income:40170
operating_income:42615
cash_on_hand:47625
total_overhead_costs:40050
total_revenues:42480
est_#_of_er_visits:53475


In [4]:
# one-hot encode non media categorical columns
preprocessing = mmm_preprocessing.MMMPreprocessing()
df_pp = preprocessing.one_hot(df_pp, non_media_cat)
df_pp

Unnamed: 0,definitive_idn_id,idn,definitive_id,date,shipped_quantity,confirmed_date,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,...,formulary_type_On Restricted,formulary_type_Unknown,segmentation_A,segmentation_B,segmentation_C,segmentation_D,segmentation_Unknown,flag_Unknown,flag_Wave I,flag_Wave II
0,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-05,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
1,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-06,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
2,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-07,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
3,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-08,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
4,541814.0,VA Desert Pacific Healthcare Network (VISN 22),1,2022-09,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142495,999580.0,Kaiser Permanente Washington (FKA Group Health...,999580,2023-03,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
142496,999580.0,Kaiser Permanente Washington (FKA Group Health...,999580,2023-04,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
142497,999580.0,Kaiser Permanente Washington (FKA Group Health...,999580,2023-05,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
142498,999580.0,Kaiser Permanente Washington (FKA Group Health...,999580,2023-06,0.0,,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0


In [5]:
keep_num = ['shipped_quantity'] + media + calls_only + non_media_cat + binary + non_media_num + [x for x in df_pp.columns if x.startswith('formulary_')]
df_pp['confirmed_date'] = pd.to_datetime(df_pp['confirmed_date'])
j1 = df_pp[['definitive_idn_id', 'date'] + keep_num].groupby(['definitive_idn_id', 'date']).sum().reset_index()
j2 = df_pp[['definitive_idn_id', 'idn', 'confirmed_date']].groupby('definitive_idn_id').agg({'idn':'first', 'confirmed_date':'min'}).reset_index()
df_pp = j1.merge(j2, on='definitive_idn_id', how='left')
df_pp

Unnamed: 0,definitive_idn_id,date,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,...,total_overhead_costs,total_revenues,est_#_of_er_visits,formulary_type_Accessible/non-formulary,formulary_type_Off Formulary,formulary_type_On Formulary,formulary_type_On Restricted,formulary_type_Unknown,idn,confirmed_date
0,786.0,2022-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,808806220.0,1.510556e+09,200566.0,0,0,0,0,3,NCH Healthcare System,NaT
1,786.0,2022-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,808806220.0,1.510556e+09,200566.0,0,0,0,0,3,NCH Healthcare System,NaT
2,786.0,2022-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,808806220.0,1.510556e+09,200566.0,0,0,0,0,3,NCH Healthcare System,NaT
3,786.0,2022-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,808806220.0,1.510556e+09,200566.0,0,0,0,0,3,NCH Healthcare System,NaT
4,786.0,2022-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,808806220.0,1.510556e+09,200566.0,0,0,0,0,3,NCH Healthcare System,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16735,1061529.0,2023-03,0.0,0.0,92.0,0.0,0.0,0.0,0.0,0.0,...,115707252.0,2.072294e+08,13926.0,0,0,0,0,4,Rural Wellness,NaT
16736,1061529.0,2023-04,0.0,0.0,87.0,0.0,0.0,1.0,0.0,0.0,...,115707252.0,2.072294e+08,13926.0,0,0,0,0,4,Rural Wellness,NaT
16737,1061529.0,2023-05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,115707252.0,2.072294e+08,13926.0,0,0,0,0,4,Rural Wellness,NaT
16738,1061529.0,2023-06,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,115707252.0,2.072294e+08,13926.0,0,0,0,0,4,Rural Wellness,NaT


# Transformations

In [6]:
transform = mmm_transformations.MMMTransformations()

In [7]:
df_t = transform.lag_dv(df_pp, 'shipped_quantity', 3, 'definitive_idn_id')
df_t = transform.lag_dv(df_t, 'speaker_npi_ct', 3, 'definitive_idn_id')
df_t = transform.lag_dv(df_t, 'pp_imp_tot', 3, 'definitive_idn_id')
df_t = transform.lag_dv(df_t, 'rxnt_imp', 3, 'definitive_idn_id')
df_t = transform.lag_dv(df_t, 'webmd_imp_tot', 3, 'definitive_idn_id')
#df_t = transform.lag_dv(df_t, 'call_tot', 3, 'definitive_id')
#df_t = transform.lag_dv(df_t, 'call_inperson', 3, 'definitive_id')
#df_t = transform.lag_dv(df_t, 'call_inperson_gt1_repl', 3, 'definitive_id')
#df_t = transform.lag_dv(df_t, 'call_phone_gt1_repl', 3, 'definitive_id')
#df_t = transform.lag_dv(df_t, 'call_video_gt1_repl', 3, 'definitive_id')
df_t = transform.lag_dv(df_t, 'psychiatry_call_tot', 3, 'definitive_idn_id')
df_t = transform.lag_dv(df_t, 'ed_call_tot', 3, 'definitive_idn_id')
df_t = transform.lag_dv(df_t, 'other_call_tot', 3, 'definitive_idn_id')
df_t

Unnamed: 0,definitive_idn_id,date,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,...,webmd_imp_tot_lag3,psychiatry_call_tot_lag1,psychiatry_call_tot_lag2,psychiatry_call_tot_lag3,ed_call_tot_lag1,ed_call_tot_lag2,ed_call_tot_lag3,other_call_tot_lag1,other_call_tot_lag2,other_call_tot_lag3
0,786.0,2022-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,786.0,2022-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,786.0,2022-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,786.0,2022-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,786.0,2022-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16735,1061529.0,2023-03,0.0,0.0,92.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16736,1061529.0,2023-04,0.0,0.0,87.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16737,1061529.0,2023-05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16738,1061529.0,2023-06,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
df_t.describe()

Unnamed: 0,definitive_idn_id,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,call_inperson_gt1_repl,...,webmd_imp_tot_lag3,psychiatry_call_tot_lag1,psychiatry_call_tot_lag2,psychiatry_call_tot_lag3,ed_call_tot_lag1,ed_call_tot_lag2,ed_call_tot_lag3,other_call_tot_lag1,other_call_tot_lag2,other_call_tot_lag3
count,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,...,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0,16740.0
mean,506788.3,0.013262,0.01589,232.336559,61.010573,0.348507,0.588292,0.282616,0.016428,0.525448,...,0.348507,0.311051,0.275627,0.23399,0.294325,0.259379,0.21583,0.169176,0.151135,0.123596
std,376016.0,0.249386,0.29632,951.471025,302.494679,4.365485,1.955605,1.071436,0.157737,1.669978,...,4.365485,1.364099,1.283288,1.165509,1.304403,1.204562,1.037248,1.077578,1.009591,0.88307
min,786.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7229.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,550196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,845346.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1061529.0,15.0,20.0,14673.0,6993.0,198.0,53.0,29.0,4.0,49.0,...,198.0,42.0,42.0,42.0,48.0,48.0,24.0,37.0,37.0,25.0


In [30]:
order_acc = df_t.groupby('definitive_idn_id')['shipped_quantity'].sum().reset_index()
order_acc = order_acc[order_acc['shipped_quantity'] > 0]
df_t = df_t[df_t['definitive_idn_id'].isin(np.unique(order_acc['definitive_idn_id']))]
df_t

Unnamed: 0,definitive_idn_id,date,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,...,webmd_imp_tot_lag3,psychiatry_call_tot_lag1,psychiatry_call_tot_lag2,psychiatry_call_tot_lag3,ed_call_tot_lag1,ed_call_tot_lag2,ed_call_tot_lag3,other_call_tot_lag1,other_call_tot_lag2,other_call_tot_lag3
30,2342.0,2022-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,2342.0,2022-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,2342.0,2022-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,2342.0,2022-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,2342.0,2022-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16615,1058226.0,2023-03,0.0,0.0,2221.0,1033.0,0.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,5.0,4.0,0.0,0.0,0.0
16616,1058226.0,2023-04,4.0,0.0,1940.0,0.0,36.0,8.0,0.0,0.0,...,0.0,1.0,0.0,0.0,10.0,3.0,5.0,1.0,0.0,0.0
16617,1058226.0,2023-05,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,...,0.0,0.0,1.0,0.0,5.0,10.0,3.0,0.0,1.0,0.0
16618,1058226.0,2023-06,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,...,0.0,1.0,0.0,1.0,6.0,5.0,10.0,2.0,0.0,1.0


# EDA

In [31]:
# distribution of promotion activity before first shipment
df_pre_order = pd.DataFrame()
order_acc = df_t.groupby('definitive_idn_id')['shipped_quantity'].sum().reset_index()
order_acc = order_acc[order_acc['shipped_quantity'] > 0]
order_acc['definitive_idn_id']
for i in np.unique(df_t['definitive_idn_id']):
    df_filt = df_t[df_t['definitive_idn_id']==i].reset_index()
    df_filt = df_filt.iloc[:df_filt['shipped_quantity'].values.searchsorted('1', side='right')]
    df_pre_order = pd.concat([df_pre_order, df_filt])

df_stats = df_pre_order.groupby('definitive_idn_id').agg({'date':'count', 'speaker_npi_ct': 'mean', 'pp_imp_tot': 'mean',
                                                      'rxnt_imp': 'mean', 'webmd_imp_tot': 'mean', 'call_tot': 'mean',
                                                      'call_inperson': 'mean', 'call_inperson_gt1_repl': 'mean',
                                                      'call_phone_gt1_repl': 'mean', 'call_video_gt1_repl': 'mean'}).reset_index()

df_stats.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
definitive_idn_id,43.0,427255.651163,396330.778193,2342.0,7093.0,542164.0,780732.0,1058226.0
date,43.0,12.906977,3.076903,6.0,11.0,15.0,15.0,15.0
speaker_npi_ct,43.0,0.031572,0.072167,0.0,0.0,0.0,0.033333,0.3333333
pp_imp_tot,43.0,340.369767,418.026793,0.0,27.6,206.266667,487.6,2235.467
rxnt_imp,43.0,107.324242,175.533405,0.0,9.577778,57.2,135.066667,981.7333
webmd_imp_tot,43.0,0.35969,1.130635,0.0,0.0,0.0,0.0,7.0
call_tot,43.0,3.474932,5.837666,0.0,1.166667,2.266667,3.766667,38.4
call_inperson,43.0,2.082278,3.882846,0.0,0.666667,1.166667,2.266667,25.46667
call_inperson_gt1_repl,43.0,1.891208,3.660825,0.0,0.636364,0.933333,2.0,24.13333
call_phone_gt1_repl,43.0,1.303386,2.065124,0.0,0.233333,0.666667,1.619048,12.53333


In [10]:
df_pp[['definitive_idn_id'] + [x for x in df_pp.columns if x.startswith('formulary_')]].groupby('definitive_idn_id').sum().reset_index().describe()

Unnamed: 0,definitive_idn_id,formulary_type_Accessible/non-formulary,formulary_type_Off Formulary,formulary_type_On Formulary,formulary_type_On Restricted,formulary_type_Unknown
count,1116.0,1116.0,1116.0,1116.0,1116.0,1116.0
mean,506788.3,2.043011,5.30914,1.182796,0.645161,94.099462
std,376173.3,22.928861,24.395257,9.519332,12.561436,150.009177
min,786.0,0.0,0.0,0.0,0.0,15.0
25%,7229.75,0.0,0.0,0.0,0.0,45.0
50%,550196.0,0.0,0.0,0.0,0.0,60.0
75%,845346.2,0.0,0.0,0.0,0.0,105.0
max,1061529.0,570.0,420.0,180.0,390.0,2430.0


In [12]:
# distribution of promotion activity before on formulary
df_pre_form = df_pp.copy()
df_pre_form = df_pre_form[~df_pre_form['confirmed_date'].isna()]
# filter to idns with a confirmed date and at least 50% accounts are on form
form_acc = df_pre_form[['definitive_idn_id'] + [x for x in df_pre_form.columns if x.startswith('form')]].groupby('definitive_idn_id').sum().reset_index()
form_acc['tot_form'] = form_acc[['formulary_type_Accessible/non-formulary', 'formulary_type_Off Formulary', 'formulary_type_On Formulary', 'formulary_type_On Restricted']].sum(axis=1)
form_acc['perc_form'] = form_acc['formulary_type_On Formulary']/form_acc['tot_form']
form_acc = form_acc[form_acc['perc_form']>=0.5]['definitive_idn_id']
df_pre_form = df_pre_form[df_pre_form['definitive_idn_id'].isin(np.unique(form_acc))]
df_pre_form['confirmed_month'] = df_pre_form['confirmed_date'].dt.to_period('M')
keep = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot', 'call_tot', 'call_inperson', 'call_inperson_gt1_repl', 'call_phone_gt1_repl', 'call_video_gt1_repl']
for i in keep:
    df_pre_form[i] = df_pre_form[i].replace(np.nan, 0)
df_pre_form = df_pre_form[df_pre_form['date'] < df_pre_form['confirmed_month']]
df_stats = df_pre_form.groupby('definitive_idn_id').agg({'date':'count', 'speaker_npi_ct': 'mean', 'pp_imp_tot': 'mean',
                                                     'rxnt_imp': 'mean', 'webmd_imp_tot': 'mean', 'call_tot': 'mean',
                                                     'call_inperson': 'mean', 'call_inperson_gt1_repl': 'mean',
                                                     'call_phone_gt1_repl': 'mean', 'call_video_gt1_repl': 'mean'}).reset_index()

df_stats.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
definitive_idn_id,17.0,443943.0,400492.666375,4722.0,7189.0,550060.0,780814.0,1058226.0
date,17.0,9.058824,1.853058,6.0,8.0,9.0,10.0,12.0
speaker_npi_ct,17.0,0.004902,0.020211,0.0,0.0,0.0,0.0,0.08333333
pp_imp_tot,17.0,202.468538,398.227794,0.0,0.0,0.0,208.0,1548.917
rxnt_imp,17.0,96.840939,125.732157,0.0,0.0,65.222222,114.0,434.1
webmd_imp_tot,17.0,0.083333,0.343592,0.0,0.0,0.0,0.0,1.416667
call_tot,17.0,2.00421,1.807047,0.0,0.555556,1.7,2.6,6.583333
call_inperson,17.0,1.418084,1.19187,0.0,0.333333,1.6,2.1,3.75
call_inperson_gt1_repl,17.0,1.282181,1.085942,0.0,0.333333,1.1,2.0,3.583333
call_phone_gt1_repl,17.0,0.521093,0.728017,0.0,0.111111,0.3,0.555556,2.75


In [13]:
# distribution of promotion activity among non-ordering accounts
df_non_order = df_pp.copy()
non_order = df_pp[['definitive_idn_id', 'shipped_quantity']].groupby('definitive_idn_id').sum().reset_index()
non_order = non_order[non_order['shipped_quantity']==0]['definitive_idn_id']
df_non_order = df_non_order[df_non_order['definitive_idn_id'].isin(non_order)]
keep = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot', 'call_tot', 'call_inperson', 'call_inperson_gt1_repl', 'call_phone_gt1_repl', 'call_video_gt1_repl']
for i in keep:
    df_non_order[i] = df_non_order[i].replace(np.nan, 0)
df_stats = df_non_order.groupby('definitive_idn_id').agg({'date':'count', 'speaker_npi_ct': 'mean', 'pp_imp_tot': 'mean',
                                                     'rxnt_imp': 'mean', 'webmd_imp_tot': 'mean', 'call_tot': 'mean',
                                                     'call_inperson': 'mean', 'call_inperson_gt1_repl': 'mean',
                                                     'call_phone_gt1_repl': 'mean', 'call_video_gt1_repl': 'mean'}).reset_index()

df_stats.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
definitive_idn_id,1073.0,509975.487418,375185.779157,786.0,7237.0,550206.0,846981.0,1061529.0
date,1073.0,15.0,0.0,15.0,15.0,15.0,15.0,15.0
speaker_npi_ct,1073.0,0.014601,0.080872,0.0,0.0,0.0,0.0,1.333333
pp_imp_tot,1073.0,224.391861,362.866379,0.0,1.0,67.6,279.133333,2801.8
rxnt_imp,1073.0,58.690463,116.18427,0.0,0.0,12.0,64.0,1046.733
webmd_imp_tot,1073.0,0.341348,1.070894,0.0,0.0,0.0,0.0,13.2
call_tot,1073.0,0.759304,1.498774,0.0,0.0,0.133333,0.866667,18.13333
call_inperson,1073.0,0.51078,1.109988,0.0,0.0,0.066667,0.6,17.6
call_inperson_gt1_repl,1073.0,0.45772,0.926406,0.0,0.0,0.066667,0.533333,10.13333
call_phone_gt1_repl,1073.0,0.222367,0.517549,0.0,0.0,0.0,0.2,5.866667


# Correlations - overall level

In [39]:
df = pd.read_csv('bioxcel_pr_definitive_id_level_all.csv')

df_pp = df.copy()
calls_only = ['call_inperson',
 'call_phone',
 'call_video',
 'call_inperson_gt1_repl',
 'call_phone_gt1_repl',
 'call_video_gt1_repl',
 'call_tot',
 'call_tot_gt1_repl',
 'ed_in_person',
 'ed_phone',
 'ed_video',
 'other_in_person',
 'other_nan',
 'other_phone',
 'other_video',
 'pharmacy_in_person',
 'pharmacy_phone',
 'pharmacy_video',
 'psychiatry_in_person',
 'psychiatry_nan',
 'psychiatry_phone',
 'psychiatry_video',
 'pharmacy_call_tot',
 'psychiatry_call_tot',
 'ed_call_tot',
 'other_call_tot']
media = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot']
binary = ['2023_targets']
financials = ['net_patient_revenue', 'net_income', 'operating_income', 'cash_on_hand', 'total_overhead_costs', 'total_revenues']
clinical = ['est_#_of_er_visits']
snr = [x for x in df_pp.columns if (x.startswith('anti')) | (x.startswith('benzo')) | (x.startswith('payor'))]
non_media_cat = ['formulary_type', 'segmentation', 'flag']
non_media_num = snr + financials + clinical
cols = ['definitive_idn_id', 'idn', 'definitive_id', 'shipped_quantity', 'confirmed_date'] + media + calls_only + non_media_cat + binary + non_media_num
df_pp = df_pp[cols]
## filter to only ordering accounts
#ordering_acc = df_pp.groupby('definitive_id')['shipped_quantity'].sum().reset_index()
#ordering_acc = ordering_acc[ordering_acc['shipped_quantity']>0]['definitive_id'].tolist()
#df_pp = df_pp[df_pp['definitive_id'].isin(ordering_acc)]
# replace nulls for media cols and shipped quantity with 0
for i in media + calls_only + ['shipped_quantity'] + binary:
    df_pp[i] = df_pp[i].fillna(0)
# replace nulls for non media categorical columns with unknown
for i in non_media_cat:
    df_pp[i] = df_pp[i].fillna('Unknown')
# convert payor mix cols to float
for i in non_media_num:
    if i.startswith('payor'):
        df_pp[i] = df_pp[i].str.replace('%','').astype(float)
        
# one-hot encode non media categorical columns
preprocessing = mmm_preprocessing.MMMPreprocessing()
df_pp = preprocessing.one_hot(df_pp, non_media_cat)

# roll up to idn level
keep_num = ['shipped_quantity'] + media + calls_only + non_media_cat + binary + non_media_num + [x for x in df_pp.columns if x.startswith('formulary_')]
j1 = df_pp[['definitive_idn_id'] + keep_num].groupby('definitive_idn_id').sum().reset_index()
j2 = df_pp[['definitive_idn_id', 'idn']].groupby('definitive_idn_id').first().reset_index()
df_pp_idn = j1.merge(j2, on='definitive_idn_id', how='left')
df_pp_idn

Unnamed: 0,definitive_idn_id,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,call_inperson_gt1_repl,...,cash_on_hand,total_overhead_costs,total_revenues,est_#_of_er_visits,formulary_type_Accessible/non-formulary,formulary_type_Off Formulary,formulary_type_On Formulary,formulary_type_On Restricted,formulary_type_Unknown,idn
0,786.0,0.0,0.0,4635.0,971.0,2.0,0.0,0.0,0.0,0.0,...,3.185479e+07,8.088062e+08,1.510556e+09,200566.0,0,0,0,0,3,NCH Healthcare System
1,1657.0,0.0,0.0,792.0,0.0,5.0,1.0,3.0,0.0,1.0,...,2.262910e+09,3.248538e+09,6.316878e+09,385107.0,0,0,0,0,5,UK HealthCare (FKA University of Kentucky Heal...
2,2342.0,2.0,0.0,673.0,0.0,0.0,4.0,0.0,0.0,4.0,...,1.120850e+08,1.490168e+09,1.040426e+10,215930.0,0,0,0,0,9,University of Missouri Health Care (AKA MU Hea...
3,2754.0,0.0,9.0,14138.0,4856.0,10.0,26.0,4.0,1.0,26.0,...,2.345615e+08,5.771124e+09,1.227575e+10,1133704.0,1,0,0,0,11,Montefiore Health System (AKA Montefiore Medic...
4,2760.0,1.0,0.0,3464.0,631.0,0.0,51.0,1.0,0.0,51.0,...,1.107037e+08,7.333335e+08,1.826515e+09,166918.0,0,0,0,0,5,United Health Services
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,1059010.0,0.0,0.0,547.0,233.0,0.0,0.0,0.0,0.0,0.0,...,-1.109574e+07,2.998408e+08,5.030944e+08,162606.0,0,0,0,0,3,Resilience Healthcare
1112,1059592.0,0.0,0.0,6081.0,2031.0,9.0,3.0,0.0,0.0,3.0,...,3.882500e+08,2.698838e+08,2.669334e+09,118686.0,0,0,0,0,3,University of Arkansas for Medical Sciences He...
1113,1059696.0,0.0,0.0,2304.0,557.0,0.0,0.0,0.0,0.0,0.0,...,1.890096e+07,3.084147e+08,3.983740e+08,120950.0,0,1,0,0,2,Insight
1114,1060215.0,0.0,0.0,538.0,152.0,0.0,5.0,3.0,0.0,5.0,...,0.000000e+00,1.868943e+08,0.000000e+00,0.0,0,0,0,0,5,Bureau of Public Health Hospitals


In [15]:
df_pp[df_pp['shipped_quantity']>0][['shipped_quantity', 'psychiatry_in_person']].corr()

Unnamed: 0,shipped_quantity,psychiatry_in_person
shipped_quantity,1.0,0.300657
psychiatry_in_person,0.300657,1.0


In [16]:
df_pp_idn[df_pp_idn['shipped_quantity']>0][['shipped_quantity', 'psychiatry_in_person']].corr()

Unnamed: 0,shipped_quantity,psychiatry_in_person
shipped_quantity,1.0,-0.016409
psychiatry_in_person,-0.016409,1.0


In [18]:
# only 54 definitive ids with a valud definitive idn id
# removing those 15 accounts significantly changes the correlation analysis for calls
# those 15 accounts have the strongest correlations with ordering
channels = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot'] + calls_only
df_pp[df_pp['shipped_quantity']>0][['definitive_id', 'definitive_idn_id', 'shipped_quantity'] + channels].describe()

Unnamed: 0,definitive_id,definitive_idn_id,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,...,pharmacy_phone,pharmacy_video,psychiatry_in_person,psychiatry_nan,psychiatry_phone,psychiatry_video,pharmacy_call_tot,psychiatry_call_tot,ed_call_tot,other_call_tot
count,69.0,54.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,...,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0
mean,86419.64,397043.9,4.463768,0.15942,1508.072464,433.913043,2.275362,11.507246,8.15942,0.217391,...,0.0,0.0,5.695652,0.0,1.521739,0.072464,0.0,7.289855,5.144928,3.637681
std,209750.5,390776.3,5.69463,0.558969,1937.730459,662.31705,5.94802,14.163565,12.742499,0.661317,...,0.0,0.0,9.136714,0.0,3.475158,0.312431,0.0,10.159176,8.767043,9.554751
min,40.0,2342.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1725.0,7059.25,1.0,0.0,47.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3784.0,408080.5,2.0,0.0,513.0,135.0,0.0,7.0,4.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
75%,5567.0,581825.0,6.0,0.0,2636.0,631.0,0.0,19.0,11.0,0.0,...,0.0,0.0,8.0,0.0,1.0,0.0,0.0,11.0,8.0,2.0
max,1047935.0,1058226.0,36.0,3.0,7654.0,3373.0,28.0,80.0,62.0,3.0,...,0.0,0.0,41.0,0.0,17.0,2.0,0.0,42.0,45.0,54.0


In [19]:
# only 54 definitive ids with a valud definitive idn id
# removing those 15 accounts significantly changes the correlation analysis for calls
chk = df_pp[df_pp['shipped_quantity']>0][['definitive_id', 'definitive_idn_id', 'shipped_quantity'] + channels]
chk = chk.dropna()
chk.corr()

Unnamed: 0,definitive_id,definitive_idn_id,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,...,pharmacy_phone,pharmacy_video,psychiatry_in_person,psychiatry_nan,psychiatry_phone,psychiatry_video,pharmacy_call_tot,psychiatry_call_tot,ed_call_tot,other_call_tot
definitive_id,1.0,-0.078535,-0.118645,-0.149109,-0.191513,-0.077581,-0.153539,-0.301334,-0.130514,-0.134455,...,,,-0.147706,,-0.039969,0.028256,,-0.143641,-0.113355,0.177928
definitive_idn_id,-0.078535,1.0,-0.005731,-0.099592,-0.075449,0.092355,0.213262,0.046155,-0.104785,0.145326,...,,,-0.01211,,-0.091859,-0.048319,,-0.047161,0.082311,-0.239192
shipped_quantity,-0.118645,-0.005731,1.0,-0.042489,-0.153147,-0.04692,-0.074365,-0.01985,0.284619,-0.015886,...,,,-0.001888,,0.216138,-0.095562,,0.07735,-0.210869,-0.210746
speaker_npi_ct,-0.149109,-0.099592,-0.042489,1.0,0.200319,0.192375,0.193586,0.114365,0.031127,-0.092368,...,,,0.24071,,0.387224,-0.072807,,0.355686,0.020622,-0.109939
pp_imp_tot,-0.191513,-0.075449,-0.153147,0.200319,1.0,0.653845,0.394955,0.222995,0.149738,-0.02096,...,,,0.291785,,0.171987,0.036574,,0.322228,0.28456,0.215955
rxnt_imp,-0.077581,0.092355,-0.04692,0.192375,0.653845,1.0,0.511118,0.156623,0.048411,-0.095125,...,,,0.482895,,0.205705,0.1841,,0.507394,0.284273,0.240442
webmd_imp_tot,-0.153539,0.213262,-0.074365,0.193586,0.394955,0.511118,1.0,0.054988,-0.102288,-0.125168,...,,,0.179404,,-0.061656,0.092857,,0.13681,0.182235,-0.046099
call_inperson,-0.301334,0.046155,-0.01985,0.114365,0.222995,0.156623,0.054988,1.0,0.389394,0.055456,...,,,0.475449,,0.141418,-0.120155,,0.466241,0.547504,0.081121
call_phone,-0.130514,-0.104785,0.284619,0.031127,0.149738,0.048411,-0.102288,0.389394,1.0,0.049606,...,,,0.053815,,0.659953,-0.1087,,0.294496,0.017266,-0.005277
call_video,-0.134455,0.145326,-0.015886,-0.092368,-0.02096,-0.095125,-0.125168,0.055456,0.049606,1.0,...,,,0.058155,,0.021744,0.201011,,0.065938,0.063611,0.086997


In [20]:
chk = df_pp[df_pp['shipped_quantity']>0][['definitive_id', 'definitive_idn_id', 'shipped_quantity'] + channels]
chk[chk['definitive_idn_id'].isna()].corr()

Unnamed: 0,definitive_id,definitive_idn_id,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,...,pharmacy_phone,pharmacy_video,psychiatry_in_person,psychiatry_nan,psychiatry_phone,psychiatry_video,pharmacy_call_tot,psychiatry_call_tot,ed_call_tot,other_call_tot
definitive_id,1.0,,0.099677,0.417238,0.016655,-0.152866,0.015444,-0.19214,0.140861,0.280999,...,,,-0.026113,,0.204169,0.213403,,0.019999,-0.408483,-0.449966
definitive_idn_id,,,,,,,,,,,...,,,,,,,,,,
shipped_quantity,0.099677,,1.0,0.010917,-0.052765,-0.136894,0.007439,0.751347,-0.110846,0.242156,...,,,0.671177,,-0.06811,-0.132036,,0.623214,0.48714,0.003626
speaker_npi_ct,0.417238,,0.010917,1.0,-0.200816,-0.272588,-0.104828,-0.179801,0.167902,-0.186989,...,,,-0.010912,,0.651635,-0.104828,,0.115522,-0.293241,-0.153279
pp_imp_tot,0.016655,,-0.052765,-0.200816,1.0,0.945103,0.783562,0.237271,0.420694,0.415656,...,,,-0.009451,,-0.097185,-0.149589,,-0.031427,0.592069,-0.156645
rxnt_imp,-0.152866,,-0.136894,-0.272588,0.945103,1.0,0.664481,0.175063,0.327006,0.275324,...,,,0.039543,,-0.173879,-0.185738,,-0.000631,0.609905,-0.181247
webmd_imp_tot,0.015444,,0.007439,-0.104828,0.783562,0.664481,1.0,0.190577,0.359561,0.637059,...,,,-0.168526,,-0.096003,-0.071429,,-0.181045,0.549477,-0.069628
call_inperson,-0.19214,,0.751347,-0.179801,0.237271,0.175063,0.190577,1.0,0.156905,0.033994,...,,,0.776796,,0.006099,-0.040838,,0.740465,0.772463,0.098195
call_phone,0.140861,,-0.110846,0.167902,0.420694,0.327006,0.359561,0.156905,1.0,0.269668,...,,,-0.138365,,0.363136,-0.069461,,-0.061913,0.063884,-0.105548
call_video,0.280999,,0.242156,-0.186989,0.415656,0.275324,0.637059,0.033994,0.269668,1.0,...,,,-0.256404,,-0.171247,0.254824,,-0.272395,0.178207,-0.161461


In [21]:
df_pp_idn_ord = df_pp_idn[df_pp_idn['shipped_quantity'] > 0]
cor, features, cts = [], [], []
channels = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot'] + calls_only
for i in channels + non_media_num:
    df_filt = df_pp_idn_ord[~df_pp_idn_ord[i].isna()]
    if i not in channels:
        df_filt = df_filt[df_filt[i] != 0]
    cor.append(np.corrcoef(df_filt['shipped_quantity'], df_filt[i])[0][1])
    cts.append(len(df_filt[df_filt[i]!=0]))
    features.append(i)
pd.DataFrame({'feature': features, 'cor': cor, 'n': cts}).sort_values(['cor'], ascending=False)

  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0,feature,cor,n
32,antipsychotic_im_iv_wac_dollars,0.317812,42
31,antipsychotic_im_iv_volume_units,0.285695,42
35,antipsychotic_oral_wac_dollars,0.269699,42
33,antipsychotic_oral_pack_units,0.228589,42
34,antipsychotic_oral_volume_units,0.216495,42
8,call_phone_gt1_repl,0.209837,39
5,call_phone,0.2098,39
9,call_video_gt1_repl,0.15838,11
30,antipsychotic_im_iv_pack_units,0.140101,42
6,call_video,0.131448,11


In [40]:
df = pd.read_csv('bioxcel_pr_definitive_month_level_all.csv')
df_monthly = df[['definitive_idn_id', 'shipped_quantity', 'call_inperson_gt1_repl']].fillna(0)
df_monthly = df_monthly.groupby('definitive_idn_id').mean().reset_index().rename(columns={'shipped_quantity':'avg_monthly_shipped_quantity', 'call_inperson_gt1_repl':'avg_monthly_call_inperson_gt1_repl'})
df_pp_idn = df_pp_idn.merge(df_monthly, on='definitive_idn_id', how='left')
df_pp_idn['antipsychotic_im_iv_volume_units'] = df_pp_idn['antipsychotic_im_iv_volume_units'].astype(float)
df_pp_idn['decile_antipsychotic_im_iv_volume_units'] = pd.qcut(df_pp_idn['antipsychotic_im_iv_volume_units'], 10, labels=False, duplicates='drop')
df_pp_idn['cash_on_hand'] = df_pp_idn['cash_on_hand'].astype(float)
df_pp_idn['decile_cash_on_hand'] = pd.qcut(df_pp_idn['cash_on_hand'], 10, labels=False, duplicates='drop')
df_pp_idn.to_csv('ysp_idn.csv', index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [34]:
df_pp_idn

Unnamed: 0,definitive_idn_id,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_inperson,call_phone,call_video,call_inperson_gt1_repl,...,total_revenues,est_#_of_er_visits,formulary_type_Accessible/non-formulary,formulary_type_Off Formulary,formulary_type_On Formulary,formulary_type_On Restricted,formulary_type_Unknown,idn,avg_monthly_shipped_quantity,avg_monthly_call_inperson_gt1_repl
0,786.0,0.0,0.0,4635.0,971.0,2.0,0.0,0.0,0.0,0.0,...,1.510556e+09,200566.0,0,0,0,0,3,NCH Healthcare System,0.000000,0.000000
1,1657.0,0.0,0.0,792.0,0.0,5.0,1.0,3.0,0.0,1.0,...,6.316878e+09,385107.0,0,0,0,0,5,UK HealthCare (FKA University of Kentucky Heal...,0.000000,0.013333
2,2342.0,2.0,0.0,673.0,0.0,0.0,4.0,0.0,0.0,4.0,...,1.040426e+10,215930.0,0,0,0,0,9,University of Missouri Health Care (AKA MU Hea...,0.014815,0.029630
3,2754.0,0.0,9.0,14138.0,4856.0,10.0,26.0,4.0,1.0,26.0,...,1.227575e+10,1133704.0,1,0,0,0,11,Montefiore Health System (AKA Montefiore Medic...,0.000000,0.144444
4,2760.0,1.0,0.0,3464.0,631.0,0.0,51.0,1.0,0.0,51.0,...,1.826515e+09,166918.0,0,0,0,0,5,United Health Services,0.013333,0.680000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,1059010.0,0.0,0.0,547.0,233.0,0.0,0.0,0.0,0.0,0.0,...,5.030944e+08,162606.0,0,0,0,0,3,Resilience Healthcare,0.000000,0.000000
1112,1059592.0,0.0,0.0,6081.0,2031.0,9.0,3.0,0.0,0.0,3.0,...,2.669334e+09,118686.0,0,0,0,0,3,University of Arkansas for Medical Sciences He...,0.000000,0.066667
1113,1059696.0,0.0,0.0,2304.0,557.0,0.0,0.0,0.0,0.0,0.0,...,3.983740e+08,120950.0,0,1,0,0,2,Insight,0.000000,0.000000
1114,1060215.0,0.0,0.0,538.0,152.0,0.0,5.0,3.0,0.0,5.0,...,0.000000e+00,0.0,0,0,0,0,5,Bureau of Public Health Hospitals,0.000000,0.066667
