In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
%load_ext autoreload
%autoreload 2
import mmm_transformations
import mmm_preprocessing
import mmm_modeling
import mmm_response_curves
import mmm_optimization

# Preprocessing

In [2]:
df = pd.read_csv('bioxcel_pr_definitive_month_level.csv')
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,definitive_id,date,shipped_quantity,speaker_npi_ct,pp_imp_tot,pp_imp_desktop,pp_imp_mobile,pp_imp_set_top_box,pp_imp_tablet,pp_imp_connected_device,...,psychiatric_unit_beds,patients_discharged_on_multiple_antipsychotic_medications_with_appropriate_justification_rate,_hours_of_seclusion_rate,hours_of_physical_restraint_use_rate,total_revenues,operating_income,net_income,cash_on_hand,total_overhead_costs,est_#_of_er_visits
0,1,2022-05,,,,,,,,,...,,,,,,,,,,24206.0
1,1,2022-06,,,,,,,,,...,,,,,,,,,,24206.0
2,1,2022-07,,,,,,,,,...,,,,,,,,,,24206.0
3,1,2022-08,,,,,,,,,...,,,,,,,,,,24206.0
4,1,2022-09,,,,,,,,,...,,,,,,,,,,24206.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75707,999006,2023-02,,,,,,,,,...,,,,,,,,,,
75708,999006,2023-03,,,8.0,,,,,,...,,,,,,,,,,
75709,999006,2023-04,,,1.0,,1.0,,,,...,,,,,,,,,,
75710,999006,2023-05,,,,,,,,,...,,,,,,,,,,


In [37]:
df_pp = df.copy()
media = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot', 'call_tot', 'call_inperson', 'call_inperson_gt1_repl', 'call_phone_gt1_repl', 'call_video_gt1_repl']
binary = ['2023_targets']
financials = ['net_patient_revenue', 'net_income', 'operating_income', 'cash_on_hand', 'total_overhead_costs', 'total_revenues']
clinical = ['est_#_of_er_visits']
snr = [x for x in df_pp.columns if (x.startswith('anti')) | (x.startswith('benzo')) | (x.startswith('payor'))]
non_media_cat = ['formulary_type', 'segmentation', 'flag']
non_media_num = snr + financials + clinical
cols = ['definitive_id', 'date', 'shipped_quantity', 'confirmed_date'] + media + non_media_cat + binary + non_media_num
df_pp = df_pp[cols]
# filter to only ordering accounts
ordering_acc = df_pp.groupby('definitive_id')['shipped_quantity'].sum().reset_index()
ordering_acc = ordering_acc[ordering_acc['shipped_quantity']>0]['definitive_id'].tolist()
df_pp = df_pp[df_pp['definitive_id'].isin(ordering_acc)]
# replace nulls for media cols and shipped quantity with 0
for i in media + ['shipped_quantity'] + binary:
    df_pp[i] = df_pp[i].fillna(0)
# replace nulls for non media categorical columns with unknown
for i in non_media_cat:
    df_pp[i] = df_pp[i].fillna('Unknown')
# convert payor mix cols to float
for i in non_media_num:
    if i.startswith('payor'):
        df_pp[i] = df_pp[i].str.replace('%','').astype(float)
df_pp

Unnamed: 0,definitive_id,date,shipped_quantity,confirmed_date,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_tot,call_inperson,...,payor_mix_medicare_days,payor_mix_medicaid_days,payor_mix_private/self-pay/other_days,net_patient_revenue,net_income,operating_income,cash_on_hand,total_overhead_costs,total_revenues,est_#_of_er_visits
1764,1047935,2022-05,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1765,1047935,2022-06,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1766,1047935,2022-07,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1767,1047935,2022-08,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1768,1047935,2022-09,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,795,2023-02,0.0,,0.0,741.0,178.0,0.0,1.0,0.0,...,15.0,8.1,76.9,697048507.0,90736430.0,-186235197.0,106799158.0,446151573.0,932461300.0,96786.0
69996,795,2023-03,0.0,,0.0,947.0,125.0,0.0,4.0,4.0,...,15.0,8.1,76.9,697048507.0,90736430.0,-186235197.0,106799158.0,446151573.0,932461300.0,96786.0
69997,795,2023-04,0.0,,0.0,1362.0,0.0,0.0,1.0,0.0,...,15.0,8.1,76.9,697048507.0,90736430.0,-186235197.0,106799158.0,446151573.0,932461300.0,96786.0
69998,795,2023-05,0.0,,0.0,0.0,0.0,0.0,3.0,2.0,...,15.0,8.1,76.9,697048507.0,90736430.0,-186235197.0,106799158.0,446151573.0,932461300.0,96786.0


In [184]:
# imputation/NA statistics
for i in non_media_num:
    print(f"{i}:{len(df_pp[df_pp[i].isna()])}")

antipsychotic_im_iv_pack_units:112
antipsychotic_im_iv_volume_units:112
antipsychotic_im_iv_wac_dollars:112
antipsychotic_oral_pack_units:112
antipsychotic_oral_volume_units:112
antipsychotic_oral_wac_dollars:112
benzodiazepine_im_iv_pack_units:112
benzodiazepine_im_iv_volume_units:112
benzodiazepine_im_iv_wac_dollars:112
benzodiazepine_oral_pack_units:112
benzodiazepine_oral_volume_units:112
benzodiazepine_oral_wac_dollars:112
payor_mix_medicare_days:126
payor_mix_medicaid_days:224
payor_mix_private/self-pay/other_days:126
net_patient_revenue:168
net_income:126
operating_income:168
cash_on_hand:210
total_overhead_costs:126
total_revenues:196
est_#_of_er_visits:280


In [38]:
# one-hot encode non media categorical columns
preprocessing = mmm_preprocessing.MMMPreprocessing()
df_pp = preprocessing.one_hot(df_pp, non_media_cat)
df_pp

Unnamed: 0,definitive_id,date,shipped_quantity,confirmed_date,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_tot,call_inperson,...,formulary_type_On Restricted,formulary_type_Unknown,segmentation_A,segmentation_B,segmentation_C,segmentation_D,segmentation_Unknown,flag_Unknown,flag_Wave I,flag_Wave II
1764,1047935,2022-05,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
1765,1047935,2022-06,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
1766,1047935,2022-07,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
1767,1047935,2022-08,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
1768,1047935,2022-09,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,795,2023-02,0.0,,0.0,741.0,178.0,0.0,1.0,0.0,...,0,1,1,0,0,0,0,0,0,1
69996,795,2023-03,0.0,,0.0,947.0,125.0,0.0,4.0,4.0,...,0,1,1,0,0,0,0,0,0,1
69997,795,2023-04,0.0,,0.0,1362.0,0.0,0.0,1.0,0.0,...,0,1,1,0,0,0,0,0,0,1
69998,795,2023-05,0.0,,0.0,0.0,0.0,0.0,3.0,2.0,...,0,1,1,0,0,0,0,0,0,1


In [42]:
df_pp.to_csv('bioxcel_pr_time_series_for_modeling.csv', index=False)

# Transformations

In [5]:
transform = mmm_transformations.MMMTransformations()

In [61]:
df_t = transform.lag_dv(df_pp, 'shipped_quantity', 3, 'definitive_id')
df_t = transform.lag_dv(df_t, 'speaker_npi_ct', 3, 'definitive_id')
df_t = transform.lag_dv(df_t, 'pp_imp_tot', 3, 'definitive_id')
df_t = transform.lag_dv(df_t, 'rxnt_imp', 3, 'definitive_id')
df_t = transform.lag_dv(df_t, 'webmd_imp_tot', 3, 'definitive_id')
#df_t = transform.lag_dv(df_t, 'call_tot', 3, 'definitive_id')
#df_t = transform.lag_dv(df_t, 'call_inperson', 3, 'definitive_id')
df_t = transform.lag_dv(df_t, 'call_inperson_gt1_repl', 3, 'definitive_id')
#df_t = transform.lag_dv(df_t, 'call_phone_gt1_repl', 3, 'definitive_id')
#df_t = transform.lag_dv(df_t, 'call_video_gt1_repl', 3, 'definitive_id')
df_t

Unnamed: 0,definitive_id,date,shipped_quantity,confirmed_date,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_tot,call_inperson,...,pp_imp_tot_lag3,rxnt_imp_lag1,rxnt_imp_lag2,rxnt_imp_lag3,webmd_imp_tot_lag1,webmd_imp_tot_lag2,webmd_imp_tot_lag3,call_inperson_gt1_repl_lag1,call_inperson_gt1_repl_lag2,call_inperson_gt1_repl_lag3
1764,1047935,2022-05,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1765,1047935,2022-06,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1766,1047935,2022-07,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1767,1047935,2022-08,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1768,1047935,2022-09,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,795,2023-02,0.0,,0.0,741.0,178.0,0.0,1.0,0.0,...,0.0,174.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
69996,795,2023-03,0.0,,0.0,947.0,125.0,0.0,4.0,4.0,...,0.0,178.0,174.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
69997,795,2023-04,0.0,,0.0,1362.0,0.0,0.0,1.0,0.0,...,0.0,125.0,178.0,174.0,0.0,0.0,0.0,3.0,0.0,1.0
69998,795,2023-05,0.0,,0.0,0.0,0.0,0.0,3.0,2.0,...,741.0,0.0,125.0,178.0,0.0,0.0,0.0,0.0,3.0,0.0


In [294]:
df_t.describe()

Unnamed: 0,definitive_id,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_tot,2023_targets,antipsychotic_im_iv_pack_units,antipsychotic_im_iv_volume_units,...,pp_imp_tot_lag3,rxnt_imp_lag1,rxnt_imp_lag2,rxnt_imp_lag3,webmd_imp_tot_lag1,webmd_imp_tot_lag2,webmd_imp_tot_lag3,call_tot_lag1,call_tot_lag2,call_tot_lag3
count,966.0,966.0,966.0,966.0,966.0,966.0,966.0,966.0,854.0,854.0,...,966.0,966.0,966.0,966.0,966.0,966.0,966.0,966.0,966.0,966.0
mean,86419.64,0.318841,0.011387,107.719462,30.993789,0.162526,2.168737,0.637681,2583.672131,7773.72459,...,72.069358,30.993789,30.993789,30.993789,0.162526,0.162526,0.0,2.165631,1.763975,1.510352
std,208332.8,1.044004,0.139855,365.072364,123.46829,1.684271,5.040303,0.480919,2359.707656,13365.945612,...,299.57091,123.46829,123.46829,123.46829,1.684271,1.684271,0.0,5.041332,4.395026,4.123909
min,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1725.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,884.0,1088.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3784.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1937.0,3185.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5567.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,3555.0,7159.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0
max,1047935.0,10.0,3.0,2801.0,1469.0,28.0,45.0,1.0,10386.0,79103.0,...,2572.0,1469.0,1469.0,1469.0,28.0,28.0,0.0,45.0,45.0,45.0


In [308]:
df_t[['definitive_id', 'date'] + [x for x in df_t.columns if 'call' in x]].head(50)

Unnamed: 0,definitive_id,date,call_tot,call_tot_lag1,call_tot_lag2,call_tot_lag3
1764,1047935,2022-05,0.0,0.0,0.0,0.0
1765,1047935,2022-06,0.0,0.0,0.0,0.0
1766,1047935,2022-07,0.0,0.0,0.0,0.0
1767,1047935,2022-08,0.0,0.0,0.0,0.0
1768,1047935,2022-09,0.0,0.0,0.0,0.0
1769,1047935,2022-10,0.0,0.0,0.0,0.0
1770,1047935,2022-11,0.0,0.0,0.0,0.0
1771,1047935,2022-12,0.0,0.0,0.0,0.0
1772,1047935,2023-01,0.0,0.0,0.0,0.0
1773,1047935,2023-02,0.0,0.0,0.0,0.0


In [333]:
df_t.groupby('date')['call_tot'].sum().reset_index()

Unnamed: 0,date,call_tot
0,2022-05,9.0
1,2022-06,45.0
2,2022-07,95.0
3,2022-08,123.0
4,2022-09,91.0
5,2022-10,136.0
6,2022-11,120.0
7,2022-12,126.0
8,2023-01,138.0
9,2023-02,238.0


# EDA

In [40]:
# distribution of promotion activity before first shipment
df_pre_order = pd.DataFrame()
for i in np.unique(df_t['definitive_id']):
    df_filt = df_t[df_t['definitive_id']==i].reset_index()
    df_filt = df_filt.iloc[:df_filt['shipped_quantity'].values.searchsorted('1', side='right')]
    df_pre_order = pd.concat([df_pre_order, df_filt])

df_stats = df_pre_order.groupby('definitive_id').agg({'date':'count', 'speaker_npi_ct': 'mean', 'pp_imp_tot': 'mean',
                                                      'rxnt_imp': 'mean', 'webmd_imp_tot': 'mean', 'call_tot': 'mean',
                                                      'call_inperson': 'mean', 'call_inperson_gt1_repl': 'mean',
                                                      'call_phone_gt1_repl': 'mean', 'call_video_gt1_repl': 'mean'}).reset_index()

df_stats.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
definitive_id,69.0,86419.637681,209750.453756,40.0,1725.0,3784.0,5567.0,1047935.0
date,69.0,12.231884,2.880691,2.0,11.0,14.0,14.0,14.0
speaker_npi_ct,69.0,0.006211,0.029346,0.0,0.0,0.0,0.0,0.2142857
pp_imp_tot,69.0,92.511462,134.510072,0.0,0.142857,17.545455,149.928571,546.7143
rxnt_imp,69.0,29.075977,48.128026,0.0,0.0,0.0,45.071429,240.9286
webmd_imp_tot,69.0,0.102484,0.339143,0.0,0.0,0.0,0.0,2.0
call_tot,69.0,1.945513,3.024166,0.0,0.071429,0.857143,2.857143,21.35714
call_inperson,69.0,1.395995,2.634004,0.0,0.0,0.571429,2.0,20.0
call_inperson_gt1_repl,69.0,0.500592,0.63289,0.0,0.0,0.357143,0.7,3.5
call_phone_gt1_repl,69.0,0.435431,0.742407,0.0,0.0,0.142857,0.555556,3.642857


In [56]:
# distribution of promotion activity before on formulary
df_pre_form = df_t[(~df_t['confirmed_date'].isna()) & (df_t['formulary_type_On Formulary'] == 1)]
df_pre_form['confirmed_date'] = pd.to_datetime(df_pre_form['confirmed_date'])
df_pre_form['confirmed_month'] = df_pre_form['confirmed_date'].dt.to_period('M')
df_pre_form = df_pre_form[df_pre_form['date'] < df_pre_form['confirmed_month']]
df_stats = df_pre_form.groupby('definitive_id').agg({'date':'count', 'speaker_npi_ct': 'mean', 'pp_imp_tot': 'mean',
                                                     'rxnt_imp': 'mean', 'webmd_imp_tot': 'mean', 'call_tot': 'mean',
                                                     'call_inperson': 'mean', 'call_inperson_gt1_repl': 'mean',
                                                     'call_phone_gt1_repl': 'mean', 'call_video_gt1_repl': 'mean'}).reset_index()

df_stats.describe().T

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
definitive_id,21.0,28176.380952,117734.475489,40.0,1673.0,2764.0,3594.0,541975.0
date,21.0,8.238095,2.300104,2.0,7.0,9.0,10.0,11.0
speaker_npi_ct,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pp_imp_tot,21.0,24.096104,62.781847,0.0,0.0,0.0,10.363636,270.545455
rxnt_imp,21.0,18.582395,27.985281,0.0,0.0,0.0,48.111111,74.8
webmd_imp_tot,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
call_tot,21.0,1.600895,4.420357,0.0,0.0,0.285714,1.0,20.125
call_inperson,21.0,1.473959,4.313444,0.0,0.0,0.090909,0.636364,19.625
call_inperson_gt1_repl,21.0,0.483158,0.878195,0.0,0.0,0.090909,0.545455,3.5
call_phone_gt1_repl,21.0,0.116655,0.199577,0.0,0.0,0.0,0.111111,0.636364


Unnamed: 0,definitive_id,account_name
0,1,
1,2,
2,3,
3,4,
4,5,
...,...,...
5403,1053005,
5404,1053818,
5405,1053881,
5406,1054648,


In [108]:
# under resourced accounts
cols_to_keep = ['definitive_id', 'call_inperson_gt1_repl', 'shipped_quantity'] + [x for x in df_t.columns if x.startswith('anti')]
under_acc = df_t[cols_to_keep].groupby('definitive_id').mean().reset_index()
acc_names = df[['definitive_id', 'hospital_name']].groupby('definitive_id').first().reset_index()
under_acc = under_acc.merge(acc_names, on='definitive_id', how='left')
under_acc

Unnamed: 0,definitive_id,call_inperson_gt1_repl,shipped_quantity,antipsychotic_im_iv_pack_units,antipsychotic_im_iv_volume_units,antipsychotic_im_iv_wac_dollars,antipsychotic_oral_pack_units,antipsychotic_oral_volume_units,antipsychotic_oral_wac_dollars,hospital_name
0,40,0.428571,0.142857,884.0,1376.0,54046.4400,136.0,9054.0,27811.4497,North Alabama Shoals Hospital (FKA Shoals Hosp...
1,160,0.714286,0.714286,5296.0,5371.0,40153.2700,1007.0,70910.0,256066.0561,St Josephs Hospital and Medical Center
2,402,0.000000,0.142857,710.0,887.0,9259.3602,99.0,7860.0,4007.4918,PIH Health Hospital - Downey (FKA Downey Regio...
3,406,0.000000,0.071429,2368.0,2368.0,9156.6800,162.0,13900.0,6433.6712,PIH Health Good Samaritan Hospital
4,430,0.357143,0.071429,6503.0,8662.0,81394.9335,1390.0,100340.0,110550.8880,Cedars-Sinai Medical Center
...,...,...,...,...,...,...,...,...,...,...
64,551527,0.000000,0.642857,,,,,,,Tripler Army Medical Center
65,553435,0.357143,0.071429,2256.0,3481.0,45328.8500,545.0,27098.0,147795.1727,Bryan West Campus
66,581686,0.000000,0.285714,,,,,,,Barlow Respiratory Hospital - PIH Health Hospi...
67,585559,0.142857,0.071429,4192.0,4368.0,71291.2000,290.0,20320.0,46793.2879,Dell Seton Medical Center at the University of...


In [107]:
under_acc.to_csv('under_acc.csv', index=False)

In [97]:
for i in cols_to_keep:
    print(f"{i}: {np.mean(df_t[i])}")

definitive_id: 86419.63768115942
call_inperson_gt1_repl: 0.546583850931677
shipped_quantity: 0.3188405797101449
antipsychotic_im_iv_pack_units: 2583.6721311475408
antipsychotic_im_iv_volume_units: 7773.7245901639335
antipsychotic_im_iv_wac_dollars: 124938.01070000001
antipsychotic_oral_pack_units: 1015.327868852459
antipsychotic_oral_volume_units: 71886.73770491804
antipsychotic_oral_wac_dollars: 220726.95228196727


In [120]:
# non ordering accounts with high antipsych volume and low call volume (p3)
df_definitive_all = pd.read_csv('bioxcel_pr_definitive_month_level_all.csv')
acc_names_all = df_definitive_all[['definitive_id', 'hospital_name']].groupby('definitive_id').first().reset_index() 
keep = ['definitive_id', 'shipped_quantity', 'call_inperson_gt1_repl'] + [x for x in df_definitive_all.columns if x.startswith('anti')]
df_definitive_all = df_definitive_all[keep]
for i in ['shipped_quantity', 'call_inperson_gt1_repl']:
    df_definitive_all[i] = df_definitive_all[i].replace(np.nan, 0)
p3 = df_definitive_all[keep].groupby('definitive_id').mean().reset_index()
p3 = p3.merge(acc_names_all, on='definitive_id', how='left')
# 254 - all accounts with non null antipsych volume, non ordering, < 0.5 monthly in person call volume, above avg antipsych volume
p3 = p3[(p3['shipped_quantity'] == 0) & (p3['call_inperson_gt1_repl'] < 0.5) & (p3['antipsychotic_im_iv_volume_units'] > 7774)]
p3


Columns (27,58,59,68) have mixed types.Specify dtype option on import or set low_memory=False.



Unnamed: 0,definitive_id,shipped_quantity,call_inperson_gt1_repl,antipsychotic_im_iv_pack_units,antipsychotic_im_iv_volume_units,antipsychotic_im_iv_wac_dollars,antipsychotic_oral_pack_units,antipsychotic_oral_volume_units,antipsychotic_oral_wac_dollars,hospital_name
73,75,0.0,0.071429,7398.0,15266.0,104711.3368,658.0,43398.0,5.581395e+04,Brookwood Baptist Medical Center (FKA Brookwoo...
79,81,0.0,0.071429,8401.0,8755.0,213589.5598,1089.0,75382.0,2.165682e+05,Huntsville Hospital
119,121,0.0,0.000000,79.0,9421.0,4467.2705,513.0,26040.0,7.250060e+04,Providence Alaska Medical Center
313,317,0.0,0.000000,6386.0,10305.0,169443.6802,1368.0,111868.0,2.001180e+05,Contra Costa Regional Medical Center
321,325,0.0,0.428571,9713.0,10385.0,166969.6795,858.0,68510.0,2.125267e+05,Community Regional Medical Center
...,...,...,...,...,...,...,...,...,...,...
8340,842968,0.0,0.000000,4674.0,17272.0,74668.1414,4564.0,424920.0,6.115881e+05,Saint Peter Regional Treatment Center
8386,843026,0.0,0.285714,8403.0,11183.8,156422.3733,1166.0,77224.0,1.357233e+05,Stony Brook Childrens (AKA Stony Brook Childre...
8629,948298,0.0,0.071429,632.0,11556.0,19369.8660,696.0,39442.0,7.401191e+04,St Louis Forensic Treatment Center - North (FK...
9008,1012069,0.0,0.071429,3959.0,10150.9,394360.3703,6446.0,462968.0,2.291737e+06,Oregon State Fair and Exposition Center Field ...


In [121]:
p3.to_csv('p3_group.csv', index=False)

In [133]:
# distribution of promotion activity before on formulary
df_pre_form = pd.read_csv('bioxcel_pr_definitive_month_level_all.csv')
df_pre_form = df_pre_form[(~df_pre_form['confirmed_date'].isna()) & (df_pre_form['formulary_type'] == 'On Formulary')]
df_pre_form['confirmed_date'] = pd.to_datetime(df_pre_form['confirmed_date'])
df_pre_form['confirmed_month'] = df_pre_form['confirmed_date'].dt.to_period('M')
keep = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot', 'call_tot', 'call_inperson', 'call_inperson_gt1_repl', 'call_phone_gt1_repl', 'call_video_gt1_repl']
for i in keep:
    df_pre_form[i] = df_pre_form[i].replace(np.nan, 0)
df_pre_form = df_pre_form[df_pre_form['date'] < df_pre_form['confirmed_month']]
df_stats = df_pre_form.groupby('definitive_id').agg({'date':'count', 'speaker_npi_ct': 'mean', 'pp_imp_tot': 'mean',
                                                     'rxnt_imp': 'mean', 'webmd_imp_tot': 'mean', 'call_tot': 'mean',
                                                     'call_inperson': 'mean', 'call_inperson_gt1_repl': 'mean',
                                                     'call_phone_gt1_repl': 'mean', 'call_video_gt1_repl': 'mean'}).reset_index()

df_stats.describe().T


Columns (27,58,59,68) have mixed types.Specify dtype option on import or set low_memory=False.



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
definitive_id,61.0,61581.540984,176824.84149,40.0,1704.0,2660.0,4459.0,837762.0
date,61.0,8.278689,2.082716,2.0,7.0,8.0,10.0,12.0
speaker_npi_ct,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pp_imp_tot,61.0,14.714357,43.44745,0.0,0.0,0.0,0.083333,270.545455
rxnt_imp,61.0,8.507948,19.576286,0.0,0.0,0.0,0.0,74.8
webmd_imp_tot,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
call_tot,61.0,0.85559,2.741342,0.0,0.0,0.0,0.555556,20.125
call_inperson,61.0,0.695953,2.611214,0.0,0.0,0.0,0.333333,19.625
call_inperson_gt1_repl,61.0,0.286278,0.602434,0.0,0.0,0.0,0.285714,3.5
call_phone_gt1_repl,61.0,0.129868,0.288957,0.0,0.0,0.0,0.125,1.583333


In [135]:
len(np.unique(df_pre_form['definitive_id']))

61

In [331]:
# scatter plot of predictions
#plot_df = df_t.groupby('date')['shipped_quantity'].sum().reset_index()
plot_df = df_t[['definitive_id', 'date', 'shipped_quantity']]
fig = px.line(plot_df, x="date", y='shipped_quantity', color='definitive_id')
fig.show()

In [332]:
fig = px.histogram(df_t, x="shipped_quantity", nbins=20)
fig.show()

In [239]:
df_t[['shipped_quantity'] + channels + non_media_num].corr().reset_index().sort_values(['shipped_quantity'], ascending=False)

Unnamed: 0,index,shipped_quantity,speaker_npi_ct,pp_imp_tot,rxnt_imp,webmd_imp_tot,call_tot,antipsychotic_im_iv_pack_units,antipsychotic_im_iv_volume_units,antipsychotic_im_iv_wac_dollars,...,payor_mix_medicare_days,payor_mix_medicaid_days,payor_mix_private/self-pay/other_days,net_patient_revenue,net_income,operating_income,cash_on_hand,total_overhead_costs,total_revenues,est_#_of_er_visits
0,shipped_quantity,1.0,0.067373,0.079278,0.067641,0.027075,0.16464,0.064105,0.099907,0.008039,...,-0.037779,0.011113,0.025039,-0.047746,-0.022811,-0.009845,-0.070188,-0.043307,-0.042889,-0.008952
5,call_tot,0.16464,0.085476,0.202189,0.196961,0.02057,1.0,0.037729,-0.05119,-0.016645,...,0.004681,-0.079008,0.020511,-0.01654,-0.045181,-0.031287,-0.000379,-0.014055,-0.023745,0.073048
7,antipsychotic_im_iv_volume_units,0.099907,0.038544,-0.042468,-0.028601,-0.016077,-0.05119,0.473028,1.0,0.364123,...,-0.532891,-0.136608,0.550052,0.05012,-0.317243,-0.181216,-0.08808,0.06537,0.187029,0.468282
2,pp_imp_tot,0.079278,0.069801,1.0,0.525899,0.27847,0.202189,0.148967,-0.042468,-0.028941,...,-0.007389,-0.038293,0.010385,0.14726,-0.008497,-0.081706,0.159521,0.242776,0.16443,0.227455
3,rxnt_imp,0.067641,0.038172,0.525899,1.0,-0.024248,0.196961,0.126532,-0.028601,-0.001173,...,-0.023065,-0.009067,0.011765,0.080788,-0.018797,-0.109218,0.097235,0.198553,0.115077,0.105922
1,speaker_npi_ct,0.067373,1.0,0.069801,0.038172,0.115315,0.085476,-0.00664,0.038544,0.064144,...,-0.058599,0.014215,0.051362,-0.01578,-0.01087,-0.015005,-0.010255,-0.004214,-0.015016,0.064454
6,antipsychotic_im_iv_pack_units,0.064105,-0.00664,0.148967,0.126532,0.019847,0.037729,1.0,0.473028,0.417455,...,-0.380675,0.020973,0.335051,0.547844,-0.215594,-0.305577,0.299512,0.438001,0.592718,0.625068
9,antipsychotic_oral_pack_units,0.033758,0.092893,-0.031011,-0.056254,-0.02529,-0.059943,0.490771,0.717821,0.524831,...,-0.530611,-0.024338,0.515986,0.109888,-0.459626,-0.253032,-0.007957,0.055111,0.236349,0.658455
10,antipsychotic_oral_volume_units,0.027744,0.078103,-0.035185,-0.051317,-0.021411,-0.061753,0.496093,0.669956,0.491971,...,-0.493296,-0.024688,0.491195,0.108635,-0.524371,-0.233486,0.008322,0.041531,0.20968,0.596291
4,webmd_imp_tot,0.027075,0.115315,0.27847,-0.024248,1.0,0.02057,0.019847,-0.016077,-0.03267,...,-0.006536,0.052249,-0.029897,0.036547,-0.009374,-0.025954,0.024061,0.085722,0.03887,0.031759


In [344]:
cat_cols = binary + [x for x in df_t.columns if (x.startswith('formulary_type_')) | (x.startswith('segmentation_')) | (x.startswith('flag_'))]
means, features, cts = [], [], []
for i in cat_cols:
    df_filt = df_t[df_t[i] == 1]
    means.append(np.mean(df_filt['shipped_quantity']))
    cts.append(len(df_filt))
    features.append(i)
out = pd.DataFrame({'feature': features, 'mean': means, 'n': cts}).sort_values(['mean'], ascending=False)
out

Unnamed: 0,feature,mean,n
12,flag_Wave I,0.5,266
7,segmentation_B,0.461538,182
3,formulary_type_On Formulary,0.414966,294
1,formulary_type_Accessible/non-formulary,0.404762,42
4,formulary_type_On Restricted,0.383929,112
0,2023_targets,0.36526,616
10,segmentation_Unknown,0.332512,406
8,segmentation_C,0.328571,140
6,segmentation_A,0.297619,84
13,flag_Wave II,0.285714,252


In [345]:
np.mean(out['mean'])

0.32876318241048186

In [342]:
cat_cols = binary + [x for x in df_t.columns if (x.startswith('formulary_type_')) | (x.startswith('segmentation_')) | (x.startswith('flag_'))]
means, features, cts = [], [], []
for i in cat_cols:
    df_filt = df_t[df_t[i] == 1]
    means.append(np.mean(df_filt.groupby('date')['shipped_quantity'].sum().reset_index()['shipped_quantity']))
    cts.append(len(np.unique(df_filt['definitive_id'])))
    features.append(i)
out = pd.DataFrame({'feature': features, 'mean': means, 'n': cts}).sort_values(['mean'], ascending=False)
out['mean_per_acc'] = out['mean']/out['n']
out

Unnamed: 0,feature,mean,n,mean_per_acc
0,2023_targets,16.071429,44,0.36526
10,segmentation_Unknown,9.642857,29,0.332512
12,flag_Wave I,9.5,19,0.5
3,formulary_type_On Formulary,8.714286,21,0.414966
11,flag_Unknown,7.357143,32,0.229911
5,formulary_type_Unknown,6.428571,26,0.247253
7,segmentation_B,6.0,13,0.461538
13,flag_Wave II,5.142857,18,0.285714
8,segmentation_C,3.285714,10,0.328571
4,formulary_type_On Restricted,3.071429,8,0.383929


In [343]:
np.mean(out['mean'])

5.862244897959185

In [365]:
#df_t[['definitive_id', 'date', 'call_inperson', 'shipped_quantity']].to_csv('out.csv', index=False)
out = df_t[['definitive_id', 'call_inperson', 'shipped_quantity']].groupby('definitive_id').mean().reset_index()
out.to_csv('out.csv', index=False)

In [367]:
out = df_t[['date', 'call_inperson', 'shipped_quantity']].groupby('date').sum().reset_index()
#out
out.to_csv('out.csv', index=False)

# Final Model Fitting

In [110]:
# fillna for non media numeric columns with mean for modeling
df_m = df_t.copy()
for i in non_media_num:
    df_m[i] = df_m[i].fillna(np.mean(df_m[i]))

In [111]:
df_m.to_csv('bioxcel_data_for_modeling.csv', index=False)

In [63]:
modeling = mmm_modeling.MMMModeling()

In [64]:
# modeling
channels = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot', 'call_inperson_gt1_repl']
other = non_media_num + binary + [x for x in df_m.columns if (x.startswith('formulary_type_')) | (x.startswith('segmentation_')) | (x.startswith('flag_'))]
lag_dv = [x for x in df_m.columns if 'shipped_quantity_lag' in x]
lag_channels = [x for x in df_m.columns if ('_lag' in x) & ('shipped_quantity_lag' not in x)]
X = df_m[channels + lag_dv + lag_channels + other]
y = df_m['shipped_quantity']
model = modeling.rf_regressor(df_m, X.columns.tolist(), 'shipped_quantity', 'date')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['preds_train'] = model_test.predict(train_df[x_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['preds_test'] = model_test.predict(test_df[x_col])


In [65]:
# performance
model['performance']

{'full': {'r2': 0.8480523792896066,
  'rmse': 0.4067470142200442,
  'mape': 0.30651586592112035},
 'train': {'r2': 0.8399521257505953,
  'rmse': 0.39774443671906107,
  'mape': 0.36675408102849516},
 'test': {'r2': -0.4326903008932783,
  'rmse': 1.420934131284887,
  'mape': 0.6670568448087022}}

In [66]:
# importance
model['importance']

Unnamed: 0,feature,importance,std
4,call_inperson_gt1_repl,0.121413,0.067528
11,pp_imp_tot_lag1,0.087053,0.052401
5,shipped_quantity_lag1,0.082209,0.036235
2,rxnt_imp,0.074406,0.049972
1,pp_imp_tot,0.063869,0.051761
36,payor_mix_medicaid_days,0.039019,0.040879
6,shipped_quantity_lag2,0.03592,0.036633
41,cash_on_hand,0.033311,0.036067
24,antipsychotic_im_iv_volume_units,0.031248,0.027657
20,call_inperson_gt1_repl_lag1,0.028499,0.019138


In [205]:
# scatter plot of predictions
plot_df = pd.DataFrame({'date': model['df_preds_test']['date'], 'preds': model['df_preds_test']['preds_test'],
                        'actual': model['df_preds_test']['shipped_quantity']})
fig = px.scatter(plot_df, x="date", y=plot_df.columns.tolist(), title="Future Hold Out Set")
fig.show()

In [206]:
# scatter plot of predictions
plot_df = pd.DataFrame({'date': model['df_preds_full']['date'], 'preds': model['df_preds_full']['preds_full'],
                        'actual': model['df_preds_full']['shipped_quantity']})
fig = px.scatter(plot_df, x="date", y=plot_df.columns.tolist(), title="Full Data - Model trained on full data")
fig.show()

# Response Curves - Overall

In [67]:
response_curves = mmm_response_curves.MMMResponseCurves()

In [68]:
channels = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot', 'call_inperson_gt1_repl']
#channels = ['call_tot']
other = non_media_num + binary + [x for x in df_m.columns if (x.startswith('formulary_type_')) | (x.startswith('segmentation_')) | (x.startswith('flag_'))]
lag_dv = [x for x in df_m.columns if 'shipped_quantity_lag' in x]
#lag_channels = [x for x in df_m.columns if ('call_tot_lag' in x) & ('shipped_quantity_lag' not in x)]
lag_channels = [x for x in df_m.columns if ('_lag' in x) & ('shipped_quantity_lag' not in x)]
X = df_m[channels + lag_dv + lag_channels + other]

In [407]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
speaker_npi_ct,966.0,0.011387,0.139855,0.0,0.0,0.0,0.0,3.0
pp_imp_tot,966.0,107.719462,365.072364,0.0,0.0,0.0,0.0,2801.0
rxnt_imp,966.0,30.993789,123.468290,0.0,0.0,0.0,0.0,1469.0
webmd_imp_tot,966.0,0.162526,1.684271,0.0,0.0,0.0,0.0,28.0
call_inperson_gt1_repl,966.0,0.546584,1.091372,0.0,0.0,0.0,1.0,7.0
...,...,...,...,...,...,...,...,...
segmentation_D,966.0,0.159420,0.366257,0.0,0.0,0.0,0.0,1.0
segmentation_Unknown,966.0,0.420290,0.493861,0.0,0.0,0.0,1.0,1.0
flag_Unknown,966.0,0.463768,0.498944,0.0,0.0,0.0,1.0,1.0
flag_Wave I,966.0,0.275362,0.446928,0.0,0.0,0.0,1.0,1.0


In [71]:
# overall response curves
#channel1 = response_curves.responses(model['full_model'], X, 'speaker_npi_ct', 10, 1)
#channel2 = response_curves.responses(model['full_model'], X, 'pp_imp_tot', 5000, 20)
#channel3 = response_curves.responses(model['full_model'], X, 'rxnt_imp', 3000, 20)
#channel4 = response_curves.responses(model['full_model'], X, 'webmd_imp_tot', 100, 1)
channel5 = response_curves.responses(model['full_model'], X, 'call_inperson_gt1_repl', 20, 0.2)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

In [210]:
response_curves.plot(channel1['resp_df'], 'touches', ['speaker_npi_ct', 'speaker_npi_ct_hill_estimate'])

In [211]:
response_curves.plot(channel2['resp_df'], 'touches', ['pp_imp_tot', 'pp_imp_tot_hill_estimate'])

In [212]:
response_curves.plot(channel3['resp_df'], 'touches', ['rxnt_imp', 'rxnt_imp_hill_estimate'])

In [213]:
response_curves.plot(channel4['resp_df'], 'touches', ['webmd_imp_tot', 'webmd_imp_tot_hill_estimate'])

In [72]:
response_curves.plot(channel5['resp_df'], 'touches', ['call_inperson_gt1_repl', 'call_inperson_gt1_repl_hill_estimate'])

In [444]:
channel5['resp_df'].to_csv('out.csv', index=False)

In [None]:
#overall_resp = pd.concat([channel1['resp_df'],
#                          channel2['resp_df'].drop(['touches'], axis=1),
#                          channel3['resp_df'].drop(['touches'], axis=1),
#                          channel4['resp_df'].drop(['touches'], axis=1),
#                          channel5['resp_df'].drop(['touches'], axis=1),
#                          channel6['resp_df'].drop(['touches'], axis=1),
#                          channel7['resp_df'].drop(['touches'], axis=1),
#                          channel8['resp_df'].drop(['touches'], axis=1),
#                          channel9['resp_df'].drop(['touches'], axis=1)], axis=1)
#fig = response_curves.plot(overall_resp, 'touches', channels + ['competitor_sales_B'])
#fig.update_layout(xaxis_title='Weekly Spend', yaxis_title='Revenue Impact')
#fig

In [168]:
# overall response curves for top non-media
#nonmedia1 = response_curves.responses(model['full_model'], X, 'payor_mix_medicaid_days', 1, 0.01)
nonmedia2 = response_curves.responses(model['full_model'], X, 'net_patient_revenue', 1000000000, 2000000)
nonmedia3 = response_curves.responses(model['full_model'], X, 'antipsychotic_im_iv_volume_units', 20000, 200)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

Error - curve_fit failed



divide by zero encountered in true_divide


invalid value encountered in true_divide



In [170]:
response_curves.plot(nonmedia2['resp_df'], 'touches', ['net_patient_revenue', 'net_patient_revenue_hill_estimate'])

In [171]:
response_curves.plot(nonmedia3['resp_df'], 'touches', ['antipsychotic_im_iv_volume_units', 'antipsychotic_im_iv_volume_units_hill_estimate'])

# Response Curves - Segments

In [153]:
segment = 'segmentation'
channels = ['speaker_npi_ct', 'pp_imp_tot', 'rxnt_imp', 'webmd_imp_tot', 'call_tot']
other = non_media_num + binary + [x for x in df_t.columns if (x.startswith('formulary_type_')) | (x.startswith('segmentation_')) | (x.startswith('flag_'))]
lag_dv = [x for x in df_t.columns if 'shipped_quantity_lag' in x]
lag_channels = [x for x in df_t.columns if ('_lag' in x) & ('shipped_quantity_lag' not in x)]
segments = [x for x in df_t.columns if x.startswith(f"{segment}_")]
X = df_t[channels + lag_dv + lag_channels + other + [segment]]
X[segments] = 0

In [154]:
channel1_segment = response_curves.responses_segment(model['full_model'], X, 'speaker_npi_ct', 10, 1, segment)


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power



In [155]:
channel1_segment['fig_hill']

In [156]:
channel2_segment = response_curves.responses_segment(model['full_model'], X, 'pp_imp_tot', 5000, 20, segment)


divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed



divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed



divide by zero encountered in true_divide


invalid value encountered in true_divide


divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed



divide by zero encountered in true_divide


invalid value encountered in true_divide


divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed



divide by zero encountered in true_divide


invalid value encountered in true_divide


divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed


In [157]:
channel2_segment['fig_hill']

In [158]:
channel3_segment = response_curves.responses_segment(model['full_model'], X, 'rxnt_imp', 3000, 20, segment)


divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed



divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed



divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed



divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed



divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed


In [159]:
channel3_segment['fig_hill']

In [160]:
channel4_segment = response_curves.responses_segment(model['full_model'], X, 'webmd_imp_tot', 100, 1, segment)


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power



In [161]:
channel4_segment['fig_hill']

In [162]:
channel5_segment = response_curves.responses_segment(model['full_model'], X, 'call_tot', 100, 1, segment)


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power



In [163]:
channel5_segment['fig_hill']

# Next Steps

In [172]:
# get remaining financial data
# speaker seems to have high impact but low importance for probably noisy curve
# call curve has high accuracy
# segment B responds best to calls
# check responses for other segments
# breakout calls into type
# consider segment level data sets
# fit ridge regression to get directionality
# try cross sectional data