In [1]:
import pandas as pd
import pyodbc
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from dateutil.rrule import rrule, MONTHLY
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.optimize import minimize
from scipy.optimize import curve_fit
import shap
import numdifftools as nd
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
shap.initjs()
%load_ext autoreload
%autoreload 2
import mmm_transformations
import mmm_preprocessing
import mmm_feature_selection
import mmm_modeling
import mmm_response_curves
import mmm_optimization

# Data Pull

In [2]:
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=PRDINS10DB2;'
                      'Port=1433;')

In [3]:
sql_stmt = """SELECT [Insmed_HCP_Id], [PHYSICIAN_NPI], [CallDate_ym], MAX([PrimarySpecialty]) as Specialty, \n
                     count(distinct (case when Product in ('a0058000005VpNDAA0', 'a0058000005UplCAAS') AND Detail_Priority=1.0 then Veeva_call_Id end)) as P1_Arikayce, \n
                     count(distinct (case when Product in ('a0058000005VpNDAA0', 'a0058000005UplCAAS') AND Detail_Priority=2.0 then Veeva_call_Id end)) as P2_Arikayce, \n
                     count(distinct [PatientMasterID] ) as distinct_patients, \n
                     count(distinct [RX_NUMBER]) as rx_count \n
              FROM(SELECT a.[PatientMasterID], a.[PHYSICIAN_NPI], a.[DATE_SHIPPED], \n
                          a.RX_NUMBER, b.[InsmedID], b.[PrimarySpecialty], b.[NpiID], c.[Veeva_Call_ID], c.[Insmed_HCP_Id], \n
                          c.[Detail_Priority], c.[Product], c.[Insmed_Call_Type], c.[CallDate], LEFT(c.[CallDate], 7) AS CallDate_ym \n
                   FROM [Insmed_Adhoc].[dbo].[tblConsolidated_Dispense] a \n
                   FULL OUTER JOIN [Insmed_Adhoc].[dbo].[tblDF_Shyft_dimHCP_Adhoc] b ON A.[PHYSICIAN_NPI] = b.NpiID \n
                   FULL OUTER JOIN [Insmed_Adhoc].[dbo].[tblDF_Shyft_Call_Adhoc] c ON b.[InsmedID]= c.[Insmed_HCP_Id] \n
                   WHERE YEAR(a.[DATE_SHIPPED]) =  YEAR(c.[CallDate]) AND MONTH(a.DATE_SHIPPED) = MONTH(c.[CallDate]) \n
                   and c.Insmed_Call_Type in ('HCP and Staff','HCP Only','In Person','Live – HCP and Staff',\n
                                              'Live – HCP Only','Phone Call','Phone Call w/HCP','Phone Call w/Staff',\n
                                              'Virtual – HCP Only','Virtual Interaction')
                   and a.DISPENSE_TYPE='Product' and a.DISPENSE_STATUS not in ('V','R')) as SUBQUERY
              GROUP BY [Insmed_HCP_Id], [PHYSICIAN_NPI], [CallDate_ym]
              ORDER BY [Insmed_HCP_Id], [CallDate_ym]"""

In [4]:
df = pd.read_sql(sql_stmt, conn)
df

Unnamed: 0,Insmed_HCP_Id,PHYSICIAN_NPI,CallDate_ym,Specialty,P1_Arikayce,P2_Arikayce,distinct_patients,rx_count
0,INSP20000000,1932322948,2018-12,PCC,1,0,2,2
1,INSP20000000,1932322948,2019-02,PCC,1,0,2,2
2,INSP20000000,1932322948,2019-06,PCC,1,0,2,2
3,INSP20000000,1932322948,2019-10,PCC,1,0,6,6
4,INSP20000000,1932322948,2020-06,PCC,1,0,3,3
...,...,...,...,...,...,...,...,...
17682,INSP20135484,1245770163,2023-04,GP,1,0,1,1
17683,INSP20136158,1972900215,2023-02,ID,0,0,1,1
17684,INSP20136505,1952854762,2023-03,ID,1,0,1,1
17685,INSP20136551,1326391913,2023-04,WH,1,0,1,1


In [144]:
df.to_csv('insmed_month_level.csv', index=False)

# Preprocessing

In [137]:
group_by = 'PHYSICIAN_NPI'
segment = 'Specialty'
date = 'CallDate_ym'
channels = ['P1_Arikayce', 'P2_Arikayce']
dv = 'rx_count'
df_pp = df[df['CallDate_ym']<='2023-01']
preprocess = mmm_preprocessing.MMMPreprocessing()
dict_pp = preprocess.aggregation(df_pp, group_by, date, dv, channels, [segment])
df_pp = dict_pp['hcp_time']
df_pp

Unnamed: 0,PHYSICIAN_NPI,CallDate_ym,rx_count,P1_Arikayce,P2_Arikayce,Specialty,Specialty_ADU,Specialty_CCM,Specialty_FM,Specialty_GP,Specialty_ID,Specialty_IM,Specialty_Other,Specialty_PCC,Specialty_PDP,Specialty_PUD
0,1003031691,2018-10,0.0,0.0,0.0,PUD,0,0,0,0,0,0,0,0,0,1
1,1003031691,2018-11,0.0,0.0,0.0,PUD,0,0,0,0,0,0,0,0,0,1
2,1003031691,2018-12,0.0,0.0,0.0,PUD,0,0,0,0,0,0,0,0,0,1
3,1003031691,2019-01,0.0,0.0,0.0,PUD,0,0,0,0,0,0,0,0,0,1
4,1003031691,2019-02,0.0,0.0,0.0,PUD,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129787,1992970958,2022-09,0.0,0.0,0.0,CCM,0,1,0,0,0,0,0,0,0,0
129788,1992970958,2022-10,8.0,2.0,0.0,CCM,0,1,0,0,0,0,0,0,0,0
129789,1992970958,2022-11,12.0,0.0,0.0,CCM,0,1,0,0,0,0,0,0,0,0
129790,1992970958,2022-12,5.0,3.0,0.0,CCM,0,1,0,0,0,0,0,0,0,0


In [138]:
dict_pp['segment_time_Specialty']

Unnamed: 0,Specialty,CallDate_ym,rx_count,P1_Arikayce,P2_Arikayce,Specialty_AC,Specialty_ADU,Specialty_CCE,Specialty_CCM,Specialty_EM,Specialty_FM,Specialty_GP,Specialty_HOS,Specialty_ID,Specialty_IM,Specialty_Other,Specialty_PCC,Specialty_PDP,Specialty_PUD
0,AC,2018-10,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,AC,2018-11,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,AC,2018-12,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,AC,2019-01,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,AC,2019-02,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,PUD,2022-09,143.0,94.0,13.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
724,PUD,2022-10,200.0,120.0,18.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
725,PUD,2022-11,185.0,117.0,14.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
726,PUD,2022-12,162.0,96.0,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [139]:
dict_pp['time_filtered_on_segment']

{'time_AC':    Specialty CallDate_ym  rx_count  P1_Arikayce  P2_Arikayce
 0         AC     2018-10       0.0          0.0          0.0
 1         AC     2018-11       0.0          0.0          0.0
 2         AC     2018-12       0.0          0.0          0.0
 3         AC     2019-01       0.0          0.0          0.0
 4         AC     2019-02       0.0          0.0          0.0
 5         AC     2019-03       0.0          0.0          0.0
 6         AC     2019-04       0.0          0.0          0.0
 7         AC     2019-05       1.0          2.0          0.0
 8         AC     2019-06       0.0          0.0          0.0
 9         AC     2019-07       1.0          2.0          0.0
 10        AC     2019-08       2.0          4.0          0.0
 11        AC     2019-09       0.0          0.0          0.0
 12        AC     2019-10       0.0          0.0          0.0
 13        AC     2019-11       1.0          1.0          0.0
 14        AC     2019-12       2.0          2.0          0

In [87]:
# average number per month per hcp by specialty
df_pp[['Specialty', 'rx_count', 'P1_Arikayce', 'P2_Arikayce']].groupby('Specialty').mean().reset_index()

Unnamed: 0,Specialty,rx_count,P1_Arikayce,P2_Arikayce
0,ADU,0.175101,0.118421,0.002024
1,CCM,0.444379,0.251479,0.005621
2,FM,0.131868,0.105181,0.008242
3,GP,0.165385,0.066484,0.000549
4,ID,0.209125,0.138719,0.005809
5,IM,0.222527,0.139361,0.003996
6,Other,0.105769,0.076923,0.004931
7,PCC,0.204254,0.148601,0.003788
8,PDP,0.038462,0.032967,0.000687
9,PUD,0.219093,0.155014,0.005767


In [88]:
df_pp.groupby('Specialty').agg({'P1_Arikayce':['mean','max']}).reset_index()

Unnamed: 0_level_0,Specialty,P1_Arikayce,P1_Arikayce
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max
0,ADU,0.118421,5.0
1,CCM,0.251479,7.0
2,FM,0.105181,5.0
3,GP,0.066484,3.0
4,ID,0.138719,14.0
5,IM,0.139361,7.0
6,Other,0.076923,5.0
7,PCC,0.148601,6.0
8,PDP,0.032967,3.0
9,PUD,0.155014,11.0


In [89]:
df_pp.groupby('Specialty').agg({'P2_Arikayce':['mean','max']}).reset_index()

Unnamed: 0_level_0,Specialty,P2_Arikayce,P2_Arikayce
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max
0,ADU,0.002024,1.0
1,CCM,0.005621,2.0
2,FM,0.008242,2.0
3,GP,0.000549,1.0
4,ID,0.005809,4.0
5,IM,0.003996,3.0
6,Other,0.004931,2.0
7,PCC,0.003788,2.0
8,PDP,0.000687,1.0
9,PUD,0.005767,3.0


# Transformations

In [90]:
transform = mmm_transformations.MMMTransformations()

In [91]:
df_t = transform.lag_dv(df_pp, 'rx_count', 3, 'Specialty')
df_t = transform.lag_dv(df_t, 'P1_Arikayce', 3, 'Specialty')
df_t = transform.lag_dv(df_t, 'P2_Arikayce', 3, 'Specialty')
df_t

Unnamed: 0,PHYSICIAN_NPI,CallDate_ym,rx_count,P1_Arikayce,P2_Arikayce,Specialty,Specialty_ADU,Specialty_CCM,Specialty_FM,Specialty_GP,...,Specialty_PUD,rx_count_lag1,rx_count_lag2,rx_count_lag3,P1_Arikayce_lag1,P1_Arikayce_lag2,P1_Arikayce_lag3,P2_Arikayce_lag1,P2_Arikayce_lag2,P2_Arikayce_lag3
0,1003031691,2018-10,0.0,0.0,0.0,PUD,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1003031691,2018-11,0.0,0.0,0.0,PUD,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1003031691,2018-12,0.0,0.0,0.0,PUD,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1003031691,2019-01,0.0,0.0,0.0,PUD,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1003031691,2019-02,0.0,0.0,0.0,PUD,0,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129787,1992970958,2022-09,0.0,0.0,0.0,CCM,0,1,0,0,...,0,7.0,5.0,4.0,4.0,3.0,2.0,0.0,0.0,0.0
129788,1992970958,2022-10,8.0,2.0,0.0,CCM,0,1,0,0,...,0,0.0,7.0,5.0,0.0,4.0,3.0,0.0,0.0,0.0
129789,1992970958,2022-11,12.0,0.0,0.0,CCM,0,1,0,0,...,0,8.0,0.0,7.0,2.0,0.0,4.0,0.0,0.0,0.0
129790,1992970958,2022-12,5.0,3.0,0.0,CCM,0,1,0,0,...,0,12.0,8.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0


# Final Model Fitting

In [92]:
modeling = mmm_modeling.MMMModeling()

In [93]:
channels = ['P1_Arikayce', 'P2_Arikayce']
lag_dv = [x for x in df_t.columns if 'rx_count_lag' in x]
lag_channels = [x for x in df_t.columns if ('lag' in x) & ('rx_count' not in x)]
specialty = [x for x in df_t.columns if x.startswith('Specialty_')]
X = df_t[channels + lag_channels + specialty]
y = df_t['rx_count']
model = modeling.rf_regressor(df_t, X.columns.tolist(), 'rx_count', 'CallDate_ym')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [94]:
# performance
model['performance']

{'full': {'r2': 0.5357339533361246,
  'rmse': 0.5271578267948946,
  'mape': 0.5404637429343603},
 'train': {'r2': 0.5678144074859719,
  'rmse': 0.50744464950672,
  'mape': 0.5334252228645834},
 'test': {'r2': 0.30122480936356755,
  'rmse': 0.6529384468257953,
  'mape': 0.6363656909835649}}

In [95]:
# importance
model['importance']

Unnamed: 0,feature,importance,std
0,P1_Arikayce,0.67971,0.012787
4,P1_Arikayce_lag3,0.079359,0.00903
3,P1_Arikayce_lag2,0.067193,0.009708
2,P1_Arikayce_lag1,0.060802,0.007512
1,P2_Arikayce,0.02638,0.002493
17,Specialty_PUD,0.013584,0.003518
12,Specialty_ID,0.013384,0.004058
9,Specialty_CCM,0.012892,0.002488
13,Specialty_IM,0.012751,0.003127
5,P2_Arikayce_lag1,0.008869,0.001634


In [96]:
# scatter plot of predictions
plot_df = pd.DataFrame({'date': model['df_preds_test']['CallDate_ym'].astype(str),
                        'preds': model['df_preds_test']['preds_test'],
                        'actual': model['df_preds_test']['rx_count']})
fig = px.scatter(plot_df, x="date", y=plot_df.columns.tolist(), title="Future Hold Out Set")
fig.show()

# Response Curves - Overall

In [97]:
response_curves = mmm_response_curves.MMMResponseCurves()

In [98]:
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lag_dv = [x for x in df_t.columns if 'rx_count_lag' in x]
lag_channels = [x for x in df_t.columns if ('lag' in x) & ('rx_count' not in x)]
X = df_t[channels + lag_channels + specialties]

In [99]:
X.describe()

Unnamed: 0,P1_Arikayce,P2_Arikayce,P1_Arikayce_lag1,P1_Arikayce_lag2,P1_Arikayce_lag3,P2_Arikayce_lag1,P2_Arikayce_lag2,P2_Arikayce_lag3,Specialty_ADU,Specialty_CCM,Specialty_FM,Specialty_GP,Specialty_ID,Specialty_IM,Specialty_Other,Specialty_PCC,Specialty_PDP,Specialty_PUD
count,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0,129792.0
mean,0.142281,0.005486,0.142274,0.142251,0.142251,0.005486,0.005486,0.005486,0.007612,0.026042,0.019631,0.014022,0.478766,0.061699,0.03125,0.026442,0.011218,0.323317
std,0.523191,0.087076,0.523185,0.523125,0.523125,0.087076,0.087076,0.087076,0.086915,0.15926,0.138731,0.117584,0.499551,0.240608,0.173993,0.160447,0.105319,0.467744
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
max,14.0,4.0,14.0,14.0,14.0,4.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [100]:
channel1 = response_curves.responses(model['full_model'], X, 'P1_Arikayce', 20, 1)


divide by zero encountered in reciprocal


divide by zero encountered in power



In [101]:
response_curves.plot(channel1['resp_df'], 'touches', ['P1_Arikayce', 'P1_Arikayce_hill_estimate'])

In [102]:
channel2 = response_curves.responses(model['full_model'], X, 'P2_Arikayce', 20, 1)


divide by zero encountered in reciprocal


divide by zero encountered in power



Error - curve_fit failed


In [69]:
response_curves.plot(channel2['resp_df'], 'touches', ['P2_Arikayce', 'P2_Arikayce_hill_estimate'])

# Response Curves - Segment

In [54]:
response_curves = mmm_response_curves.MMMResponseCurves()

In [55]:
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lag_dv = [x for x in df_t.columns if 'rx_count_lag' in x]
lag_channels = [x for x in df_t.columns if ('lag' in x) & ('rx_count' not in x)]
X = df_t[channels + lag_channels + specialties + ['Specialty']]
X[specialties] = 0



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [56]:
channel1_segment = response_curves.responses_segment(model['full_model'], X, 'P1_Arikayce', 20, 1, 'Specialty')


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power



In [61]:
channel1_segment['fig_raw']

In [57]:
channel1_segment['fig_hill']

In [58]:
channel2_segment = response_curves.responses_segment(model['full_model'], X, 'P2_Arikayce', 10, 1, 'Specialty')


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power


divide by zero encountered in reciprocal


divide by zero encountered in power



In [60]:
channel2_segment['fig_raw']

In [59]:
channel2_segment['fig_hill']

# Channel 1 Optimization

In [57]:
p1_hill = pd.DataFrame(channel1_segment['optimal_hill']).T.reset_index()
p1_hill.columns = ['feature', 'beta', 'gamma', 'alpha']
p1_hill

Unnamed: 0,feature,beta,gamma,alpha
0,Specialty_AC_P1_Arikayce,205.637938,51.313808,4.71236
1,Specialty_ADU_P1_Arikayce,205.517174,51.329996,4.719966
2,Specialty_CCE_P1_Arikayce,204.495296,51.466856,4.785168
3,Specialty_CCM_P1_Arikayce,184.168059,53.853232,6.000614
4,Specialty_EM_P1_Arikayce,205.726333,51.301955,4.706769
5,Specialty_FM_P1_Arikayce,204.875774,51.678124,5.028733
6,Specialty_GP_P1_Arikayce,203.352425,51.619572,4.85892
7,Specialty_HOS_P1_Arikayce,205.886561,51.280453,4.696636
8,Specialty_ID_P1_Arikayce,234.19112,48.921949,5.973733
9,Specialty_IM_P1_Arikayce,207.176496,49.692169,3.784883


In [58]:
# hyperopt hill
optimization = mmm_optimization.MMMOptimization(budget=354, params=p1_hill)
channels = p1_hill['feature'].tolist()
output = optimization.optimize_hyperopt_hill(channels, 2000)
output

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:10<00:00, 28.31trial/s, best loss: 99997133.878953]


{'mix': {'Specialty_AC_P1_Arikayce': 330,
  'Specialty_ADU_P1_Arikayce': 223,
  'Specialty_CCE_P1_Arikayce': 194,
  'Specialty_CCM_P1_Arikayce': 340,
  'Specialty_EM_P1_Arikayce': 174,
  'Specialty_FM_P1_Arikayce': 171,
  'Specialty_GP_P1_Arikayce': 250,
  'Specialty_HOS_P1_Arikayce': 222,
  'Specialty_ID_P1_Arikayce': 266,
  'Specialty_IM_P1_Arikayce': 211,
  'Specialty_Other_P1_Arikayce': 330,
  'Specialty_PCC_P1_Arikayce': 188,
  'Specialty_PDP_P1_Arikayce': 252,
  'Specialty_PUD_P1_Arikayce': 221},
 'trials': [{'loss': 99997911.78291155, 'status': 'ok'},
  {'loss': 99997367.74888934, 'status': 'ok'},
  {'loss': 99997252.43217513, 'status': 'ok'},
  {'loss': 99997818.15737288, 'status': 'ok'},
  {'loss': 99997880.892491, 'status': 'ok'},
  {'loss': 99997516.23347887, 'status': 'ok'},
  {'loss': 99997876.8572574, 'status': 'ok'},
  {'loss': 99997331.26427692, 'status': 'ok'},
  {'loss': 99997140.25295413, 'status': 'ok'},
  {'loss': 99997584.77514112, 'status': 'ok'},
  {'loss': 9999

# ID Budgeting

In [71]:
optimization = mmm_optimization.MMMOptimization(budget=135)
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lag_dv = [x for x in df_t.columns if 'rx_count_lag' in x]
lag_channels = [x for x in df_t.columns if ('lag' in x) & ('rx_count' not in x)]
X = df_t[channels + lag_channels + specialties]
X[specialties] = 0
X['Specialty_ID'] = 1
output = optimization.optimize_predict(X, channels, 2000, model['full_model'])
output

  0%|                                                                                                                             | 0/2000 [00:00<?, ?trial/s, best loss=?]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:02<00:00, 31.98trial/s, best loss: -264.04118068466823]


{'mix': {'P1_Arikayce': 90, 'P2_Arikayce': 3},
 'trials': [{'loss': 99999740.31881931, 'status': 'ok'},
  {'loss': -123.15118068466835, 'status': 'ok'},
  {'loss': 99999738.89881931, 'status': 'ok'},
  {'loss': 99999785.74881932, 'status': 'ok'},
  {'loss': -34.57118068466833, 'status': 'ok'},
  {'loss': -126.65118068466835, 'status': 'ok'},
  {'loss': -6.371180684668324, 'status': 'ok'},
  {'loss': -260.7011806846684, 'status': 'ok'},
  {'loss': 99999740.31881931, 'status': 'ok'},
  {'loss': 99999740.65881932, 'status': 'ok'},
  {'loss': -123.15118068466835, 'status': 'ok'},
  {'loss': -11.851180684668325, 'status': 'ok'},
  {'loss': 99999732.60881932, 'status': 'ok'},
  {'loss': 99999741.44881931, 'status': 'ok'},
  {'loss': 99999732.56881931, 'status': 'ok'},
  {'loss': -19.181180684668327, 'status': 'ok'},
  {'loss': -124.69118068466831, 'status': 'ok'},
  {'loss': 99999872.34881932, 'status': 'ok'},
  {'loss': 99999740.15881932, 'status': 'ok'},
  {'loss': 99999952.48881932, 'stat

# Overall Budget

In [74]:
optimization = mmm_optimization.MMMOptimization(budget=368)
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lag_dv = [x for x in df_t.columns if 'rx_count_lag' in x]
lag_channels = [x for x in df_t.columns if ('lag' in x) & ('rx_count' not in x)]
X = df_t[channels + lag_channels + specialties]
output = optimization.optimize_predict(X, channels, 2000, model['full_model'])
output

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:01<00:00, 32.58trial/s, best loss: -271.404799150921]


{'mix': {'P1_Arikayce': 288, 'P2_Arikayce': 13},
 'trials': [{'loss': 99999736.8873437, 'status': 'ok'},
  {'loss': 99999778.46091513, 'status': 'ok'},
  {'loss': -268.7090848652068, 'status': 'ok'},
  {'loss': -255.94622772234968, 'status': 'ok'},
  {'loss': -221.10908486520682, 'status': 'ok'},
  {'loss': 99999758.573058, 'status': 'ok'},
  {'loss': 99999784.95734371, 'status': 'ok'},
  {'loss': 99999743.05377227, 'status': 'ok'},
  {'loss': 99999743.05377227, 'status': 'ok'},
  {'loss': -8.623370579492526, 'status': 'ok'},
  {'loss': -195.39979915092113, 'status': 'ok'},
  {'loss': 99999790.75448656, 'status': 'ok'},
  {'loss': -262.32837057949257, 'status': 'ok'},
  {'loss': -33.75122772234967, 'status': 'ok'},
  {'loss': 99999736.60591513, 'status': 'ok'},
  {'loss': -208.24551343663535, 'status': 'ok'},
  {'loss': 99999743.05377227, 'status': 'ok'},
  {'loss': 99999743.05377227, 'status': 'ok'},
  {'loss': -218.0883705794926, 'status': 'ok'},
  {'loss': -207.00408486520678, 'stat

In [None]:
# dont need lag dv - point is to estimate the impact of channels as best as possible and that happens when their importance is high