In [89]:
import pandas as pd
import pyodbc
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from dateutil.rrule import rrule, MONTHLY
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.optimize import minimize
from scipy.optimize import curve_fit
import shap
import numdifftools as nd
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
shap.initjs()
%load_ext autoreload
%autoreload 2
import mmm_transformations
import mmm_preprocessing
import mmm_feature_selection
import mmm_modeling
import mmm_response_curves
import mmm_optimization

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data Pull

In [136]:
# Open an ODBC connection to the SQL Server instance that holds the dispense,
# HCP-dimension and call tables referenced in the query below.
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=PRDINS10DB2;'
                      'Port=1433;')
# One row per (Insmed_HCP_Id, PHYSICIAN_NPI, CallDate_ym) with:
#   - P1_Arikayce / P2_Arikayce: distinct calls for the two listed product ids
#     at Detail_Priority 1.0 / 2.0,
#   - distinct_patients and rx_count: distinct patients and Rx numbers.
# The WHERE clause requires a shipment and a call in the same year and month,
# so rows left unmatched by the FULL OUTER JOINs are filtered out again.
sql_stmt = """SELECT [Insmed_HCP_Id], [PHYSICIAN_NPI], [CallDate_ym], MAX([PrimarySpecialty]) as Specialty, \n
                     count(distinct (case when Product in ('a0058000005VpNDAA0', 'a0058000005UplCAAS') AND Detail_Priority=1.0 then Veeva_call_Id end)) as P1_Arikayce, \n
                     count(distinct (case when Product in ('a0058000005VpNDAA0', 'a0058000005UplCAAS') AND Detail_Priority=2.0 then Veeva_call_Id end)) as P2_Arikayce, \n
                     count(distinct [PatientMasterID] ) as distinct_patients, \n
                     count(distinct [RX_NUMBER]) as rx_count \n
              FROM(SELECT a.[PatientMasterID], a.[PHYSICIAN_NPI], a.[DATE_SHIPPED], \n
                          a.RX_NUMBER, b.[InsmedID], b.[PrimarySpecialty], b.[NpiID], c.[Veeva_Call_ID], c.[Insmed_HCP_Id], \n
                          c.[Detail_Priority], c.[Product], c.[Insmed_Call_Type], c.[CallDate], LEFT(c.[CallDate], 7) AS CallDate_ym \n
                   FROM [Insmed_Adhoc].[dbo].[tblConsolidated_Dispense] a \n
                   FULL OUTER JOIN [Insmed_Adhoc].[dbo].[tblDF_Shyft_dimHCP_Adhoc] b ON A.[PHYSICIAN_NPI] = b.NpiID \n
                   FULL OUTER JOIN [Insmed_Adhoc].[dbo].[tblDF_Shyft_Call_Adhoc] c ON b.[InsmedID]= c.[Insmed_HCP_Id] \n
                   WHERE YEAR(a.[DATE_SHIPPED]) =  YEAR(c.[CallDate]) AND MONTH(a.DATE_SHIPPED) = MONTH(c.[CallDate]) \n
                   and c.Insmed_Call_Type in ('HCP and Staff','HCP Only','In Person','Live – HCP and Staff',\n
                                              'Live – HCP Only','Phone Call','Phone Call w/HCP','Phone Call w/Staff',\n
                                              'Virtual – HCP Only','Virtual Interaction')
                   and a.DISPENSE_TYPE='Product' and a.DISPENSE_STATUS not in ('V','R')) as SUBQUERY
              GROUP BY [Insmed_HCP_Id], [PHYSICIAN_NPI], [CallDate_ym]
              ORDER BY [Insmed_HCP_Id], [CallDate_ym]"""
try:
    df = pd.read_sql(sql_stmt, conn)
finally:
    # Release the database connection even if the query fails; previously it
    # stayed open for the lifetime of the kernel.
    conn.close()
df

Unnamed: 0,Insmed_HCP_Id,PHYSICIAN_NPI,CallDate_ym,Specialty,P1_Arikayce,P2_Arikayce,distinct_patients,rx_count
0,INSP20000000,1932322948,2018-12,PUD,1,0,2,2
1,INSP20000000,1932322948,2019-02,PUD,1,0,2,2
2,INSP20000000,1932322948,2019-06,PUD,1,0,2,2
3,INSP20000000,1932322948,2019-10,PUD,1,0,6,6
4,INSP20000000,1932322948,2020-06,PUD,1,0,3,3
...,...,...,...,...,...,...,...,...
16845,INSP20134388,1144746934,2023-02,ID,0,0,1,1
16846,INSP20134500,1720513641,2023-02,ID,1,0,1,1
16847,INSP20134876,1912038944,2023-01,PHM,0,0,1,1
16848,INSP20135374,1518345289,2023-01,IMD,1,0,1,1


# Preprocessing

In [137]:
# Column roles for the preprocessing helper. The grouping column and the
# segment column are both 'Specialty' in this run.
group_by = 'Specialty'
segment = 'Specialty'
# Month key produced by the SQL query (CallDate_ym).
date = 'CallDate_ym'
# Call-count columns treated as media channels.
media_cols = ['P1_Arikayce', 'P2_Arikayce']
# Dependent variable: count of distinct Rx numbers.
dv = 'rx_count'
# Project-local helper; the five settings above are passed positionally.
preprocess = mmm_preprocessing.MMMPreprocessing(group_by, segment, date, media_cols, dv)

In [138]:
# Aggregate the HCP-level rows to one row per (segment, month): sum the
# dependent variable and the media columns, then sort by segment and month.
df_pp = (
    df[[segment, date, dv] + media_cols]
    .groupby([segment, date])
    .sum()
    .reset_index()
    .sort_values([segment, date])
)
# Project-local cleanup step; the recorded output shows it adds one
# Specialty_* indicator column per specialty.
df_pp = preprocess.cleanup(df_pp)
# Keep months up to and including 2023-01. The month key is a 'YYYY-MM'
# string, so a plain string comparison orders it chronologically. The column
# is addressed through `date` (as in the lines above) instead of repeating
# the literal column name.
df_pp = df_pp[df_pp[date] <= '2023-01']
df_pp

Unnamed: 0,Specialty,CallDate_ym,rx_count,P1_Arikayce,P2_Arikayce,Specialty_AC,Specialty_ADU,Specialty_CCE,Specialty_CCM,Specialty_EM,Specialty_FM,Specialty_GP,Specialty_HOS,Specialty_ID,Specialty_IM,Specialty_Other,Specialty_PCC,Specialty_PDP,Specialty_PUD
0,AC,2018-10,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,AC,2018-11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,AC,2018-12,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,AC,2019-01,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,AC,2019-02,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,PUD,2022-09,143,94,13,0,0,0,0,0,0,0,0,0,0,0,0,0,1
737,PUD,2022-10,200,120,18,0,0,0,0,0,0,0,0,0,0,0,0,0,1
738,PUD,2022-11,185,117,14,0,0,0,0,0,0,0,0,0,0,0,0,0,1
739,PUD,2022-12,162,96,10,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [139]:
# Per-specialty monthly averages of the dependent variable and both channels.
df_pp.groupby('Specialty')[['rx_count', 'P1_Arikayce', 'P2_Arikayce']].mean().reset_index()

Unnamed: 0,Specialty,rx_count,P1_Arikayce,P2_Arikayce
0,AC,0.634615,0.384615,0.057692
1,ADU,3.346154,2.25,0.038462
2,CCE,1.942308,1.038462,0.038462
3,CCM,29.057692,16.461538,0.365385
4,EM,0.75,0.461538,0.019231
5,FM,6.461538,5.153846,0.403846
6,GP,6.538462,2.730769,0.038462
7,HOS,0.711538,0.673077,0.0
8,ID,250.634615,166.423077,6.942308
9,IM,33.615385,20.557692,0.615385


# Transformations

In [140]:
# Helper object from the project-local mmm_transformations module.
transform = mmm_transformations.MMMTransformations()

In [141]:
# Add lagged copies of rx_count; the recorded output shows the new columns
# rx_count_lag1 .. rx_count_lag3.
# NOTE(review): the last argument is presumably the grouping column so that
# lags do not cross specialties — confirm against mmm_transformations.lag_dv.
df_t = transform.lag_dv(df_pp, 'rx_count', 3, 'Specialty')
df_t

Unnamed: 0,Specialty,CallDate_ym,rx_count,P1_Arikayce,P2_Arikayce,Specialty_AC,Specialty_ADU,Specialty_CCE,Specialty_CCM,Specialty_EM,...,Specialty_HOS,Specialty_ID,Specialty_IM,Specialty_Other,Specialty_PCC,Specialty_PDP,Specialty_PUD,rx_count_lag1,rx_count_lag2,rx_count_lag3
0,AC,2018-10,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AC,2018-11,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AC,2018-12,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AC,2019-01,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AC,2019-02,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,PUD,2022-09,143,94,13,0,0,0,0,0,...,0,0,0,0,0,0,1,203,152,167
737,PUD,2022-10,200,120,18,0,0,0,0,0,...,0,0,0,0,0,0,1,143,203,152
738,PUD,2022-11,185,117,14,0,0,0,0,0,...,0,0,0,0,0,0,1,200,143,203
739,PUD,2022-12,162,96,10,0,0,0,0,0,...,0,0,0,0,0,0,1,185,200,143


# Final Model Fitting

In [142]:
# Helper object from the project-local mmm_modeling module.
modeling = mmm_modeling.MMMModeling()

In [143]:
# Assemble the feature list (call channels, lagged rx_count columns and
# specialty indicators) and fit the model through modeling.rf_regressor.
channels = ['P1_Arikayce', 'P2_Arikayce']
all_cols = df_t.columns
specialties = all_cols[all_cols.str.startswith('Specialty_')].tolist()
lagged_dv = all_cols[all_cols.str.contains('lag', regex=False)].tolist()
feature_cols = channels + lagged_dv + specialties
X = df_t[feature_cols]
y = df_t['rx_count']
# A fresh list is passed so the helper never shares state with feature_cols.
model = modeling.rf_regressor(df_t, list(feature_cols), 'rx_count', 'CallDate_ym')

In [144]:
# performance
# Fit metrics stored under model['performance'] ('full' and 'test' blocks).
# NOTE(review): the recorded MAPE values are on the order of 1e14 while r2 and
# rmse look plausible. rx_count contains zeros (see the preprocessing output),
# which presumably blows up a percentage error — confirm how mmm_modeling
# computes MAPE before quoting it.
model['performance']

{'full': {'r2': 0.9971694277201056,
  'rmse': 4.065413812415974,
  'mape': 197254065529309.53},
 'test': {'r2': 0.9868792427378765,
  'rmse': 9.223767634394713,
  'mape': 219056269679708.6}}

In [145]:
# importance
# Feature-importance table stored under model['importance']. In the recorded
# output the lagged rx_count columns rank above both call channels.
model['importance']

Unnamed: 0,feature,importance,std
2,rx_count_lag1,0.7020556,0.4068233
3,rx_count_lag2,0.2278133,0.3885524
4,rx_count_lag3,0.02981225,0.1270958
0,P1_Arikayce,0.02371147,0.09043489
18,Specialty_PUD,0.006773646,0.01346406
13,Specialty_ID,0.005569726,0.01278801
1,P2_Arikayce,0.00391124,0.002217892
16,Specialty_PCC,0.000197247,0.0002593196
8,Specialty_CCM,4.608421e-05,5.872549e-05
11,Specialty_GP,3.558238e-05,3.189142e-05


In [146]:
# scatter plot of predictions
# Pair the predictions stored under model['preds_full'] with the observed
# rx_count values (y) and plot one against the other.
plot_df = pd.DataFrame({'preds': model['preds_full'], 'actual': y})
fig = px.scatter(plot_df, x="preds", y="actual")
fig.show()

# Response Curves - Isolated Impact

In [149]:
# Helper object from the project-local mmm_response_curves module.
response_curves = mmm_response_curves.MMMResponseCurves()

In [150]:
# Build the baseline frame for the isolated-impact response curves: model
# features plus the 'Specialty' and 'CallDate_ym' identifier columns.
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_dv = [x for x in df_t.columns if 'lag' in x]
# .copy() makes X an independent frame, so the assignment below is not a
# write to a slice of df_t (avoids pandas' SettingWithCopyWarning).
X = df_t[channels + lagged_dv + specialties + ['Specialty', 'CallDate_ym']].copy()
# Zero out both call channels to form the no-activity baseline.
X[channels] = 0

In [151]:
# overall response curves
# One 'overall' response-curve run per call channel, using model['full_model']
# and the zeroed baseline frame X.
# NOTE(review): the positional 200 and 1 are presumably the maximum number of
# touches and the step size — confirm against mmm_response_curves.responses.
# The recorded output shows repeated pandas "DataFrame is highly fragmented"
# warnings from these calls; they presumably come from column-by-column
# inserts inside the helper.
channel1 = response_curves.responses('overall', model['full_model'], X, 'P1_Arikayce', 200, 1, 'Specialty', 'CallDate_ym')
channel2 = response_curves.responses('overall', model['full_model'], X, 'P2_Arikayce', 200, 1, 'Specialty', 'CallDate_ym')


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`



In [152]:
# Plot the P1_Arikayce response column and its hill-estimate column against 'touches'.
response_curves.plot(channel1['resp_df'], 'touches', ['P1_Arikayce', 'P1_Arikayce_hill_estimate'])

In [153]:
# Plot the P2_Arikayce response column and its hill-estimate column against 'touches'.
response_curves.plot(channel2['resp_df'], 'touches', ['P2_Arikayce', 'P2_Arikayce_hill_estimate'])

In [154]:
# Place both channels' response frames side by side (dropping channel 2's
# duplicate 'touches' column) and plot the two response columns together.
# pd.concat(axis=1) aligns on the row index of the two frames.
overall_resp = pd.concat([channel1['resp_df'], channel2['resp_df'].drop(['touches'], axis=1)], axis=1)
fig = response_curves.plot(overall_resp, 'touches', ['P1_Arikayce', 'P2_Arikayce'])
fig

In [155]:
# Same arguments as the overall P1_Arikayce run, but in 'segment' mode.
# NOTE(review): the recorded output prints "Error - curve_fit failed" once, and
# one segment (PUD) later shows [0, 0, 0] as its hill parameters — presumably
# the fallback for a failed fit; confirm in mmm_response_curves.
channel1_segment = response_curves.responses('segment', model['full_model'], X, 'P1_Arikayce', 200, 1, 'Specialty', 'CallDate_ym')


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

Error - curve_fit failed


In [156]:
# Display the figure stored under 'fig_raw' by the segment-level P1_Arikayce run.
channel1_segment['fig_raw']

In [157]:
# Display the figure stored under 'fig_hill' by the segment-level P1_Arikayce run.
channel1_segment['fig_hill']

In [158]:
# Segment-mode response curves for P2_Arikayce.
# NOTE(review): the recorded output prints "Error - curve_fit failed" three
# times, and three segments (CCE, GP, PCC) later show [0, 0, 0] as their hill
# parameters — presumably the fallback for failed fits; confirm in
# mmm_response_curves.
channel2_segment = response_curves.responses('segment', model['full_model'], X, 'P2_Arikayce', 200, 1, 'Specialty', 'CallDate_ym',)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

Error - curve_fit failed
Error - curve_fit failed
Error - curve_fit failed


In [159]:
# Display the figure stored under 'fig_raw' by the segment-level P2_Arikayce run.
channel2_segment['fig_raw']

In [160]:
# Display the figure stored under 'fig_hill' by the segment-level P2_Arikayce run.
channel2_segment['fig_hill']

In [161]:
# Zoomed view of the per-segment P2_Arikayce responses (non-hill columns),
# restricted to at most 20 touches.
plot_df = channel2_segment['resp_df'].copy()
plot_df = plot_df.loc[plot_df['touches'] <= 20]
cols = [name for name in plot_df.columns if 'P2_Arikayce' in name and 'hill' not in name]
fig = response_curves.plot(plot_df, 'touches', cols)
fig

In [162]:
# Zoomed view of the per-segment hill-estimate columns, restricted to at
# most 20 touches.
plot_df = channel2_segment['resp_df'].copy()
plot_df = plot_df.loc[plot_df['touches'] <= 20]
cols = [name for name in plot_df.columns if name.endswith('hill_estimate')]
fig = response_curves.plot(plot_df, 'touches', cols)
fig

# Responses - Simulating segments

In [179]:
# Simulated-segment response curves for P1_Arikayce.
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_dv = [x for x in df_t.columns if 'lag' in x]
# .copy() makes X an independent frame; without it the assignment below
# triggered pandas' SettingWithCopyWarning (visible in the recorded output).
X = df_t[channels + lagged_dv + specialties + ['Specialty']].copy()
# Zero both call channels and every specialty indicator column.
# NOTE(review): responses_segment receives the indicator column list,
# presumably to switch one segment on at a time — confirm in
# mmm_response_curves.
X[channels + specialties] = 0
channel1_v2 = response_curves.responses_segment(model['full_model'], X, 'P1_Arikayce', 200, 1, specialties, 'Specialty')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [175]:
# Display the figure stored under 'fig_raw' by the simulated-segment P1_Arikayce run.
channel1_v2['fig_raw']

In [176]:
# Display the figure stored under 'fig_hill' by the simulated-segment P1_Arikayce run.
channel1_v2['fig_hill']

In [178]:
# Simulated-segment response curves for P2_Arikayce.
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_dv = [x for x in df_t.columns if 'lag' in x]
# .copy() makes X an independent frame; without it the assignment below
# triggered pandas' SettingWithCopyWarning (visible in the recorded output).
X = df_t[channels + lagged_dv + specialties + ['Specialty']].copy()
# Zero both call channels and every specialty indicator column.
# NOTE(review): responses_segment receives the indicator column list,
# presumably to switch one segment on at a time — confirm in
# mmm_response_curves.
X[channels + specialties] = 0
channel2_v2 = response_curves.responses_segment(model['full_model'], X, 'P2_Arikayce', 200, 1, specialties, 'Specialty')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [180]:
# Display the figure stored under 'fig_raw' by the simulated-segment P2_Arikayce run.
channel2_v2['fig_raw']

In [181]:
# Display the figure stored under 'fig_hill' by the simulated-segment P2_Arikayce run.
channel2_v2['fig_hill']

# Budgeting - Overall

In [163]:
# Three fitted hill parameters for the overall P1_Arikayce curve; a later cell
# labels them beta, alpha, gamma in this order.
channel1['optimal_hill']

array([33.2384515 , 51.64850776,  0.97968014])

In [164]:
# Three fitted hill parameters for the overall P2_Arikayce curve; a later cell
# labels them beta, alpha, gamma in this order.
channel2['optimal_hill']

array([2.92676499, 3.00675176, 1.17268969])

In [165]:
# Stack the two overall hill-parameter vectors into one row per channel and
# label the columns. Each one-row frame keeps its own index, so both rows
# carry index 0, as in the recorded output.
hill_rows = [pd.DataFrame(fit['optimal_hill']).T for fit in (channel1, channel2)]
overall_hill = pd.concat(hill_rows)
overall_hill['feature'] = channels
overall_hill.columns = ['beta', 'alpha', 'gamma', 'feature']
overall_hill

Unnamed: 0,beta,alpha,gamma,feature
0,33.238452,51.648508,0.97968,P1_Arikayce
0,2.926765,3.006752,1.17269,P2_Arikayce


In [166]:
# Allocate a total budget of 368 across the two channels using the overall
# hill parameters.
# NOTE(review): start_vals is presumably the initial allocation per channel in
# the row order of overall_hill — confirm in mmm_optimization.optimize_hill.
# In the recorded result the solution x sums to the budget (344.48 + 23.52).
optimization = mmm_optimization.MMMOptimization(budget=368, params=overall_hill)
start_vals = [110, 5]
output = optimization.optimize_hill(start_vals)
output

     fun: -31.443367753207635
     jac: array([-0.01102614, -0.01101136])
 message: 'Optimization terminated successfully'
    nfev: 68
     nit: 22
    njev: 22
  status: 0
 success: True
       x: array([344.47633931,  23.52366069])


# Budgeting - Channel 1

In [167]:
# Per-segment hill parameters for P1_Arikayce, keyed '<segment>_P1_Arikayce'.
# NOTE(review): an entry of [0, 0, 0] (PUD in the recorded output) presumably
# marks a segment whose curve fit failed — confirm in mmm_response_curves.
channel1_segment['optimal_hill']

{'AC_P1_Arikayce': array([25.47359328, 17.04828837,  1.73345038]),
 'ADU_P1_Arikayce': array([23.43274027, 18.11193488,  1.71735675]),
 'CCE_P1_Arikayce': array([23.74334153, 18.40299591,  1.8364346 ]),
 'CCM_P1_Arikayce': array([43.81966966, 79.57142952,  1.46162811]),
 'EM_P1_Arikayce': array([25.16306781, 17.18581156,  1.74043424]),
 'FM_P1_Arikayce': array([20.4262238 , 20.59215791,  1.75334447]),
 'GP_P1_Arikayce': array([19.62533398, 20.94367751,  1.8467349 ]),
 'HOS_P1_Arikayce': array([25.37539533, 17.15385449,  1.74099369]),
 'ID_P1_Arikayce': array([ 35.61087121, 142.00023255,   5.18602519]),
 'IM_P1_Arikayce': array([42.45836072, 76.22739064,  1.48254761]),
 'Other_P1_Arikayce': array([23.08515381, 18.1429774 ,  1.74080832]),
 'PCC_P1_Arikayce': array([21.45750978, 24.00748649,  1.22318919]),
 'PDP_P1_Arikayce': array([25.02219607, 17.31498469,  1.74674026]),
 'PUD_P1_Arikayce': [0, 0, 0]}

In [168]:
# One row per segment with the three hill parameters as columns.
params = pd.DataFrame(channel1_segment['optimal_hill']).T
params.columns = ['beta','alpha','gamma']
optimization = mmm_optimization.MMMOptimization(budget=354, params=params)
# 14 hand-written start values, matching the 14 segments in the recorded
# parameter dict; the list must be kept in sync with params by hand.
# NOTE(review): presumably one start value per row of params, in row order —
# confirm in mmm_optimization.optimize_hill.
start_vals = [1,1,1,110,1,1,1,1,110,110,1,110,1,1]
# NOTE(review): the recorded run emits "divide by zero" and "invalid value"
# warnings. params contains an all-zero row (PUD), which is a likely cause —
# consider excluding segments with failed fits before optimizing.
output = optimization.optimize_hill(start_vals)
output

     fun: -80.53364658961347
     jac: array([ 0.00000000e+00, -9.53674316e-07,  0.00000000e+00, -2.10057259e-01,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -2.10089684e-01, -2.10050583e-01,  0.00000000e+00, -2.10094452e-01,
        0.00000000e+00,  0.00000000e+00])
 message: 'Optimization terminated successfully'
    nfev: 452
     nit: 30
    njev: 30
  status: 0
 success: True
       x: array([0.00000000e+00, 2.51551035e-16, 1.29386623e-16, 7.61484447e+01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.72385006e+02, 7.49054729e+01, 0.00000000e+00, 3.05610765e+01,
       0.00000000e+00, 0.00000000e+00])



divide by zero encountered in double_scalars


invalid value encountered in double_scalars



# Budgeting - Channel 2

In [169]:
# Per-segment hill parameters for P2_Arikayce, keyed '<segment>_P2_Arikayce'.
# NOTE(review): entries of [0, 0, 0] (CCE, GP, PCC in the recorded output)
# presumably mark segments whose curve fit failed; several other rows hold
# extreme values (e.g. the second parameter ~7e4 for CCM) — review before use.
channel2_segment['optimal_hill']

{'AC_P2_Arikayce': array([ 0.54869712,  0.48838909, 10.1338168 ]),
 'ADU_P2_Arikayce': array([0.45498362, 0.7426851 , 3.83274125]),
 'CCE_P2_Arikayce': [0, 0, 0],
 'CCM_P2_Arikayce': array([1.14423200e-02, 7.09581172e+04, 1.37261531e+00]),
 'EM_P2_Arikayce': array([ 0.55021474,  0.82224488, 16.35287387]),
 'FM_P2_Arikayce': array([0.43843325, 0.65459445, 2.48762584]),
 'GP_P2_Arikayce': [0, 0, 0],
 'HOS_P2_Arikayce': array([ 0.47349486,  0.71793998, 14.21006929]),
 'ID_P2_Arikayce': array([29.56562831,  2.76910349,  1.16252971]),
 'IM_P2_Arikayce': array([2.77747412e-03, 6.97452495e+03, 3.82140003e-08]),
 'Other_P2_Arikayce': array([0.38984674, 0.67357693, 3.12171918]),
 'PCC_P2_Arikayce': [0, 0, 0],
 'PDP_P2_Arikayce': array([ 0.5321501 ,  0.89696534, 25.24641325]),
 'PUD_P2_Arikayce': array([7.25278824, 9.82045901, 3.47762656])}

In [170]:
# Channel 2: allocate the budget across segments from the per-segment
# 'optimal_hill' entries. After the transpose each segment is one row and its
# three values are labelled beta / alpha / gamma.
params = pd.DataFrame(channel2_segment['optimal_hill']).transpose()
params.columns = ['beta', 'alpha', 'gamma']

optimization = mmm_optimization.MMMOptimization(params=params, budget=14)

# 14 starting values -- presumably one per segment row of `params`, in the
# same order; confirm against optimize_hill.
start_vals = [
    1, 1, 1, 4, 1, 1, 1,
    1, 1, 4, 1, 1, 1, 4,
]
output = optimization.optimize_hill(start_vals)
output

     fun: -25.665767513971034
     jac: array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.28123212,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ])
 message: 'Optimization terminated successfully'
    nfev: 130
     nit: 8
    njev: 8
  status: 0
 success: True
       x: array([0.00000000e+00, 0.00000000e+00, 4.50945628e-16, 0.00000000e+00,
       0.00000000e+00, 1.68769472e-16, 8.34407283e-17, 6.06694150e-17,
       1.39986574e+01, 1.34260380e-03, 2.10383238e-16, 1.51021528e-17,
       0.00000000e+00, 2.22044605e-16])



overflow encountered in double_scalars



# Budgeting Overall - Raw

In [171]:
# Overall (all segments) mix search driven by the fitted full model rather
# than the per-channel Hill parameters.
optimization = mmm_optimization.MMMOptimization(budget=368)

channels = ['P1_Arikayce', 'P2_Arikayce']
# Feature groups are picked up from df_t by column-name pattern.
specialties = [col for col in df_t.columns if col.startswith('Specialty_')]
lagged_dv = [col for col in df_t.columns if 'lag' in col]

feature_cols = channels + lagged_dv + specialties
X = df_t[feature_cols]

# 1000 appears to be the number of search trials (the progress bar in the
# output counts to 1000) -- confirm against optimize_predict.
output = optimization.optimize_predict(X, channels, 1000, model['full_model'])
output

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:39<00:00, 10.02trial/s, best loss: -6257777409.9496975]


{'mix': {'P1_Arikayce': 280, 'P2_Arikayce': 11},
 'trials': [{'loss': -5366652418.860948, 'status': 'ok'},
  {'loss': -6126494444.22956, 'status': 'ok'},
  {'loss': -6230366695.93809, 'status': 'ok'},
  {'loss': -4382727978.150742, 'status': 'ok'},
  {'loss': -5176361211.9726515, 'status': 'ok'},
  {'loss': -4796847479.504052, 'status': 'ok'},
  {'loss': -5139965607.941003, 'status': 'ok'},
  {'loss': -5601314504.4264145, 'status': 'ok'},
  {'loss': -6230366695.93809, 'status': 'ok'},
  {'loss': -5704690877.019024, 'status': 'ok'},
  {'loss': -4781195007.133104, 'status': 'ok'},
  {'loss': -6230366695.93809, 'status': 'ok'},
  {'loss': -5599249944.0075, 'status': 'ok'},
  {'loss': -5594466977.022363, 'status': 'ok'},
  {'loss': -4259633199.16191, 'status': 'ok'},
  {'loss': -6058819994.356854, 'status': 'ok'},
  {'loss': -5793222469.540302, 'status': 'ok'},
  {'loss': -5872637303.91099, 'status': 'ok'},
  {'loss': -5662600218.099272, 'status': 'ok'},
  {'loss': -6087977961.098242, 'sta

# Budgeting Segment 1 - Raw

In [135]:
# Segment-level scenario: same search as the overall run, but every row is
# forced into a single specialty (Specialty_ID) before predicting.
optimization = mmm_optimization.MMMOptimization(budget=368)
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_dv = [x for x in df_t.columns if 'lag' in x]
# Take an explicit copy: X is overwritten below, and assigning into a bare
# column selection of df_t raises SettingWithCopyWarning. The copy also makes
# it unambiguous that df_t itself is never modified by this scenario.
X = df_t[channels + lagged_dv + specialties].copy()
# Zero out every specialty indicator, then switch on only the target segment.
X[specialties] = 0
X['Specialty_ID'] = 1  # assumes 'Specialty_ID' is one of `specialties` -- TODO confirm
# 1000 appears to be the number of search trials (the progress bar in the
# output counts to 1000) -- confirm against optimize_predict.
output = optimization.optimize_predict(X, channels, 1000, model['full_model'])
output

  0%|                                                                                                                           | 0/1000 [00:00<?, ?trial/s, best loss=?]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:37<00:00, 10.29trial/s, best loss: -6515951857.917404]


{'mix': {'P1_Arikayce': 299, 'P2_Arikayce': 13},
 'trials': [{'loss': -5545475219.270521, 'status': 'ok'},
  {'loss': -5334877693.903969, 'status': 'ok'},
  {'loss': -6170488949.28412, 'status': 'ok'},
  {'loss': -5864798018.275096, 'status': 'ok'},
  {'loss': -3798570177.215763, 'status': 'ok'},
  {'loss': -5559418900.449766, 'status': 'ok'},
  {'loss': -5881968347.773722, 'status': 'ok'},
  {'loss': -6504032901.992638, 'status': 'ok'},
  {'loss': -6504032901.992638, 'status': 'ok'},
  {'loss': -5810865326.50673, 'status': 'ok'},
  {'loss': -6175747190.98978, 'status': 'ok'},
  {'loss': -6504032901.992638, 'status': 'ok'},
  {'loss': -5051212861.575783, 'status': 'ok'},
  {'loss': -3672030320.4225535, 'status': 'ok'},
  {'loss': -6504032901.992638, 'status': 'ok'},
  {'loss': -5860358457.879931, 'status': 'ok'},
  {'loss': -6355234826.557541, 'status': 'ok'},
  {'loss': -6221230707.018462, 'status': 'ok'},
  {'loss': -6503292517.384656, 'status': 'ok'},
  {'loss': -5864798018.275096, 

# Insight

In [None]:
# Troubleshooting
    # Getting hyperopt to optimize mix <= budget
    # Segment level hyperopt mix
        # This should be trying to get interactions among channels within a specified specialty
        # But then how do we get optimal mix across segments within media channel
    # To include multiple segments we need to build either one HCP-level model or a separate segment-level model for each segment type
        # Problem with hcp level is data is too sparse
    # Response curves with interactions
    # the impact when broken out by segments is sometimes higher than overall - how can we rationalize this?
    # simulating segments also changes response curves
    # corrected issue with lag dv
    # need to account for lagged channels inside of prediction optimization
    # results of mix don't make sense - optimal spend is double digits but I'm specifying increments of 10000
    # sometimes the Hill function doesn't fit

In [67]:
# Compared to the first model, the differences are:
    # impact of media shifts towards lag DV features
    # segment level insight changes
    # no adstock changes shape of curves