In [1]:
import pandas as pd
import pyodbc
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from dateutil.rrule import rrule, MONTHLY
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.optimize import minimize
from scipy.optimize import curve_fit
import shap
import numdifftools as nd
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
shap.initjs()
%load_ext autoreload
%autoreload 2
import mmm_transformations
import mmm_preprocessing
import mmm_feature_selection
import mmm_modeling
import mmm_response_curves
import mmm_optimization

# Data Pull

In [2]:
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=PRDINS10DB2;'
                      'Port=1433;')
sql_stmt = """SELECT [Insmed_HCP_Id], [PHYSICIAN_NPI], [CallDate_ym], MAX([PrimarySpecialty]) as Specialty, \n
                     count(distinct (case when Product in ('a0058000005VpNDAA0', 'a0058000005UplCAAS') AND Detail_Priority=1.0 then Veeva_call_Id end)) as P1_Arikayce, \n
                     count(distinct (case when Product in ('a0058000005VpNDAA0', 'a0058000005UplCAAS') AND Detail_Priority=2.0 then Veeva_call_Id end)) as P2_Arikayce, \n
                     count(distinct [PatientMasterID] ) as distinct_patients, \n
                     count(distinct [RX_NUMBER]) as rx_count \n
              FROM(SELECT a.[PatientMasterID], a.[PHYSICIAN_NPI], a.[DATE_SHIPPED], \n
                          a.RX_NUMBER, b.[InsmedID], b.[PrimarySpecialty], b.[NpiID], c.[Veeva_Call_ID], c.[Insmed_HCP_Id], \n
                          c.[Detail_Priority], c.[Product], c.[Insmed_Call_Type], c.[CallDate], LEFT(c.[CallDate], 7) AS CallDate_ym \n
                   FROM [Insmed_Adhoc].[dbo].[tblConsolidated_Dispense] a \n
                   FULL OUTER JOIN [Insmed_Adhoc].[dbo].[tblDF_Shyft_dimHCP_Adhoc] b ON A.[PHYSICIAN_NPI] = b.NpiID \n
                   FULL OUTER JOIN [Insmed_Adhoc].[dbo].[tblDF_Shyft_Call_Adhoc] c ON b.[InsmedID]= c.[Insmed_HCP_Id] \n
                   WHERE YEAR(a.[DATE_SHIPPED]) =  YEAR(c.[CallDate]) AND MONTH(a.DATE_SHIPPED) = MONTH(c.[CallDate]) \n
                   and c.Insmed_Call_Type in ('HCP and Staff','HCP Only','In Person','Live – HCP and Staff',\n
                                              'Live – HCP Only','Phone Call','Phone Call w/HCP','Phone Call w/Staff',\n
                                              'Virtual – HCP Only','Virtual Interaction')
                   and a.DISPENSE_TYPE='Product' and a.DISPENSE_STATUS not in ('V','R')) as SUBQUERY
              GROUP BY [Insmed_HCP_Id], [PHYSICIAN_NPI], [CallDate_ym]
              ORDER BY [Insmed_HCP_Id], [CallDate_ym]"""
df = pd.read_sql(sql_stmt, conn)
df

Unnamed: 0,Insmed_HCP_Id,PHYSICIAN_NPI,CallDate_ym,Specialty,P1_Arikayce,P2_Arikayce,distinct_patients,rx_count
0,INSP20000000,1932322948,2018-12,PUD,1,0,2,2
1,INSP20000000,1932322948,2019-02,PUD,1,0,2,2
2,INSP20000000,1932322948,2019-06,PUD,1,0,2,2
3,INSP20000000,1932322948,2019-10,PUD,1,0,6,6
4,INSP20000000,1932322948,2020-06,PUD,1,0,3,3
...,...,...,...,...,...,...,...,...
16845,INSP20134388,1144746934,2023-02,ID,0,0,1,1
16846,INSP20134500,1720513641,2023-02,ID,1,0,1,1
16847,INSP20134876,1912038944,2023-01,PHM,0,0,1,1
16848,INSP20135374,1518345289,2023-01,IMD,1,0,1,1


# Preprocessing

In [3]:
group_by = 'Specialty'
segment = 'Specialty'
date = 'CallDate_ym'
media_cols = ['P1_Arikayce', 'P2_Arikayce']
dv = 'rx_count'
preprocess = mmm_preprocessing.MMMPreprocessing(group_by, segment, date, media_cols, dv)

In [8]:
df_pp = df[[segment, date, dv] + media_cols].groupby([segment, date]).sum().reset_index().sort_values([segment, date])
df_pp = preprocess.cleanup(df_pp)
df_pp = df_pp[df_pp['CallDate_ym']<='2023-01']
df_pp

Unnamed: 0,Specialty,CallDate_ym,rx_count,P1_Arikayce,P2_Arikayce,Specialty_AC,Specialty_ADU,Specialty_CCE,Specialty_CCM,Specialty_EM,Specialty_FM,Specialty_GP,Specialty_HOS,Specialty_ID,Specialty_IM,Specialty_Other,Specialty_PCC,Specialty_PDP,Specialty_PUD
0,AC,2018-10,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,AC,2018-11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,AC,2018-12,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,AC,2019-01,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,AC,2019-02,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,PUD,2022-09,143,94,13,0,0,0,0,0,0,0,0,0,0,0,0,0,1
737,PUD,2022-10,200,120,18,0,0,0,0,0,0,0,0,0,0,0,0,0,1
738,PUD,2022-11,185,117,14,0,0,0,0,0,0,0,0,0,0,0,0,0,1
739,PUD,2022-12,162,96,10,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [9]:
df_pp[['Specialty', 'rx_count', 'P1_Arikayce', 'P2_Arikayce']].groupby('Specialty').mean().reset_index()

Unnamed: 0,Specialty,rx_count,P1_Arikayce,P2_Arikayce
0,AC,0.634615,0.384615,0.057692
1,ADU,3.346154,2.25,0.038462
2,CCE,1.942308,1.038462,0.038462
3,CCM,29.057692,16.461538,0.365385
4,EM,0.75,0.461538,0.019231
5,FM,6.461538,5.153846,0.403846
6,GP,6.538462,2.730769,0.038462
7,HOS,0.711538,0.673077,0.0
8,ID,250.634615,166.423077,6.942308
9,IM,33.615385,20.557692,0.615385


# Transformations

In [10]:
transform = mmm_transformations.MMMTransformations()

In [11]:
df_t = transform.lag_dv(df_pp, 'P1_Arikayce', 3, 'Specialty')
df_t = transform.lag_dv(df_t, 'P2_Arikayce', 3, 'Specialty')
df_t

Unnamed: 0,Specialty,CallDate_ym,rx_count,P1_Arikayce,P2_Arikayce,Specialty_AC,Specialty_ADU,Specialty_CCE,Specialty_CCM,Specialty_EM,...,Specialty_Other,Specialty_PCC,Specialty_PDP,Specialty_PUD,P1_Arikayce_lag1,P1_Arikayce_lag2,P1_Arikayce_lag3,P2_Arikayce_lag1,P2_Arikayce_lag2,P2_Arikayce_lag3
0,AC,2018-10,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AC,2018-11,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AC,2018-12,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AC,2019-01,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AC,2019-02,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,PUD,2022-09,143,94,13,0,0,0,0,0,...,0,0,0,1,139,82,74,9,16,5
737,PUD,2022-10,200,120,18,0,0,0,0,0,...,0,0,0,1,94,139,82,13,9,16
738,PUD,2022-11,185,117,14,0,0,0,0,0,...,0,0,0,1,120,94,139,18,13,9
739,PUD,2022-12,162,96,10,0,0,0,0,0,...,0,0,0,1,117,120,94,14,18,13


# Final Model Fitting

In [12]:
modeling = mmm_modeling.MMMModeling()

In [13]:
# modeling
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_channels = [x for x in df_t.columns if 'lag' in x]
X = df_t[channels + lagged_channels + specialties]
y = df_t['rx_count']
model = modeling.rf_regressor(df_t, X.columns.tolist(), 'rx_count', 'CallDate_ym')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
# performance
model['performance']

{'full': {'r2': 0.9964602082176761,
  'rmse': 4.546283268104926,
  'mape': 292570675605882.4},
 'train': {'r2': 0.9958068948605274,
  'rmse': 4.882367775537863,
  'mape': 240095387664515.25},
 'test': {'r2': 0.9818245237841435,
  'rmse': 10.85604918962073,
  'mape': 421997658274874.2}}

In [15]:
# importance
model['importance']

Unnamed: 0,feature,importance,std
3,P1_Arikayce_lag2,0.465348,0.450093
0,P1_Arikayce,0.210702,0.358655
2,P1_Arikayce_lag1,0.169238,0.347524
4,P1_Arikayce_lag3,0.107211,0.282092
16,Specialty_ID,0.021741,0.017725
21,Specialty_PUD,0.012585,0.017212
1,P2_Arikayce,0.004027,0.002822
6,P2_Arikayce_lag2,0.003009,0.002964
5,P2_Arikayce_lag1,0.001879,0.001577
11,Specialty_CCM,0.001669,0.000916


In [22]:
# scatter plot of predictions
plot_df = pd.DataFrame({'date': model['df_preds_test']['CallDate_ym'].astype(str),
                        'preds': model['df_preds_test']['preds_test'],
                        'actual': model['df_preds_test']['rx_count']})
fig = px.scatter(plot_df, x="date", y=plot_df.columns.tolist(), title="Future Hold Out Set")
fig.show()

In [23]:
# scatter plot of predictions
plot_df = pd.DataFrame({'date': model['df_preds_full']['CallDate_ym'].astype(str),
                        'preds': model['df_preds_full']['preds_full'],
                        'actual': model['df_preds_full']['rx_count']})
fig = px.scatter(plot_df, x="date", y=plot_df.columns.tolist(), title="Full Data - Model trained on full data")
fig.show()

# Response Curves - Isolated Impact

In [51]:
response_curves = mmm_response_curves.MMMResponseCurves()

In [35]:
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_channels = [x for x in df_t.columns if 'lag' in x]
X = df_t[channels + lagged_channels + specialties + ['Specialty', 'CallDate_ym']]
X[channels + lagged_channels] = 0
p1_lags = [x for x in lagged_channels if 'P1' in x]
p2_lags = [x for x in lagged_channels if 'P2' in x]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [36]:
# overall response curves
channel1 = response_curves.responses('overall', model['full_model'], X, 'P1_Arikayce', 200, 1, 'Specialty', 'CallDate_ym', p1_lags)
channel2 = response_curves.responses('overall', model['full_model'], X, 'P2_Arikayce', 200, 1, 'Specialty', 'CallDate_ym', p2_lags)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

In [37]:
response_curves.plot(channel1['resp_df'], 'touches', ['P1_Arikayce', 'P1_Arikayce_hill_estimate'])

In [38]:
response_curves.plot(channel2['resp_df'], 'touches', ['P2_Arikayce', 'P2_Arikayce_hill_estimate'])

In [39]:
overall_resp = pd.concat([channel1['resp_df'], channel2['resp_df'].drop(['touches'], axis=1)], axis=1)
fig = response_curves.plot(overall_resp, 'touches', ['P1_Arikayce', 'P2_Arikayce'])
fig

In [42]:
channel1_segment = response_curves.responses('segment', model['full_model'], X, 'P1_Arikayce', 200, 1, 'Specialty', 'CallDate_ym', p1_lags)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

In [43]:
channel1_segment['fig_raw']

In [44]:
channel1_segment['fig_hill']

In [45]:
channel2_segment = response_curves.responses('segment', model['full_model'], X, 'P2_Arikayce', 200, 1, 'Specialty', 'CallDate_ym',p2_lags)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

In [46]:
channel2_segment['fig_raw']

In [47]:
channel2_segment['fig_hill']

In [48]:
plot_df = channel2_segment['resp_df'].copy()
cols = [x for x in plot_df.columns if ('P2_Arikayce' in x) and ('hill' not in x)]
plot_df = plot_df[plot_df['touches'] <= 20]
fig = response_curves.plot(plot_df, 'touches', cols)
fig

In [49]:
plot_df = channel2_segment['resp_df'].copy()
cols = [x for x in plot_df.columns if x.endswith('hill_estimate')]
plot_df = plot_df[plot_df['touches'] <= 20]
fig = response_curves.plot(plot_df, 'touches', cols)
fig

# Responses - Simulating segments

In [52]:
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_channels = [x for x in df_t.columns if 'lag' in x]
X = df_t[channels + lagged_channels + specialties + ['Specialty']]
X[channels + lagged_channels + specialties] = 0
p1_lags = [x for x in lagged_channels if 'P1' in x]
p2_lags = [x for x in lagged_channels if 'P2' in x]

channel1_v2 = response_curves.responses_segment(model['full_model'], X, 'P1_Arikayce', 200, 1, specialties, 'Specialty', p1_lags)

In [53]:
channel1_v2['fig_raw']

In [54]:
channel1_v2['fig_hill']

In [55]:
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_channels = [x for x in df_t.columns if 'lag' in x]
X = df_t[channels + lagged_channels + specialties + ['Specialty']]
X[channels + lagged_channels + specialties] = 0
p1_lags = [x for x in lagged_channels if 'P1' in x]
p2_lags = [x for x in lagged_channels if 'P2' in x]

channel2_v2 = response_curves.responses_segment(model['full_model'], X, 'P2_Arikayce', 200, 1, specialties, 'Specialty', p2_lags)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [56]:
channel2_v2['fig_raw']

In [57]:
channel2_v2['fig_hill']

# Budgeting - Overall

In [58]:
channel1['optimal_hill']

array([202.10722837,  52.15527887,   3.83040915])

In [59]:
channel2['optimal_hill']

array([1.55443705, 0.11107429, 1.52457067])

In [60]:
overall_hill = pd.concat([pd.DataFrame(channel1['optimal_hill']).T, pd.DataFrame(channel2['optimal_hill']).T])
overall_hill['feature'] = channels
overall_hill.columns = ['beta', 'alpha', 'gamma', 'feature']
overall_hill

Unnamed: 0,beta,alpha,gamma,feature
0,202.107228,52.155279,3.830409,P1_Arikayce
0,1.554437,0.111074,1.524571,P2_Arikayce


In [61]:
optimization = mmm_optimization.MMMOptimization(budget=368, params=overall_hill)
start_vals = [110, 5]
output = optimization.optimize_hill(start_vals)
output

     fun: -203.53720671885486
     jac: array([-0.00128174, -0.00086975])
 message: 'Optimization terminated successfully'
    nfev: 39
     nit: 13
    njev: 13
  status: 0
 success: True
       x: array([361.92174074,   6.07825926])


# Budgeting - Channel 1

In [62]:
channel1_segment['optimal_hill']

{'AC_P1_Arikayce': array([203.60729296,  52.13593401,   3.68870388]),
 'ADU_P1_Arikayce': array([203.31788332,  52.17365744,   3.7022898 ]),
 'CCE_P1_Arikayce': array([202.64609905,  52.26129895,   3.73420135]),
 'CCM_P1_Arikayce': array([186.78126322,  54.5472815 ,   4.54199472]),
 'EM_P1_Arikayce': array([203.67511217,  52.12711022,   3.68552244]),
 'FM_P1_Arikayce': array([202.05711152,  52.48043083,   3.98136716]),
 'GP_P1_Arikayce': array([201.01270573,  52.47590292,   3.81305948]),
 'HOS_P1_Arikayce': array([203.3794836 ,  52.16560071,   3.69940482]),
 'ID_P1_Arikayce': array([230.23750512,  49.82715017,   5.61097429]),
 'IM_P1_Arikayce': array([205.49226394,  51.57644269,   3.20962264]),
 'Other_P1_Arikayce': array([203.11562983,  52.1983535 ,   3.71539885]),
 'PCC_P1_Arikayce': array([200.63540698,  53.10644524,   4.28541733]),
 'PDP_P1_Arikayce': array([203.62066822,  52.13411446,   3.68815725]),
 'PUD_P1_Arikayce': array([209.90086784,  61.47688347,   1.99884505])}

In [63]:
params = pd.DataFrame(channel1_segment['optimal_hill']).T
params.columns = ['beta','alpha','gamma']
optimization = mmm_optimization.MMMOptimization(budget=354, params=params)
start_vals = [1,1,1,110,1,1,1,1,110,110,1,110,1,1]
output = optimization.optimize_hill(start_vals)
output

     fun: -745.7984099523067
     jac: array([ 0.        ,  0.        ,  0.        , -0.82597351,  0.        ,
        0.        ,  0.        ,  0.        , -0.82539368, -0.82542419,
        0.        , -0.8249054 ,  0.        ,  0.        ])
 message: 'Optimization terminated successfully'
    nfev: 195
     nit: 13
    njev: 13
  status: 0
 success: True
       x: array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 8.93200590e+01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       8.24849654e+01, 9.23883274e+01, 3.56442299e-17, 8.98066481e+01,
       0.00000000e+00, 0.00000000e+00])



divide by zero encountered in double_scalars



# Budgeting - Channel 2

In [64]:
channel2_segment['optimal_hill']

{'AC_P2_Arikayce': array([ 0.94715711,  0.28255352, 15.07851204]),
 'ADU_P2_Arikayce': array([8.31873016e-01, 1.49139341e-04, 2.17752862e+00]),
 'CCE_P2_Arikayce': array([ 2.73081957,  0.42670747, 23.47710349]),
 'CCM_P2_Arikayce': array([1.4477999 , 1.0514508 , 2.05890935]),
 'EM_P2_Arikayce': array([ 1.15339339,  0.47095019, 24.79447163]),
 'FM_P2_Arikayce': array([ 1.04100404,  0.44454423, 23.08834658]),
 'GP_P2_Arikayce': array([ 5.68      ,  0.55968321, 36.1539159 ]),
 'HOS_P2_Arikayce': array([ 0.98733982,  0.40821515, 21.00660896]),
 'ID_P2_Arikayce': array([ 1.16796186,  0.47297728, 24.95319441]),
 'IM_P2_Arikayce': array([ 1.16796186,  0.47297728, 24.95319441]),
 'Other_P2_Arikayce': array([ 1.28555159,  0.48558865, 25.85927011]),
 'PCC_P2_Arikayce': array([ 1.0170795 ,  0.43268773, 22.41747673]),
 'PDP_P2_Arikayce': array([ 1.1351337 ,  0.46815868, 24.57955409]),
 'PUD_P2_Arikayce': array([ 1.16796186,  0.47297728, 24.95319441])}

In [65]:
params = pd.DataFrame(channel2_segment['optimal_hill']).T
params.columns = ['beta','alpha','gamma']
optimization = mmm_optimization.MMMOptimization(budget=14, params=params)
start_vals = [1,1,1,4,1,1,1,1,1,4,1,1,1,4]
output = optimization.optimize_hill(start_vals)
output

     fun: -21.7223052546182
     jac: array([-0.01083851, -0.0117979 , -0.01056767, -0.01115417, -0.01112509,
       -0.01128578, -0.01012707, -0.01142502, -0.01117182, -0.01119232,
       -0.01126671, -0.01114178, -0.01114106, -0.01119256])
 message: 'Optimization terminated successfully'
    nfev: 1180
     nit: 75
    njev: 75
  status: 0
 success: True
       x: array([0.47730736, 0.01163613, 0.62996786, 6.3252891 , 0.65749371,
       0.63208213, 0.74193889, 0.59834268, 0.65942455, 0.6595728 ,
       0.67168395, 0.62107262, 0.65461532, 0.65957289])



overflow encountered in double_scalars



# Budgeting Overall - Raw

In [68]:
optimization = mmm_optimization.MMMOptimization(budget=368)
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_channels = [x for x in df_t.columns if 'lag' in x]
X = df_t[channels + lagged_channels + specialties]
output = optimization.optimize_predict(X, channels, 1000, model['full_model'])
output

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:39<00:00, 10.04trial/s, best loss: -113.85387362637363]


{'mix': {'P1_Arikayce': 266, 'P2_Arikayce': 12},
 'trials': [{'loss': -106.85899725274724, 'status': 'ok'},
  {'loss': -113.1979532967033, 'status': 'ok'},
  {'loss': -113.16458791208792, 'status': 'ok'},
  {'loss': -91.3567857142857, 'status': 'ok'},
  {'loss': -113.17251373626374, 'status': 'ok'},
  {'loss': -113.1979532967033, 'status': 'ok'},
  {'loss': -110.49739010989009, 'status': 'ok'},
  {'loss': -110.47364010989011, 'status': 'ok'},
  {'loss': -104.2478434065934, 'status': 'ok'},
  {'loss': -74.81375, 'status': 'ok'},
  {'loss': -103.49776098901098, 'status': 'ok'},
  {'loss': -113.1979532967033, 'status': 'ok'},
  {'loss': -45.979354395604396, 'status': 'ok'},
  {'loss': -85.11623626373625, 'status': 'ok'},
  {'loss': -113.1979532967033, 'status': 'ok'},
  {'loss': -42.51254120879121, 'status': 'ok'},
  {'loss': -53.15375, 'status': 'ok'},
  {'loss': -41.924079670329675, 'status': 'ok'},
  {'loss': -113.16153846153847, 'status': 'ok'},
  {'loss': -113.16458791208792, 'status

# Budgeting Segment 1 - Raw

In [135]:
optimization = mmm_optimization.MMMOptimization(budget=368)
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
lagged_dv = [x for x in df_t.columns if 'lag' in x]
X = df_t[channels + lagged_dv + specialties]
X[specialties] = 0
X['Specialty_ID'] = 1
output = optimization.optimize_predict(X, channels, 1000, model['full_model'], lagged_channels)
output

  0%|                                                                                                                           | 0/1000 [00:00<?, ?trial/s, best loss=?]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:37<00:00, 10.29trial/s, best loss: -6515951857.917404]


{'mix': {'P1_Arikayce': 299, 'P2_Arikayce': 13},
 'trials': [{'loss': -5545475219.270521, 'status': 'ok'},
  {'loss': -5334877693.903969, 'status': 'ok'},
  {'loss': -6170488949.28412, 'status': 'ok'},
  {'loss': -5864798018.275096, 'status': 'ok'},
  {'loss': -3798570177.215763, 'status': 'ok'},
  {'loss': -5559418900.449766, 'status': 'ok'},
  {'loss': -5881968347.773722, 'status': 'ok'},
  {'loss': -6504032901.992638, 'status': 'ok'},
  {'loss': -6504032901.992638, 'status': 'ok'},
  {'loss': -5810865326.50673, 'status': 'ok'},
  {'loss': -6175747190.98978, 'status': 'ok'},
  {'loss': -6504032901.992638, 'status': 'ok'},
  {'loss': -5051212861.575783, 'status': 'ok'},
  {'loss': -3672030320.4225535, 'status': 'ok'},
  {'loss': -6504032901.992638, 'status': 'ok'},
  {'loss': -5860358457.879931, 'status': 'ok'},
  {'loss': -6355234826.557541, 'status': 'ok'},
  {'loss': -6221230707.018462, 'status': 'ok'},
  {'loss': -6503292517.384656, 'status': 'ok'},
  {'loss': -5864798018.275096, 

# Insight

In [None]:
# Troubleshooting
    # Getting hyperopt to optimize mix <= budget
    # Segment level hyperopt mix
        # This should be trying to get interactions among channels within a specified specialty
        # But then how do we get optimal mix across segments within media channel
    # To include multiple segments we either need to build one hcp level model or segment level model one for each segment type
        # Problem with hcp level is data is too sparse
    # Response curves with interactions
    # the impact when broken out by segments is sometimes higher than overall - how can we rationalize this?\
    # simulating segments also changes response curves
    # corrected issue with lag dv
    # new media and demo features

In [67]:
# compared to the first model differences are
    # impact of media shifts towards lag dv features 
    # segment level insight changes
    # no adstock changes shape of curves