In [2]:
import pandas as pd
import pyodbc
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from dateutil.rrule import rrule, MONTHLY
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.optimize import minimize
from scipy.optimize import curve_fit
import shap
import numdifftools as nd
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
shap.initjs()
%load_ext autoreload
%autoreload 2
import mmm_transformations
import mmm_preprocessing
import mmm_modeling
import mmm_response_curves
import mmm_optimization

# Summary

In [223]:
# description of what this notebook does
# should be minimal code and just outputs

# Data Pull

In [2]:
# Open the ODBC connection used by the data-pull query.
#
# SQL_DATABASE selects the database that two-part names such as
# meas.tblFactSalesTransaction resolve against. With the default (None) the
# connection string is exactly the original one and the login's default
# database is used; the traceback saved under the read_sql cell
# (42S02 "Invalid object name 'meas.tblFactSalesTransaction'") is consistent
# with that default being the wrong database.
# NOTE(review): 'Port' does not appear to be a documented keyword of the
# "SQL Server" ODBC driver (a port is normally given as Server=host,port) - confirm.
SQL_DATABASE = None  # TODO confirm which database owns the `meas` schema and set it here

conn_str = ('Driver={SQL Server};'
            'Server=PRDSPPI10DB1;'
            'Port=1433;')
if SQL_DATABASE:
    conn_str += 'Database=' + SQL_DATABASE + ';'
conn = pyodbc.connect(conn_str)

In [3]:
# Data-pull query: one row per HCO / year / month / brand.
#   a - quantity: SUM(QuantityAdjusted) from meas.tblFactSalesTransaction (HCOId <> -1).
#   b - call counts per channel (COUNT DISTINCT CallId) from meas.tblFactCall,
#       restricted to rows with HCOId <> -1.
#   r - FULL OUTER JOIN of a and b; missing counts / quantity become 0 via ISNULL.
# r is inner-joined to MEAS.tblDimHCO, then left-joined to the IQVIA OneKey
# financial (fncl), total-bed (hco_bed_total) and pivoted business-detail
# (hco_bus) feeds, and to the active primary address for BaseZip.
# NOTE(review): b.BrandName comes from a LEFT JOIN to tblDimProduct and can be
#   NULL; "a.BrandName = b.BrandName" never matches NULLs, so those call rows
#   stay unmatched in the FULL OUTER JOIN - confirm this is intended.
# NOTE(review): fncl.*, hco_bed_total.* and hco_bus.* each return HCOSourceId
#   and HCO_HCE_ID, so the result set carries duplicate column names.
sql_stmt = """SELECT	r.HCOId,
		hcoAddr.BaseZip AS HCOZip,
		r.MonthDesc,
		r.YearDesc,
		r.BrandName,
		r.Emails,
		r.Phones,
		r.FTF,
		r.Virtual,
		r.Mssg,
		r.Other,
		r.Quantity,
		fncl.*,
		hco_bed_total.*,
		hco_bus.*

FROM (
SELECT	ISNULL(a.HCOId, b.HCOId) AS HCOId,
		ISNULL(a.YearDesc, b.YearDesc) AS YearDesc,
		ISNULL(a.MonthDesc, b.MonthDesc) AS MonthDesc,
		ISNULL(a.BrandName, b.BrandName) AS BrandName,
		ISNULL(b.Emails, 0) AS Emails,
		ISNULL(b.Phones, 0) AS Phones,
		ISNULL(b.FTF, 0) AS FTF,
		ISNULL(b.Virtual, 0) AS Virtual,
		ISNULL(b.Mssg, 0) AS Mssg,
		ISNULL(b.Other, 0) AS Other,
		ISNULL(a.Quantity, 0) AS Quantity 
FROM (
	SELECT	s.HCOId, 
		d.YearDesc, 
		d.MonthDesc,
		p.BrandName,
		SUM(s.QuantityAdjusted) AS Quantity		
		FROM meas.tblFactSalesTransaction s
		INNER JOIN meas.tblDimDate d ON s.SalesTransactionDateId = d.DateId
		INNER JOIN	meas.tblDimProduct p	ON	s.ProductId = p.ProductId
		WHERE s.HCOId <> -1
		GROUP BY s.HCOId, 
				d.YearDesc, 
				d.MonthDesc,
				p.BrandName
) a
/* ProductId =1 (Rolvedon). All ProductId in FasctSalesTrasaction equals to 1 
 FactSalesTransaction (hco-date level) has 2819 rows and
 a (hco-month level) has 945 rows*/ 


FULL OUTER JOIN (
SELECT
    fc.HCOId, 
    d.YearDesc,
    d.MonthDesc,
    p.BrandName,
    COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Email_vod' THEN fc.CallId END) AS Emails,
    COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Phone_vod' THEN fc.CallId END) AS Phones,
    COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Face_to_face_vod' THEN fc.CallId END) AS FTF,
    COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Video_vod' THEN fc.CallId END) AS Virtual,
    COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Message_vod' THEN fc.CallId END) AS Mssg,
    COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Other_vod' THEN fc.CallId END) AS Other
FROM
    (
        SELECT * 
		/*
            CASE
                WHEN HCPId <> -1 THEN HCPId
                ELSE HCOId
            END AS ChildId,
			CASE 
				WHEN HCPId <> -1 THEN 1
				ELSE 2
			END AS AffiliationTypeId */
        FROM meas.tblFactCall
		WHERE HCOId <> -1
/* It is confirmed that FactCAll is either HCPId or HCOId level 
HCOId = -1 has 8173 rows; HCO<>-1 has 5685. tblFactCall has a total of 13858 rows*/         
    ) fc
/*LEFT JOIN
    meas.tblDimAffiliation da ON fc.ChildId = da.ChildId AND fc.AffiliationTypeId=da.AffiliationTypeId and da.[Primary]=1 and da.Active=1
	 type=1, HCP-HCO, type=2, HCO-HCO  */

INNER JOIN
    dbo.tblDimCallChannel cc ON fc.CallChannelId = cc.CallChannelId
INNER JOIN
    meas.tblDimDate d ON fc.CallDateId = d.DateId
INNER JOIN
    meas.tblFactCallDetail fcd ON fc.CallSourceId = fcd.CallSourceId
LEFT JOIN
    meas.tblDimProduct p ON fcd.ProductId = p.ProductId
GROUP BY
    fc.HCOId,
    d.YearDesc,
    d.MonthDesc,
    p.BrandName
) b 
/*FactCall has 13858 (date-level) rows and b (month level) has 2885 rows*/
ON a.HCOId = b.HCOId AND a.YearDesc = b.YearDesc
	AND a.MonthDesc = b.MonthDesc
	AND a.BrandName = b.BrandName
) r
INNER JOIN MEAS.tblDimHCO hco ON r.HCOId = hco.HCOId
LEFT OUTER JOIN 
(SELECT TPSEntityId as HCOSourceId, iq_hco_fncl.*
FROM [SPECTRUM_ADHOC].[dbo].[tblDataFeed_IQVIA_OneKey_HCO_FNCL_FACT] iq_hco_fncl
--tblDataFeed_IQVIA_OneKey_HCO_FNCL_FACT has 4149 rows
INNER JOIN (
    SELECT *
    FROM [SPECTRUM_CM].[CM].[tblOutletAlternateId]
    WHERE TPSDataFeedId = 6
) oa ON iq_hco_fncl.HCO_HCE_ID = oa.DataProviderUniqueIdentifier COLLATE SQL_Latin1_General_CP1_CI_AS
-- "SQL_Latin1_General_CP1_CI_AS" indicates to read collumn via "class insensitive"
  ) fncl ON hco.HCOSourceID = fncl.HCOSourceId 
--fncl only has 4149 rows

LEFT OUTER JOIN 
(SELECT TPSEntityId as HCOSourceId, bed.*
FROM [SPECTRUM_ADHOC].[dbo].[tblDataFeed_IQVIA_OneKey_HCO_BED_FACT] bed 
INNER JOIN (
    SELECT *
    FROM [SPECTRUM_CM].[CM].[tblOutletAlternateId]
    WHERE TPSDataFeedId = 6
) oa ON bed.HCO_HCE_ID = oa.DataProviderUniqueIdentifier COLLATE SQL_Latin1_General_CP1_CI_AS
WHERE BED_DESC = 'Total' 
  ) hco_bed_total ON hco.HCOSourceID = hco_bed_total.HCOSourceId 
 --hco_bed_total has 4725 rows


LEFT OUTER JOIN 
(SELECT TPSEntityId as HCOSourceId, bus.* FROM (SELECT
    HCO_HCE_ID,
    MAX(CASE WHEN DETL_CD = 'avgoccrate' THEN DETL_NBR END) AS avgoccrate,
    MAX(CASE WHEN DETL_CD = 'routineday' THEN DETL_NBR END) AS routineday,
    MAX(CASE WHEN DETL_CD = 'H_pdays' THEN DETL_NBR END) AS H_pdays,
    MAX(CASE WHEN DETL_CD = 'avglos' THEN DETL_NBR END) AS avglos,
    MAX(CASE WHEN DETL_CD = 'mparmix' THEN DETL_NBR END) AS mparmix,
    MAX(CASE WHEN DETL_CD = 'mcaredrg' THEN DETL_NBR END) AS mcaredrg,
    MAX(CASE WHEN DETL_CD = 'ttldrg' THEN DETL_NBR END) AS ttldrg,
    MAX(CASE WHEN DETL_CD = 'MCR_alos' THEN DETL_NBR END) AS MCR_alos,
    MAX(CASE WHEN DETL_CD = 'hdrg' THEN DETL_NBR END) AS hdrg,
    MAX(CASE WHEN DETL_CD = 'MCR_pdays' THEN DETL_NBR END) AS MCR_pdays
FROM
    [SPECTRUM_ADHOC].[dbo].[tblDataFeed_IQVIA_OneKey_HCO_BUS_DETL_FACT]
GROUP BY
    HCO_HCE_ID
HAVING
    COALESCE(MAX(CASE WHEN DETL_CD = 'avgoccrate' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'routineday' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'H_pdays' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'avglos' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'mparmix' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'mcaredrg' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'ttldrg' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'MCR_alos' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'hdrg' THEN DETL_NBR END),
             MAX(CASE WHEN DETL_CD = 'MCR_pdays' THEN DETL_NBR END)) IS NOT NULL
) bus
INNER JOIN (
    SELECT *
    FROM [SPECTRUM_CM].[CM].[tblOutletAlternateId]
    WHERE TPSDataFeedId = 6
) oa ON bus.HCO_HCE_ID = oa.DataProviderUniqueIdentifier COLLATE SQL_Latin1_General_CP1_CI_AS
) hco_bus ON hco.HCOSourceID = hco_bus.HCOSourceId 
--hco_bus has 4145 rows 

LEFT OUTER JOIN MEAS.tblDimEntityAddress hcoEntAddr
	ON hco.HCOId = hcoEntAddr.EntityId 
	AND hcoEntAddr.EntityType = 'HCO'
	AND hcoEntAddr.Active = 1
	AND hcoEntAddr.IsPrimary = 1
LEFT OUTER JOIN MEAS.tblDimAddress hcoAddr
	ON hcoEntAddr.AddressId = hcoAddr.AddressId
	AND hcoAddr.Active = 1
ORDER BY r.YearDesc DESC, r.MonthDesc DESC

/* MEAS.tblDimHCO has 53747 distinct id 
using the latest b query, the output of this query is now 8182 rows. (0719)
using the latest b query (consider only hco), the output of this query is now 3845 rows
using the latest b query (consider only hco) + fncl, the output of this query is now 290 rows */ 
"""

In [4]:
# Run the data-pull query over the open connection and show the resulting frame.
df = pd.read_sql(sql=sql_stmt, con=conn)
df

DatabaseError: Execution failed on sql 'SELECT r.HCOId,
                     hcoAddr.BaseZip AS HCOZip,
                     r.MonthDesc,
                     r.YearDesc,
                     r.ProductGroupName,
                     r.Emails,
                     r.Phones,
                     r.FTF,
                     r.Virtual,
                     r.Mssg,
                     r.Other,
                     r.Quantity

              FROM (
              SELECT  ISNULL(a.HCOId, b.HCOId) AS HCOId,
                      ISNULL(a.YearDesc, b.YearDesc) AS YearDesc,
                      ISNULL(a.MonthDesc, b.MonthDesc) AS MonthDesc,
                      ISNULL(a.ProductGroupName, b.ProductGroupName) AS ProductGroupName,
                      ISNULL(b.Emails, 0) AS Emails,
                      ISNULL(b.Phones, 0) AS Phones,
                      ISNULL(b.FTF, 0) AS FTF,
                      ISNULL(b.Virtual, 0) AS Virtual,
                      ISNULL(b.Mssg, 0) AS Mssg,
                      ISNULL(b.Other, 0) AS Other,
                      ISNULL(a.Quantity, 0) AS Quantity 
              FROM (
                  SELECT  r.HCOId,
                          r.YearDesc,
                          r.MonthDesc,
                          r.ProductGroupName,
                          SUM(r.Quantity) AS Quantity
                  FROM (
                      SELECT s.HCOId, 
                      d.YearDesc, 
                      d.MonthDesc,
                      p.ProductGroupName,
                      SUM(s.QuantityAdjusted) AS Quantity
                      FROM meas.tblFactSalesTransaction s
                      INNER JOIN meas.tblDimDate d ON s.SalesTransactionDateId = d.DateId
                      INNER JOIN meas.vwEZDimProduct p on s.ProductId = p.ProductId and p.IsPrimary = 1
                      WHERE s.HCOId <> -1 AND p.ProductId = 1
                      GROUP BY s.HCOId, 
                              d.YearDesc, 
                              d.MonthDesc,
                              p.ProductGroupName
              
                  ) r
                  GROUP BY r.HCOId, 
                              r.YearDesc, 
                              r.MonthDesc,
                              r.ProductGroupName
              ) a
              FULL OUTER JOIN (
                  SELECT
                          aff.ParentId as HCOId, 
                          d.YearDesc, 
                          d.MonthDesc,
                          p.ProductGroupName,
                          COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Email_vod' THEN fc.CallId END ) AS Emails,
                          COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Phone_vod' THEN fc.CallId END ) AS Phones,
                          COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Face_to_face_vod' THEN fc.CallId END ) AS FTF,
                          COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Video_vod' THEN fc.CallId END ) AS Virtual, 
                          COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Message_vod' THEN fc.CallId END ) AS Mssg,
                          COUNT(DISTINCT CASE WHEN cc.CallChannelName = 'Other_vod' THEN fc.CallId END ) AS Other 
                  FROM meas.tblFactCall fc
                  INNER JOIN dbo.tblDimCallChannel cc 
                  ON fc.CallChannelId = cc.CallChannelId
                  INNER JOIN meas.tblDimDate d ON fc.CallDateId = d.DateId
                  INNER JOIN meas.tblFactCallDetail fcd ON fc.CallSourceId = fcd.CallSourceId
                  LEFT JOIN	meas.vwEZDimProduct p ON fcd.ProductId = p.ProductId AND p.IsPrimary = 1
                  LEFT JOIN meas.tblDimAffiliation aff ON fc.HCPId = aff.ChildId 
                  WHERE fc.HCPId <> -1
                  GROUP BY aff.ParentId,
                          d.YearDesc, 
                          d.MonthDesc,
                          p.ProductGroupName
              ) b
              ON a.HCOId = b.HCOId AND a.YearDesc = b.YearDesc
                  AND a.MonthDesc = b.MonthDesc
                  AND a.ProductGroupName = b.ProductGroupName
              ) r
              INNER JOIN MEAS.tblDimHCO hco ON r.HCOId = hco.HCOId
              LEFT OUTER JOIN MEAS.tblDimEntityAddress hcoEntAddr
                  ON hco.HCOId = hcoEntAddr.EntityId 
                  AND hcoEntAddr.EntityType = 'HCO'
                  AND hcoEntAddr.Active = 1
                  AND hcoEntAddr.IsPrimary = 1
              LEFT OUTER JOIN MEAS.tblDimAddress hcoAddr
                  ON hcoEntAddr.AddressId = hcoAddr.AddressId
                  AND hcoAddr.Active = 1
              ORDER BY r.YearDesc DESC, r.MonthDesc DESC': ('42S02', "[42S02] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid object name 'meas.tblFactSalesTransaction'. (208) (SQLExecDirectW)")

# Preprocessing

In [3]:
# Load the prepared extract from disk; the preprocessing cells below start from this frame.
balanced_csv_path = 'spectrum_balanced.csv'
df = pd.read_csv(balanced_csv_path)
df

Unnamed: 0,SystemID,Date,BaseZip,SystemName,Emails,Phones,FTF,Virtual,Mssg,Other,...,avgoccrate,routineday,H_pdays,avglos,mparmix,mcaredrg,ttldrg,MCR_alos,hdrg,MCR_pdays
0,4769,2022-10,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
1,4769,2022-11,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,0.0,1.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
2,4769,2022-12,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,1.0,1.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
3,4769,2023-01,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",2.0,2.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
4,4769,2023-02,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",1.0,2.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8775,44280,2023-03,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
8776,44280,2023-04,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
8777,44280,2023-05,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
8778,44280,2023-06,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8


In [4]:
# Number of distinct SystemID values in the loaded frame.
np.unique(df['SystemID']).size

878

In [5]:
# Summary statistics, one row per numeric column (transposed for readability).
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SystemID,8780.0,28593.33,10051.22,4746.0,22865.0,27063.5,32535.0,54053.0
BaseZip,8510.0,54601.29,28494.12,1605.0,32503.0,54303.0,78550.0,99701.0
Emails,8780.0,0.3439636,1.332655,0.0,0.0,0.0,0.0,58.0
Phones,8780.0,0.1420273,0.6852133,0.0,0.0,0.0,0.0,20.0
FTF,8780.0,0.1178815,0.5612088,0.0,0.0,0.0,0.0,14.0
Virtual,8780.0,0.02562642,0.1882873,0.0,0.0,0.0,0.0,4.0
Mssg,8780.0,0.01343964,0.179358,0.0,0.0,0.0,0.0,6.0
Other,8780.0,0.07084282,0.3282527,0.0,0.0,0.0,0.0,6.0
Quantity_adj,8780.0,1.439294,24.06048,0.0,0.0,0.0,0.0,1614.0
DDS_IN_PD_CNT,8780.0,2468.131,3013.296,183.0,1456.0,2468.13,2468.13,42080.0


In [6]:
# Preprocessed frame: a copy of the raw data with lower-cased column names,
# so the loaded frame itself is left untouched.
lowered_names = df.columns.str.lower()
df_pp = df.copy()
df_pp.columns = lowered_names
df_pp

Unnamed: 0,systemid,date,basezip,systemname,emails,phones,ftf,virtual,mssg,other,...,avgoccrate,routineday,h_pdays,avglos,mparmix,mcaredrg,ttldrg,mcr_alos,hdrg,mcr_pdays
0,4769,2022-10,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
1,4769,2022-11,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,0.0,1.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
2,4769,2022-12,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,1.0,1.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
3,4769,2023-01,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",2.0,2.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
4,4769,2023-02,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",1.0,2.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8775,44280,2023-03,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
8776,44280,2023-04,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
8777,44280,2023-05,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
8778,44280,2023-06,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.72,232387.72,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8


# Transformations

In [67]:
# Pairwise correlations between the numeric columns only.
# The text columns (date, systemname) are excluded explicitly: DataFrame.corr()
# only skipped them implicitly in older pandas and raises on them in pandas >= 2.0.
df_pp.select_dtypes(include='number').corr()

Unnamed: 0,systemid,basezip,emails,phones,ftf,virtual,mssg,other,quantity_adj,dds_in_pd_cnt,...,avgoccrate,routineday,h_pdays,avglos,mparmix,mcaredrg,ttldrg,mcr_alos,hdrg,mcr_pdays
systemid,1.0,-0.043763,0.040585,-0.003854,-0.005933,0.018727,0.022427,0.000899,0.002510066,-0.00965,...,-0.012684,-0.004847,-0.004847,-0.011932,-0.010724,-0.005917,-0.006421,-0.012221,-0.006421,-0.005495
basezip,-0.043763,1.0,0.064302,-0.035111,0.008994,-0.025476,-0.026787,-0.05529,-0.01578954,-0.019945,...,-0.034348,-0.057696,-0.057696,-0.016496,-0.021777,-0.059342,-0.046678,-0.018225,-0.046678,-0.071538
emails,0.040585,0.064302,1.0,0.345419,0.388833,0.262664,0.045946,0.040375,0.1211873,0.053756,...,0.049353,0.037531,0.037531,0.030018,0.052461,0.042606,0.03768,0.0403,0.03768,0.041979
phones,-0.003854,-0.035111,0.345419,1.0,0.315172,0.169554,0.070664,0.001347,0.07865521,0.052043,...,0.040839,0.026769,0.026769,0.033545,0.057516,0.037824,0.027914,0.044735,0.027914,0.036122
ftf,-0.005933,0.008994,0.388833,0.315172,1.0,0.115857,0.069132,-0.005146,0.2995243,0.038774,...,0.034982,0.025,0.025,0.021496,0.038172,0.030603,0.025011,0.030556,0.025011,0.030338
virtual,0.018727,-0.025476,0.262664,0.169554,0.115857,1.0,0.030276,0.031442,0.05288126,0.041169,...,0.036004,0.032677,0.032677,0.018461,0.038982,0.036516,0.030669,0.025047,0.030669,0.036033
mssg,0.022427,-0.026787,0.045946,0.070664,0.069132,0.030276,1.0,-0.00263,0.007289471,-0.001199,...,-0.000701,-0.004596,-0.004596,-0.00356,-0.001024,-0.00296,-0.00352,-0.003611,-0.00352,-0.00411
other,0.000899,-0.05529,0.040375,0.001347,-0.005146,0.031442,-0.00263,1.0,-0.01151268,0.148585,...,0.14565,0.133758,0.133758,0.07666,0.141513,0.143625,0.139864,0.100131,0.139864,0.136486
quantity_adj,0.00251,-0.01579,0.121187,0.078655,0.299524,0.052881,0.007289,-0.011513,1.0,-8.5e-05,...,-6e-05,0.000177,0.000177,-0.000112,-0.000103,7.3e-05,0.000122,-0.000106,0.000122,0.000149
dds_in_pd_cnt,-0.00965,-0.019945,0.053756,0.052043,0.038774,0.041169,-0.001199,0.148585,-8.508558e-05,1.0,...,0.973564,0.862991,0.862991,0.715139,0.97284,0.885615,0.863585,0.856246,0.863585,0.882595


In [7]:
# Helper object from the project-local mmm_transformations module;
# the cells below call its winsorize() and lag_dv() methods.
transform = mmm_transformations.MMMTransformations()

In [8]:
# Summary statistics before any transformation, for comparison with the
# winsorized frame described in the next code cell.
df_pp.describe()

Unnamed: 0,systemid,basezip,emails,phones,ftf,virtual,mssg,other,quantity_adj,dds_in_pd_cnt,...,avgoccrate,routineday,h_pdays,avglos,mparmix,mcaredrg,ttldrg,mcr_alos,hdrg,mcr_pdays
count,8780.0,8510.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,...,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0
mean,28593.328018,54601.286722,0.343964,0.142027,0.117882,0.025626,0.01344,0.070843,1.439294,2468.13131,...,2.658833,232387.7,232387.7,37.749661,2.759348,14664.210626,57194.79,39.788235,57194.79,68911.8
std,10051.21836,28494.122385,1.332655,0.685213,0.561209,0.188287,0.179358,0.328253,24.060477,3013.296154,...,3.23579,278814.3,278814.3,63.987459,3.430046,17629.717248,71585.59,56.341959,71585.59,80013.17
min,4746.0,1605.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,183.0,...,0.0363,553.0,553.0,1.5228,0.0037,5.0,76.0,1.0,76.0,17.0
25%,22865.0,32503.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1456.0,...,1.4879,175900.0,175900.0,18.5219,1.4225,9998.0,42288.0,22.0,42288.0,48078.0
50%,27063.5,54303.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2468.13,...,2.66,232387.7,232387.7,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
75%,32535.0,78550.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2468.13,...,2.66,232387.7,232387.7,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
max,54053.0,99701.0,58.0,20.0,14.0,4.0,6.0,6.0,1614.0,42080.0,...,50.0498,5414923.0,5414923.0,1238.7125,49.5089,334041.0,1424002.0,1069.0,1424002.0,1466733.0


In [132]:
# Winsorize the target and every channel count.
# NOTE(review): 99.9 is presumably the upper percentile used as the cap - confirm in mmm_transformations.
winsor_columns = ['quantity_adj', 'emails', 'phones', 'ftf', 'virtual', 'mssg', 'other']
winsor_limit = 99.9
df_t = transform.winsorize(df_pp, winsor_columns, winsor_limit)
df_t.describe()

Unnamed: 0,systemid,basezip,emails,phones,ftf,virtual,mssg,other,quantity_adj,dds_in_pd_cnt,...,avgoccrate,routineday,h_pdays,avglos,mparmix,mcaredrg,ttldrg,mcr_alos,hdrg,mcr_pdays
count,8780.0,8510.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,...,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0
mean,28593.328018,54601.286722,0.336105,0.138383,0.113894,0.024715,0.011276,0.06959,1.130978,2468.13131,...,2.658833,232387.7,232387.7,37.749661,2.759348,14664.210626,57194.79,39.788235,57194.79,68911.8
std,10051.21836,28494.122385,1.148857,0.614113,0.495614,0.175254,0.130664,0.313438,11.39558,3013.296154,...,3.23579,278814.3,278814.3,63.987459,3.430046,17629.717248,71585.59,56.341959,71585.59,80013.17
min,4746.0,1605.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,183.0,...,0.0363,553.0,553.0,1.5228,0.0037,5.0,76.0,1.0,76.0,17.0
25%,22865.0,32503.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1456.0,...,1.4879,175900.0,175900.0,18.5219,1.4225,9998.0,42288.0,22.0,42288.0,48078.0
50%,27063.5,54303.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2468.13,...,2.66,232387.7,232387.7,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
75%,32535.0,78550.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2468.13,...,2.66,232387.7,232387.7,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
max,54053.0,99701.0,15.0,9.0,6.221,2.0,2.0,3.0,252.221,42080.0,...,50.0498,5414923.0,5414923.0,1238.7125,49.5089,334041.0,1424002.0,1069.0,1424002.0,1466733.0


In [115]:
#df_t = df_pp.copy()

In [110]:
# Persist the current df_t without the index column. In top-to-bottom order this
# runs before the lag columns are added in the next code cell, so the file holds
# the winsorized but un-lagged data.
df_t.to_csv('spectrum_balanced_winsorize.csv', index=False)

In [133]:
# Add lag features for the target and for every channel, one column family at a time.
# NOTE(review): the arguments 3 and 'systemid' are presumably the number of lags
# and the grouping key (the saved output shows *_lag1..*_lag3 columns) - confirm.
for lag_source in ['quantity_adj', 'emails', 'phones', 'ftf', 'virtual', 'mssg', 'other']:
    df_t = transform.lag_dv(df_t, lag_source, 3, 'systemid')
df_t

Unnamed: 0,systemid,date,basezip,systemname,emails,phones,ftf,virtual,mssg,other,...,ftf_lag3,virtual_lag1,virtual_lag2,virtual_lag3,mssg_lag1,mssg_lag2,mssg_lag3,other_lag1,other_lag2,other_lag3
0,4769,2022-10,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4769,2022-11,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4769,2022-12,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4769,2023-01,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",2.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4769,2023-02,29732.0,"CAROLINA BLOOD AND CANCER CARE ASSOCIATES, PA",1.0,2.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8775,44280,2023-03,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8776,44280,2023-04,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8777,44280,2023-05,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8778,44280,2023-06,78229.0,"SOUTH TEXAS ONCOLOGY & HEMATOLOGY, PA",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
# Pairwise correlations between the numeric columns (including the lag features).
# The text columns (date, systemname) are excluded explicitly: DataFrame.corr()
# only skipped them implicitly in older pandas and raises on them in pandas >= 2.0.
df_t.select_dtypes(include='number').corr()

Unnamed: 0,systemid,basezip,emails,phones,ftf,virtual,mssg,other,quantity_adj,dds_in_pd_cnt,...,ftf_lag3,virtual_lag1,virtual_lag2,virtual_lag3,mssg_lag1,mssg_lag2,mssg_lag3,other_lag1,other_lag2,other_lag3
systemid,1.000000,-0.043763,0.037912,-0.004015,-0.008886,0.019179,0.029247,0.000058,0.000115,-0.009650,...,-0.004148,0.014108,0.012320,0.014456,0.032976,0.034548,0.032095,0.002737,0.015993,0.019747
basezip,-0.043763,1.000000,0.076187,-0.035517,0.013940,-0.030617,-0.013541,-0.056003,-0.012282,-0.019945,...,0.030942,-0.026328,-0.023861,-0.027322,-0.008357,-0.005584,-0.001913,-0.056191,-0.066771,-0.067540
emails,0.037912,0.076187,1.000000,0.387262,0.358599,0.282910,0.074155,0.034682,0.171971,0.062356,...,0.235979,0.155933,0.143641,0.098248,0.068746,0.048191,0.096907,0.024154,0.021577,0.074092
phones,-0.004015,-0.035517,0.387262,1.000000,0.317963,0.171426,0.062886,0.003224,0.102287,0.058069,...,0.155250,0.114231,0.056955,0.067330,0.030826,0.017036,0.016439,0.011754,0.015131,0.023222
ftf,-0.008886,0.013940,0.358599,0.317963,1.000000,0.114316,0.067133,-0.009802,0.201242,0.043905,...,0.216584,0.069925,0.074945,0.031985,0.049675,0.026840,0.068419,0.002437,0.019967,0.012953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mssg_lag2,0.034548,-0.005584,0.048191,0.017036,0.026840,0.026657,0.388849,-0.005704,-0.000451,0.007872,...,0.041874,0.063926,0.079888,0.009939,0.391694,1.000000,0.347443,-0.007747,0.006960,-0.006903
mssg_lag3,0.032095,-0.001913,0.096907,0.016439,0.068419,0.038733,0.307427,0.004502,0.020571,0.002279,...,0.052894,0.036099,0.073115,0.091948,0.370274,0.347443,1.000000,-0.013755,-0.011882,0.014300
other_lag1,0.002737,-0.056191,0.024154,0.011754,0.002437,0.029660,-0.015007,0.193214,-0.008394,0.134683,...,0.016642,0.036804,0.024515,0.015405,0.005480,-0.007747,-0.013755,1.000000,0.221857,0.124460
other_lag2,0.015993,-0.066771,0.021577,0.015131,0.019967,-0.010890,-0.009042,0.105354,-0.014860,0.104748,...,0.016763,0.039157,0.045256,0.036159,-0.010812,0.006960,-0.011882,0.221857,1.000000,0.227803


# Final Model Fitting

In [21]:
# Helper object from the project-local mmm_modeling module; provides rf_regressor() used below.
modeling = mmm_modeling.MMMModeling()

In [134]:
# Feature groups for the random-forest fit.
channels = ['emails', 'phones', 'ftf', 'virtual', 'mssg', 'other']
# Lags of the target, then lags of the channels (any 'lag' column that is not a target lag).
lag_dv = [col for col in df_t.columns if 'quantity_adj_lag' in col]
lag_channels = [col for col in df_t.columns if 'lag' in col and 'quantity_adj' not in col]
# Everything that is not a channel, a lag, the target or an identifier column is non-media.
excluded_columns = channels + lag_dv + lag_channels + ['quantity_adj', 'systemid', 'date', 'basezip', 'systemname']
non_media = df_t.drop(columns=excluded_columns).columns.tolist()
# Alternative without the non-media controls:
#X = df_t[channels + lag_channels + lag_dv]
X = df_t[channels + lag_channels + lag_dv + non_media]
y = df_t['quantity_adj']
# The returned dict is indexed below with 'performance', 'importance' and 'full_model'.
model = modeling.rf_regressor(df_t, X.columns.tolist(), 'quantity_adj', 'date')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [135]:
# Fit metrics (r2 / rmse / mape) reported for the 'full', 'train' and 'test' keys.
# NOTE(review): in the saved output the test r2 is about -0.03 against about 0.79
# on train, i.e. the model does not generalise to the held-out data - investigate
# before relying on the importances or response curves below.
model['performance']

{'full': {'r2': 0.8149142924326336,
  'rmse': 4.902280325252752,
  'mape': 0.7611652541431498},
 'train': {'r2': 0.7945625436653243,
  'rmse': 5.005837636659457,
  'mape': 0.9562686897424938},
 'test': {'r2': -0.03079198784116821,
  'rmse': 12.865220762215548,
  'mape': 2.47601336027041}}

In [136]:
# Feature importance table (columns: feature, importance, std), first 50 rows.
model['importance'].head(50)

Unnamed: 0,feature,importance,std
26,quantity_adj_lag3,0.483867,0.160617
24,quantity_adj_lag1,0.169639,0.117612
25,quantity_adj_lag2,0.093078,0.125739
2,ftf,0.032168,0.036563
7,emails_lag2,0.026398,0.037081
0,emails,0.024597,0.030972
12,ftf_lag1,0.024151,0.030866
1,phones,0.020396,0.03016
6,emails_lag1,0.020264,0.022501
11,phones_lag3,0.015334,0.024211


In [65]:
# Export the feature importance table to imp.csv, without the index column.
model['importance'].to_csv('imp.csv', index=False)

# Response Curves

In [28]:
# Helper object from the project-local mmm_response_curves module;
# provides responses() and plot() used in the cells below.
response_curves = mmm_response_curves.MMMResponseCurves()

In [140]:
# Rebuild the model's feature matrix from the current df_t
# (same column groups as in the model-fitting cell).
channels = ['emails', 'phones', 'ftf', 'virtual', 'mssg', 'other']
lag_dv = [col for col in df_t.columns if 'quantity_adj_lag' in col]
lag_channels = [col for col in df_t.columns if 'lag' in col and 'quantity_adj' not in col]
excluded_columns = channels + lag_dv + lag_channels + ['quantity_adj', 'systemid', 'date', 'basezip', 'systemname']
non_media = df_t.drop(columns=excluded_columns).columns.tolist()
X = df_t[channels + lag_channels + lag_dv + non_media]

In [94]:
# Sanity-check the feature matrix that is passed to the response-curve helper.
# NOTE(review): the saved output shows emails max = 58 (the un-winsorized value),
# whereas the winsorize cell's output caps emails at 15 - this output looks stale
# relative to the current df_t; re-run the notebook top to bottom.
X.describe()

Unnamed: 0,emails,phones,ftf,virtual,mssg,other,emails_lag1,emails_lag2,emails_lag3,phones_lag1,...,avgoccrate,routineday,h_pdays,avglos,mparmix,mcaredrg,ttldrg,mcr_alos,hdrg,mcr_pdays
count,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,...,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0,8780.0
mean,0.343964,0.142027,0.117882,0.025626,0.01344,0.070843,0.297494,0.222323,0.187016,0.115945,...,2.658833,232387.7,232387.7,37.749661,2.759348,14664.210626,57194.79,39.788235,57194.79,68911.8
std,1.332655,0.685213,0.561209,0.188287,0.179358,0.328253,1.265386,0.91786,0.831992,0.616067,...,3.23579,278814.3,278814.3,63.987459,3.430046,17629.717248,71585.59,56.341959,71585.59,80013.17
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0363,553.0,553.0,1.5228,0.0037,5.0,76.0,1.0,76.0,17.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.4879,175900.0,175900.0,18.5219,1.4225,9998.0,42288.0,22.0,42288.0,48078.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.7,232387.7,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.66,232387.7,232387.7,37.75,2.76,14664.21,57194.79,39.79,57194.79,68911.8
max,58.0,20.0,14.0,4.0,6.0,6.0,58.0,22.0,18.0,20.0,...,50.0498,5414923.0,5414923.0,1238.7125,49.5089,334041.0,1424002.0,1069.0,1424002.0,1466733.0


In [145]:
# Response curve for the emails channel from the fitted full model; the result
# is a dict whose 'resp_df' entry is plotted and exported below.
# NOTE(review): 30 and 1 are presumably the maximum touches and the step size
# (the saved progress bar shows 31 iterations) - confirm in mmm_response_curves.
channel1 = response_curves.responses(model['full_model'], X, 'emails', 30, 1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.90it/s]

divide by zero encountered in reciprocal


divide by zero encountered in power



In [146]:
# Plot the emails response table: 'touches' against the 'emails' and
# 'emails_hill_estimate' series (presumably the model response and its Hill-curve fit).
response_curves.plot(channel1['resp_df'], 'touches', ['emails', 'emails_hill_estimate'])

In [59]:
# Export the emails response table to spectrum_emails.csv, without the index column.
channel1['resp_df'].to_csv('spectrum_emails.csv', index=False)

In [126]:
# Response curve for the phones channel from the fitted full model.
# NOTE(review): 18 and 1 are presumably the maximum touches and the step size
# (the saved progress bar shows 19 iterations) - confirm in mmm_response_curves.
channel2 = response_curves.responses(model['full_model'], X, 'phones', 18, 1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:02<00:00,  7.47it/s]

divide by zero encountered in reciprocal


divide by zero encountered in power



In [128]:
# Plot the phones response table: 'touches' against the 'phones' and
# 'phones_hill_estimate' series.
response_curves.plot(channel2['resp_df'], 'touches', ['phones', 'phones_hill_estimate'])

In [60]:
# Export the phones response table to spectrum_phones.csv, without the index column.
channel2['resp_df'].to_csv('spectrum_phones.csv', index=False)

In [147]:
# Response curve for the face-to-face (ftf) channel from the fitted full model.
# NOTE(review): 12 and 1 are presumably the maximum touches and the step size
# (the saved progress bar shows 13 iterations) - confirm in mmm_response_curves.
channel3 = response_curves.responses(model['full_model'], X, 'ftf', 12, 1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  7.17it/s]

divide by zero encountered in reciprocal


divide by zero encountered in power



In [148]:
# Plot the ftf response table: 'touches' against the 'ftf' and
# 'ftf_hill_estimate' series.
response_curves.plot(channel3['resp_df'], 'touches', ['ftf', 'ftf_hill_estimate'])

In [61]:
# Export the ftf response table to spectrum_ftf.csv, without the index column.
channel3['resp_df'].to_csv('spectrum_ftf.csv', index=False)

In [53]:
# Response curve for the virtual channel from the fitted full model.
# NOTE(review): unlike the other channels this call passes 20 and 0.1; if the last
# argument is a step size this is a much finer grid - confirm in mmm_response_curves.
channel4 = response_curves.responses(model['full_model'], X, 'virtual', 20, 0.1)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

In [54]:
# Plot the 'virtual' and 'virtual_hill_estimate' columns of the response table,
# presumably with 'touches' on the x-axis.
response_curves.plot(
    channel4['resp_df'],
    'touches',
    ['virtual', 'virtual_hill_estimate'],
)

In [62]:
# Write the virtual response table to CSV in the working directory; index=False
# leaves the row index out of the file (resp_df is presumably a pandas DataFrame).
channel4['resp_df'].to_csv(
    'spectrum_virtual.csv',
    index=False,
)

In [55]:
# Response curve for the 'mssg' channel, computed from model['full_model'] on X.
# The trailing 20 and 0.1 are presumably the maximum number of touches and the
# step size — TODO confirm against mmm_response_curves.
# NOTE(review): the recorded run repeatedly emitted pandas' "DataFrame is
# highly fragmented" warning — presumably responses() inserts columns one at
# a time; consider building them in one pd.concat(axis=1) inside that helper.
channel5 = response_curves.responses(
    model['full_model'],
    X,
    'mssg',
    20,
    0.1,
)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

In [56]:
# Plot the 'mssg' and 'mssg_hill_estimate' columns of the response table,
# presumably with 'touches' on the x-axis.
response_curves.plot(
    channel5['resp_df'],
    'touches',
    ['mssg', 'mssg_hill_estimate'],
)

In [63]:
# Write the mssg response table to CSV in the working directory; index=False
# leaves the row index out of the file (resp_df is presumably a pandas DataFrame).
channel5['resp_df'].to_csv(
    'spectrum_mssg.csv',
    index=False,
)

In [57]:
# Response curve for the 'other' channel, computed from model['full_model'] on X.
# The trailing 20 and 0.1 are presumably the maximum number of touches and the
# step size — TODO confirm against mmm_response_curves.
# NOTE(review): the recorded run repeatedly emitted pandas' "DataFrame is
# highly fragmented" warning — presumably responses() inserts columns one at
# a time; consider building them in one pd.concat(axis=1) inside that helper.
channel6 = response_curves.responses(
    model['full_model'],
    X,
    'other',
    20,
    0.1,
)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmente

In [58]:
# Plot the 'other' and 'other_hill_estimate' columns of the response table,
# presumably with 'touches' on the x-axis.
response_curves.plot(
    channel6['resp_df'],
    'touches',
    ['other', 'other_hill_estimate'],
)

In [64]:
# Write the 'other' response table to CSV in the working directory; index=False
# leaves the row index out of the file (resp_df is presumably a pandas DataFrame).
channel6['resp_df'].to_csv(
    'spectrum_other.csv',
    index=False,
)

# Channel 1 Optimization

In [57]:
# Tabulate the fitted Hill parameters: transpose so each feature becomes a
# row, move the feature labels out of the index into a column, then name the
# four resulting columns.
p1_hill = (
    pd.DataFrame(channel1_segment['optimal_hill'])
    .T
    .reset_index()
    .set_axis(['feature', 'beta', 'gamma', 'alpha'], axis=1)
)
p1_hill

Unnamed: 0,feature,beta,gamma,alpha
0,Specialty_AC_P1_Arikayce,205.637938,51.313808,4.71236
1,Specialty_ADU_P1_Arikayce,205.517174,51.329996,4.719966
2,Specialty_CCE_P1_Arikayce,204.495296,51.466856,4.785168
3,Specialty_CCM_P1_Arikayce,184.168059,53.853232,6.000614
4,Specialty_EM_P1_Arikayce,205.726333,51.301955,4.706769
5,Specialty_FM_P1_Arikayce,204.875774,51.678124,5.028733
6,Specialty_GP_P1_Arikayce,203.352425,51.619572,4.85892
7,Specialty_HOS_P1_Arikayce,205.886561,51.280453,4.696636
8,Specialty_ID_P1_Arikayce,234.19112,48.921949,5.973733
9,Specialty_IM_P1_Arikayce,207.176496,49.692169,3.784883


In [58]:
# Hyperopt search over the Hill-curve parameters in p1_hill, one channel per
# row of the 'feature' column. 2000 is presumably the number of trials (the
# recorded progress bar runs to 2000) — TODO confirm in mmm_optimization.
# NOTE(review): in the recorded run the returned mix sums to 3372 against
# budget=354 and the best loss stays near 1e8 — presumably every trial hit a
# budget penalty; verify the search space against `budget`.
optimization = mmm_optimization.MMMOptimization(
    budget=354,
    params=p1_hill,
)
channels = p1_hill['feature'].tolist()
output = optimization.optimize_hyperopt_hill(
    channels,
    2000,
)
output

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:10<00:00, 28.31trial/s, best loss: 99997133.878953]


{'mix': {'Specialty_AC_P1_Arikayce': 330,
  'Specialty_ADU_P1_Arikayce': 223,
  'Specialty_CCE_P1_Arikayce': 194,
  'Specialty_CCM_P1_Arikayce': 340,
  'Specialty_EM_P1_Arikayce': 174,
  'Specialty_FM_P1_Arikayce': 171,
  'Specialty_GP_P1_Arikayce': 250,
  'Specialty_HOS_P1_Arikayce': 222,
  'Specialty_ID_P1_Arikayce': 266,
  'Specialty_IM_P1_Arikayce': 211,
  'Specialty_Other_P1_Arikayce': 330,
  'Specialty_PCC_P1_Arikayce': 188,
  'Specialty_PDP_P1_Arikayce': 252,
  'Specialty_PUD_P1_Arikayce': 221},
 'trials': [{'loss': 99997911.78291155, 'status': 'ok'},
  {'loss': 99997367.74888934, 'status': 'ok'},
  {'loss': 99997252.43217513, 'status': 'ok'},
  {'loss': 99997818.15737288, 'status': 'ok'},
  {'loss': 99997880.892491, 'status': 'ok'},
  {'loss': 99997516.23347887, 'status': 'ok'},
  {'loss': 99997876.8572574, 'status': 'ok'},
  {'loss': 99997331.26427692, 'status': 'ok'},
  {'loss': 99997140.25295413, 'status': 'ok'},
  {'loss': 99997584.77514112, 'status': 'ok'},
  {'loss': 9999

# ID Budgeting

In [71]:
# Budget optimization for the ID specialty only: every 'Specialty_*' indicator
# in X is zeroed and 'Specialty_ID' is then set to 1 before X is handed to the
# optimizer together with model['full_model'].
optimization = mmm_optimization.MMMOptimization(budget=135)
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [x for x in df_t.columns if x.startswith('Specialty_')]
# Lagged dependent-variable columns; collected here but not included in X.
lag_dv = [x for x in df_t.columns if 'rx_count_lag' in x]
# Lagged columns other than the rx_count lags (plain `and`: both operands are bools).
lag_channels = [x for x in df_t.columns if 'lag' in x and 'rx_count' not in x]
# Take an explicit copy before overwriting columns. Assigning into a column
# subset of df_t raises pandas' SettingWithCopyWarning and leaves it ambiguous
# which frame is being modified.
X = df_t[channels + lag_channels + specialties].copy()
X[specialties] = 0
X['Specialty_ID'] = 1
# 2000 is presumably the number of optimizer trials (the recorded progress bar
# runs to 2000) — TODO confirm in mmm_optimization.
output = optimization.optimize_predict(X, channels, 2000, model['full_model'])
output

  0%|                                                                                                                             | 0/2000 [00:00<?, ?trial/s, best loss=?]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:02<00:00, 31.98trial/s, best loss: -264.04118068466823]


{'mix': {'P1_Arikayce': 90, 'P2_Arikayce': 3},
 'trials': [{'loss': 99999740.31881931, 'status': 'ok'},
  {'loss': -123.15118068466835, 'status': 'ok'},
  {'loss': 99999738.89881931, 'status': 'ok'},
  {'loss': 99999785.74881932, 'status': 'ok'},
  {'loss': -34.57118068466833, 'status': 'ok'},
  {'loss': -126.65118068466835, 'status': 'ok'},
  {'loss': -6.371180684668324, 'status': 'ok'},
  {'loss': -260.7011806846684, 'status': 'ok'},
  {'loss': 99999740.31881931, 'status': 'ok'},
  {'loss': 99999740.65881932, 'status': 'ok'},
  {'loss': -123.15118068466835, 'status': 'ok'},
  {'loss': -11.851180684668325, 'status': 'ok'},
  {'loss': 99999732.60881932, 'status': 'ok'},
  {'loss': 99999741.44881931, 'status': 'ok'},
  {'loss': 99999732.56881931, 'status': 'ok'},
  {'loss': -19.181180684668327, 'status': 'ok'},
  {'loss': -124.69118068466831, 'status': 'ok'},
  {'loss': 99999872.34881932, 'status': 'ok'},
  {'loss': 99999740.15881932, 'status': 'ok'},
  {'loss': 99999952.48881932, 'stat

# Overall Budget

In [74]:
# Budget optimization across all specialties: X keeps the 'Specialty_*'
# indicator columns exactly as they are in df_t.
optimization = mmm_optimization.MMMOptimization(budget=368)
channels = ['P1_Arikayce', 'P2_Arikayce']
specialties = [col for col in df_t.columns if col.startswith('Specialty_')]
# Lagged dependent-variable columns; collected here but not included in X.
lag_dv = [col for col in df_t.columns if 'rx_count_lag' in col]
# Lagged columns other than the rx_count lags.
lag_channels = [
    col
    for col in df_t.columns
    if 'lag' in col and 'rx_count' not in col
]
X = df_t[[*channels, *lag_channels, *specialties]]
# 2000 is presumably the number of optimizer trials (the recorded progress bar
# runs to 2000) — TODO confirm in mmm_optimization.
output = optimization.optimize_predict(
    X,
    channels,
    2000,
    model['full_model'],
)
output

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:01<00:00, 32.58trial/s, best loss: -271.404799150921]


{'mix': {'P1_Arikayce': 288, 'P2_Arikayce': 13},
 'trials': [{'loss': 99999736.8873437, 'status': 'ok'},
  {'loss': 99999778.46091513, 'status': 'ok'},
  {'loss': -268.7090848652068, 'status': 'ok'},
  {'loss': -255.94622772234968, 'status': 'ok'},
  {'loss': -221.10908486520682, 'status': 'ok'},
  {'loss': 99999758.573058, 'status': 'ok'},
  {'loss': 99999784.95734371, 'status': 'ok'},
  {'loss': 99999743.05377227, 'status': 'ok'},
  {'loss': 99999743.05377227, 'status': 'ok'},
  {'loss': -8.623370579492526, 'status': 'ok'},
  {'loss': -195.39979915092113, 'status': 'ok'},
  {'loss': 99999790.75448656, 'status': 'ok'},
  {'loss': -262.32837057949257, 'status': 'ok'},
  {'loss': -33.75122772234967, 'status': 'ok'},
  {'loss': 99999736.60591513, 'status': 'ok'},
  {'loss': -208.24551343663535, 'status': 'ok'},
  {'loss': 99999743.05377227, 'status': 'ok'},
  {'loss': 99999743.05377227, 'status': 'ok'},
  {'loss': -218.0883705794926, 'status': 'ok'},
  {'loss': -207.00408486520678, 'stat

In [None]:
# The lagged DV columns (rx_count_lag*) are not needed as features: the point is to estimate the impact of the channels as accurately as possible, and that happens when the channels' feature importance is high.