In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv("../Data/pharmacy_tx.csv")
df 

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay
0,2022-01-02,Pharmacy #6,G99.93,branded tanoclolol,725700,1UQC,,False,13.39
1,2022-01-02,Pharmacy #42,U60.52,branded oxasoted,664344,,52H8KH0F83K,False,7.02
2,2022-01-02,Pharmacy #37,Q85.91,branded cupitelol,725700,1UQC,,False,13.39
3,2022-01-02,Pharmacy #30,U60.52,generic oxasoted,571569,KB38N,6BYJBW,False,10.84
4,2022-01-02,Pharmacy #18,N55.01,branded mamate,664344,,ZX2QUWR,False,47.00
...,...,...,...,...,...,...,...,...,...
13910239,2022-12-30,Pharmacy #42,U27.71,branded colifunene,322463,,HO8HUGL,True,0.00
13910240,2022-12-30,Pharmacy #45,N59.44,generic tafistitrisin,664344,,TFZOR5R49,False,6.28
13910241,2022-12-30,Pharmacy #54,W50.87,generic tanoclolol,691847,N098KI,6SP1DG,False,6.94
13910242,2022-12-30,Pharmacy #0,I68.27,branded prazinib,96934,S76J7V6,,False,13.93


## Splitting the data into testing and training

In [4]:
# Creating a data frame with only the rows corresponding to a false value for rejected column in df
# Then dropping the rejected column from the new data frame

df2 = df[df['rejected'] == False]
df2 = df2.drop('rejected', axis=1)
df2

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,patient_pay
0,2022-01-02,Pharmacy #6,G99.93,branded tanoclolol,725700,1UQC,,13.39
1,2022-01-02,Pharmacy #42,U60.52,branded oxasoted,664344,,52H8KH0F83K,7.02
2,2022-01-02,Pharmacy #37,Q85.91,branded cupitelol,725700,1UQC,,13.39
3,2022-01-02,Pharmacy #30,U60.52,generic oxasoted,571569,KB38N,6BYJBW,10.84
4,2022-01-02,Pharmacy #18,N55.01,branded mamate,664344,,ZX2QUWR,47.00
...,...,...,...,...,...,...,...,...
13910238,2022-12-30,Pharmacy #39,Q72.66,branded momudobatin,571569,KB38N,6BYJBW,66.47
13910240,2022-12-30,Pharmacy #45,N59.44,generic tafistitrisin,664344,,TFZOR5R49,6.28
13910241,2022-12-30,Pharmacy #54,W50.87,generic tanoclolol,691847,N098KI,6SP1DG,6.94
13910242,2022-12-30,Pharmacy #0,I68.27,branded prazinib,96934,S76J7V6,,13.93


In [4]:
# Splitting the dataframe into X and y with y being the last column and X being all the other columns except the first one and last one

X = df2.iloc[:, 1:-1]
y = df2.iloc[:, -1]

# One hot encoding all the columns of X

X = pd.get_dummies(X, drop_first=True)

# Splitting the data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Model 0: Baseline Model

For the baseline model, we'll predicted the average price for the apporved drugs (since the rejected ones always have 0 copay).

In [5]:
# Creating a baseline model which predicts the mean of the training data for all the test data

y_pred = np.full(y_test.shape, y_train.mean())

# Calculating the RMSE of the baseline model

baseline_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
baseline_rmse

40.54468283950625

## Model 1: Basic Decision Tree

Here, we build a decision tree with no depth constraints

In [5]:
# Creating the decision tree model and fitting it to the training data 

model = DecisionTreeRegressor(max_depth = 10 )
model.fit(X_train, y_train)

# Predicting the values for the testing data

y_pred = model.predict(X_test)

# Visualizing the decision tree

from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))

plot_tree(model, feature_names=X.columns, filled=True, rounded=True)

plt.show()



: 

: 

In [12]:
# Calculating the root mean squared error

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

29.023188522999703

In [14]:
# Computing the mean of training patient_pay for baseline model

average = np.mean(y_train)
baseline_pred = np.full(len(y_test), average)

In [15]:
# Computing root mean square of differences between baseline and y_test

baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))
baseline_rmse



39.6408946313921

In [16]:
X_train

Unnamed: 0,diagnosis_A13.39,diagnosis_A14.01,diagnosis_A22.87,diagnosis_A45.07,diagnosis_B03.27,diagnosis_B05.36,diagnosis_B42.10,diagnosis_B45.03,diagnosis_B63.86,diagnosis_B84.86,...,drug_generic satrade,drug_generic simarol,drug_generic sorine,drug_generic suvinicuvir,drug_generic tafistitrisin,drug_generic tanoclolol,drug_generic thiostasteglume,drug_generic todiadianic,drug_generic tovane,drug_generic vocopirin
1838932,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7421015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10270016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13841482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11248619,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13315092,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4304572,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10081351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6550634,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
model.get_depth(), model.get_n_leaves(), model.get_params()

(155,
 760,
 {'ccp_alpha': 0.0,
  'criterion': 'squared_error',
  'max_depth': None,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'random_state': None,
  'splitter': 'best'})