# Pre-processing

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from datetime import datetime
from datetime import date

In [16]:
# Load the pharmacy transactions CSV into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../Data/pharmacy_tx.csv")
df.head(n=5)

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay
0,2022-01-02,Pharmacy #6,G99.93,branded tanoclolol,725700,1UQC,,False,13.39
1,2022-01-02,Pharmacy #42,U60.52,branded oxasoted,664344,,52H8KH0F83K,False,7.02
2,2022-01-02,Pharmacy #37,Q85.91,branded cupitelol,725700,1UQC,,False,13.39
3,2022-01-02,Pharmacy #30,U60.52,generic oxasoted,571569,KB38N,6BYJBW,False,10.84
4,2022-01-02,Pharmacy #18,N55.01,branded mamate,664344,,ZX2QUWR,False,47.0


In [17]:
def get_doy(d):
    """Return the day of the year of ``d``.

    ``d`` must expose a ``dayofyear`` attribute (for example a
    ``pandas.Timestamp``); that attribute's value is returned unchanged.
    """
    day_of_year = d.dayofyear
    return day_of_year

In [18]:
# Replace the transaction date with its day of the year
# (1-365, or 1-366 when the date falls in a leap year).
# The vectorized .dt accessor avoids one Python-level call per row; the
# explicit int64 cast keeps the column dtype independent of the pandas version.
df['day'] = pd.to_datetime(df['tx_date']).dt.dayofyear.astype('int64')
df = df.drop(columns=['tx_date'])
df.head()

Unnamed: 0,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,day
0,Pharmacy #6,G99.93,branded tanoclolol,725700,1UQC,,False,13.39,2
1,Pharmacy #42,U60.52,branded oxasoted,664344,,52H8KH0F83K,False,7.02,2
2,Pharmacy #37,Q85.91,branded cupitelol,725700,1UQC,,False,13.39,2
3,Pharmacy #30,U60.52,generic oxasoted,571569,KB38N,6BYJBW,False,10.84,2
4,Pharmacy #18,N55.01,branded mamate,664344,,ZX2QUWR,False,47.0,2


In [19]:
# Keep only the transactions that were not rejected, then remove the
# 'rejected' flag column (it is False for every remaining row).
df = df[df['rejected'].eq(False)].drop(columns=['rejected'])

# Simple Average

In [None]:
# Baseline model: predict the overall mean patient_pay for every transaction
# and score that constant prediction against the same rows.
avg_pay = df['patient_pay'].mean()
baseline_pred = np.full(len(df), avg_pay)

# scikit-learn metrics take their arguments as (y_true, y_pred).
rmse = np.sqrt(mean_squared_error(df['patient_pay'], baseline_pred))
# Take the square root explicitly: the `squared` keyword of
# mean_squared_log_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing squared=False raises a TypeError on current releases.
rmsle = np.sqrt(mean_squared_log_error(df['patient_pay'], baseline_pred))
rmse, rmsle

# Average of Averages

In [20]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,pharmacy,diagnosis,drug,bin,pcn,group,patient_pay,day
0,Pharmacy #6,G99.93,branded tanoclolol,725700,1UQC,,13.39,2
1,Pharmacy #42,U60.52,branded oxasoted,664344,,52H8KH0F83K,7.02,2
2,Pharmacy #37,Q85.91,branded cupitelol,725700,1UQC,,13.39,2
3,Pharmacy #30,U60.52,generic oxasoted,571569,KB38N,6BYJBW,10.84,2
4,Pharmacy #18,N55.01,branded mamate,664344,,ZX2QUWR,47.0,2


In [21]:
# Where 'pcn' is missing, fall back to the same row's 'group' value,
# then remove the 'group' column from the frame.
df = df.assign(pcn=df['pcn'].fillna(df['group'])).drop(columns='group')

Unnamed: 0,pharmacy,diagnosis,drug,bin,pcn,group,patient_pay,day
0,Pharmacy #6,G99.93,branded tanoclolol,725700,1UQC,,13.39,2
1,Pharmacy #42,U60.52,branded oxasoted,664344,52H8KH0F83K,52H8KH0F83K,7.02,2
2,Pharmacy #37,Q85.91,branded cupitelol,725700,1UQC,,13.39,2
3,Pharmacy #30,U60.52,generic oxasoted,571569,KB38N,6BYJBW,10.84,2
4,Pharmacy #18,N55.01,branded mamate,664344,ZX2QUWR,ZX2QUWR,47.0,2


In [23]:
# Label every 'pcn' value that is still missing with the literal string 'None'.
df = df.fillna(value={'pcn': 'None'})

In [24]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Preview the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,pharmacy,diagnosis,drug,bin,pcn,patient_pay,day
5197337,Pharmacy #28,I68.27,branded prazinib,322463,HO8HUGL,11.09,146
643154,Pharmacy #18,G99.93,branded bovirol,664344,52H8KH0F83K,12.46,20
11999316,Pharmacy #5,I68.27,branded prazinib,664344,TPJD,19.82,317
2548632,Pharmacy #22,I59.87,generic oxasoted,664344,STGRDKR1J5RD,5.94,75
13471059,Pharmacy #24,Q72.66,branded ratin,757349,MSCXSG,23.83,351


In [26]:
# Build per-feature lookup tables of the mean patient_pay in the training set:
#   D[<first three characters of the column name>][<value>] -> mean patient_pay
# Columns at positions 0-4 and 6 are used; position 5 is skipped.
# One groupby per column replaces a full boolean-mask scan of df_train for
# every distinct value, and no longer overwrites the notebook-level `avg_pay`.
# NOTE(review): groupby skips missing keys — assumes these feature columns
# contain no NaN at this point; confirm.
D = {}
for feature_col in [df_train.columns[i] for i in list(range(5)) + [6]]:
    D[feature_col[:3]] = df_train.groupby(feature_col)['patient_pay'].mean().to_dict()

In [27]:
# "Average of averages" prediction for every training row: look up the mean
# patient_pay stored in D for each of the row's feature values (columns at
# positions 0-4 and 6) and average those per-feature means.
# Series.map does each lookup for a whole column at once, instead of
# materialising every row with df_train.iloc[i] in a Python loop.
# y_pred is a 1-D NumPy array aligned with the row order of df_train.
feature_cols = [df_train.columns[j] for j in list(range(5)) + [6]]
y_pred = np.mean(
    [df_train[col].map(D[col[:3]]).to_numpy() for col in feature_cols],
    axis=0,
)

In [None]:
# Score the averaged per-feature means on the training rows
# (df_test is not evaluated in this cell).
rmse = np.sqrt(mean_squared_error(df_train['patient_pay'], y_pred))
# Take the square root explicitly: the `squared` keyword of
# mean_squared_log_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing squared=False raises a TypeError on current releases.
rmsle = np.sqrt(mean_squared_log_error(df_train['patient_pay'], y_pred))
rmse, rmsle