In [1]:
import pandas as pd
import numpy as np
#Data analysis
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# For model buidling
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Custom module for prediction and model evalution
from utils import predict_and_evaluate

In [2]:
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import Row
#from pyspark.sql.functions import *
from pyspark.ml.feature import *
import pickle
#import functions
from pyspark.sql import functions
from pymongo import MongoClient
import warnings
warnings.filterwarnings("ignore")


ModuleNotFoundError: No module named 'pyspark'

## Loading data

### Making sparksession object

In [2]:
input_uri="mongodb://127.0.0.1:27017/"
output_uri="mongodb://127.0.0.1:27017/"

In [3]:
spark=SparkSession.builder.appName("Drug_data").config("spark.mongodb.input.uri",input_uri) \
        .config("spark.mongodb.output.uri",output_uri) \
        .config("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:2.4.2") \
        .getOrCreate()

In [2]:
spark = SparkSession.builder.appName("DataLoad").getOrCreate()

### Loading data with readcsv

In [None]:
df = pd.read_csv("MUP_DPR_RY21_P04_V10_DY20_NPIBN_0.csv")

In [None]:
df.head()

In [35]:
df_final = pd.read_csv("../data/final_dataframe.csv")

In [36]:
df_final.drop("Unnamed: 0",axis = 1,inplace = True)

In [37]:
df_final.head(1)

Unnamed: 0,NPI,City,State,Speciality,max_Tot_Drug_Cst,sum_Tot_Drug_Cst,avg_Tot_Drug_Cst,max_Total_claims,sum_Total_claims,avg_Total_claims,max_Tot_Day_Suply,sum_Tot_Day_Suply,avg_Tot_Day_Suply,max_Tot_30day_Fills,sum_Tot_30day_Fills,avg_Tot_30day_Fills,Total_payments,FRAUD
0,1003000126,bethesda,md,internal medicine,4792.85,5979.07,747.3837,30,124,15.5,937,3721,465.125,32.4,138.2,17.275,20.48,0


In [29]:
df_final1 = df_final.loc[:,["NPI","FRAUD"]]

In [30]:
df_final1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965056 entries, 0 to 965055
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   NPI     965056 non-null  int64
 1   FRAUD   965056 non-null  int64
dtypes: int64(2)
memory usage: 14.7 MB


In [39]:
df_final1.FRAUD.value_counts()

0    964170
1       886
Name: FRAUD, dtype: int64

## Pandas

In [None]:
df.info()

In [None]:
df = df.loc[:,["Prscrbr_NPI","Gnrc_Name","Tot_Clms","Tot_30day_Fills","Tot_Day_Suply","Tot_Drug_Cst","Prscrbr_Type"]]

In [None]:
df.info()

In [None]:
df.columns = ['NPI', 'Drug_Name', 'Tot_Clms', 'Tot_30day_Fills','Tot_Day_Suply','Tot_Drug_Cst','Speciality']

In [None]:
df.info()

In [None]:
df.head(1)

In [None]:
partD_drug_train = df.merge(df_final1,on='NPI',how='inner')

In [None]:
partD_drug_train.info()

In [None]:
partD_drug_train.head()

In [None]:
partD_drug_train.to_csv("Drug_20")

## Loading data : csv data : dataframe with original drug data and fraud

In [4]:
partD_drug = pd.read_csv("../data/Drug_20.csv")

In [5]:
partD_drug.head(1)

Unnamed: 0.1,Unnamed: 0,NPI,Drug_Name,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Speciality,FRAUD
0,0,1003000126,Amiodarone Hcl,13,13.8,399,135.38,Internal Medicine,0


In [6]:
partD_drug.drop("Unnamed: 0",axis = 1,inplace = True)

In [7]:
partD_drug.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25048504 entries, 0 to 25048503
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   NPI              int64  
 1   Drug_Name        object 
 2   Tot_Clms         int64  
 3   Tot_30day_Fills  float64
 4   Tot_Day_Suply    int64  
 5   Tot_Drug_Cst     float64
 6   Speciality       object 
 7   FRAUD            int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 1.5+ GB


## Grouping data 

In [8]:
partD_drug_Group = partD_drug.groupby(["Drug_Name","FRAUD"])

In [9]:
partD_drug_Group.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,NPI,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Speciality
Drug_Name,FRAUD,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.9 % Sodium Chloride,0,1003008533,16,16.0,401,120.56,Neurology
0.9 % Sodium Chloride,1,1285627828,20,20.0,315,333.86,Family Practice
Aa 4.25%/Calcium/Lytes/Dex 10%,0,1033137518,11,11.0,76,5136.48,Hematology-Oncology
Aa 5 %/Calcium/Lytes/Dext 20 %,0,1356304224,33,33.0,161,12101.10,General Surgery
Aa 5%/D15w/Electrolytes,0,1104087469,23,23.0,161,7782.00,Internal Medicine
...,...,...,...,...,...,...,...
Zolpidem Tartrate,0,1003000522,22,46.0,1380,273.87,Family Practice
Zolpidem Tartrate,1,1003806431,41,45.0,1310,97.95,Internal Medicine
Zonisamide,0,1003000902,12,12.0,360,654.89,Family Practice
Zonisamide,1,1629042304,11,11.0,330,264.45,Internal Medicine


## Pre process Hypothesis test

In [10]:
# storing unique drug names in set
#is instance will return false if drug name is not a string && set datatype - for no duplicacy
drugs = set([ drugx for drugx in partD_drug['Drug_Name'].values if isinstance(drugx, str)])
print(len(drugs))

1722


In [11]:
drug_keys = partD_drug_Group.groups.keys()
print(len(drug_keys))

2409


In [12]:
list(drug_keys)[0:10]

[('0.9 % Sodium Chloride', 0),
 ('0.9 % Sodium Chloride', 1),
 ('Aa 4.25%/Calcium/Lytes/Dex 10%', 0),
 ('Aa 5 %/Calcium/Lytes/Dext 20 %', 0),
 ('Aa 5%/D15w/Electrolytes', 0),
 ('Abacavir Sulfate', 0),
 ('Abacavir Sulfate/Lamivudine', 0),
 ('Abacavir/Dolutegravir/Lamivudi', 0),
 ('Abacavir/Dolutegravir/Lamivudi', 1),
 ('Abacavir/Lamivudine/Zidovudine', 0)]

In [13]:
#Getting list of drug names which where bought by both fraud and non fraud, to test hypothesis

drug_with_isfraud = [drugx for drugx in drugs if ((drugx,0.0) in drug_keys ) & ( (drugx,1.0) in drug_keys)]
drug_with_isfraud[0:10]

['Olmesartan/Hydrochlorothiazide',
 'Penicillin V Potassium',
 'Flunisolide',
 'Methotrexate Sodium',
 'Cefadroxil',
 'Tamsulosin Hcl',
 'Epinephrine',
 'Amitriptyline Hcl',
 'Doxepin Hcl',
 'Prasugrel Hcl']

In [14]:
#687 unique drug names with both fraud and non fraud group
len(drug_with_isfraud)

687

## Hypothesis test

In [15]:
from scipy.stats import ttest_ind

In [16]:
partD_drug_Group.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,NPI,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Speciality
Drug_Name,FRAUD,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.9 % Sodium Chloride,0,1003008533,16,16.0,401,120.56,Neurology
0.9 % Sodium Chloride,1,1285627828,20,20.0,315,333.86,Family Practice
Aa 4.25%/Calcium/Lytes/Dex 10%,0,1033137518,11,11.0,76,5136.48,Hematology-Oncology
Aa 5 %/Calcium/Lytes/Dext 20 %,0,1356304224,33,33.0,161,12101.10,General Surgery
Aa 5%/D15w/Electrolytes,0,1104087469,23,23.0,161,7782.00,Internal Medicine
...,...,...,...,...,...,...,...
Zolpidem Tartrate,0,1003000522,22,46.0,1380,273.87,Family Practice
Zolpidem Tartrate,1,1003806431,41,45.0,1310,97.95,Internal Medicine
Zonisamide,0,1003000902,12,12.0,360,654.89,Family Practice
Zonisamide,1,1629042304,11,11.0,330,264.45,Internal Medicine


### Hypothesis
### Null hypothesis : Values in all feature columns for Fraud 0 and 1 category are identical and there is no difference 
### Alternate hypothesis : They is a difference in pattern and not identical

In [17]:
partD_drug.columns

Index(['NPI', 'Drug_Name', 'Tot_Clms', 'Tot_30day_Fills', 'Tot_Day_Suply',
       'Tot_Drug_Cst', 'Speciality', 'FRAUD'],
      dtype='object')

In [18]:
#Cols of interest
cols = ['Tot_Drug_Cst','Tot_Clms','Tot_Day_Suply']

In [19]:
#Empty dict for 
re_drug_tt = dict()

In [20]:
for drugx in drug_with_isfraud:
    for colx in cols:
        fraud_0 = partD_drug_Group.get_group((drugx,0.0))[colx].values
        fraud_1 = partD_drug_Group.get_group((drugx,1.0))[colx].values
        #print(len(fraud_0), len(fraud_1))
        
        #If column has values more than 2 then do tt test on those and add result to dictionary with drug name
        if (len(fraud_0)>2) & (len(fraud_1)>2) :
            tt = ttest_ind(fraud_0, fraud_1)
            re_drug_tt[(drugx, colx)] = tt

In [21]:
#Key is a set of Drug name and column
#Value is a set of Stat and pvalue

re_drug_tt.get(('Atorvastatin Calcium','Tot_Drug_Cst'))

Ttest_indResult(statistic=-2.860861957291366, pvalue=0.004225177525825225)

In [22]:
#Setting Probilities
#If p value is less than alpha = 0.05 then rejecting null hypothesis and adding that column in to list of prob_005

Prob_005 = [(key, pvalue) for (key, (stat, pvalue)) in re_drug_tt.items() if pvalue <=0.01]  
print(len(Prob_005))

102


In [32]:
#Check random key : drug name and column name
#at list index 100, randomly
index = 100
drug_name = Prob_005[index][0][0]
column_name = Prob_005[index][0][1]
print(drug_name,column_name)

Tramadol Hcl Tot_Clms


## Getting features from dataframe with grouped data of drug cost,supply,daily supply

In [40]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965056 entries, 0 to 965055
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   NPI                  965056 non-null  int64  
 1   City                 965056 non-null  object 
 2   State                965056 non-null  object 
 3   Speciality           965055 non-null  object 
 4   max_Tot_Drug_Cst     965056 non-null  float64
 5   sum_Tot_Drug_Cst     965056 non-null  float64
 6   avg_Tot_Drug_Cst     965056 non-null  float64
 7   max_Total_claims     965056 non-null  int64  
 8   sum_Total_claims     965056 non-null  int64  
 9   avg_Total_claims     965056 non-null  float64
 10  max_Tot_Day_Suply    965056 non-null  int64  
 11  sum_Tot_Day_Suply    965056 non-null  int64  
 12  avg_Tot_Day_Suply    965056 non-null  float64
 13  max_Tot_30day_Fills  965056 non-null  float64
 14  sum_Tot_30day_Fills  965056 non-null  float64
 15  avg_Tot_30day_Fil

In [41]:
FeaturesAll_pd = df_final.drop("City",axis = 1)

In [42]:
type(FeaturesAll_pd)

pandas.core.frame.DataFrame

In [43]:
FeaturesAll_pd['claim_max-mean'] = FeaturesAll_pd['max_Total_claims'] - FeaturesAll_pd['avg_Total_claims']

FeaturesAll_pd['supply_max-mean'] = FeaturesAll_pd['max_Tot_Day_Suply'] - FeaturesAll_pd['avg_Tot_Day_Suply']

FeaturesAll_pd['drug_max-mean'] = FeaturesAll_pd['max_Tot_Drug_Cst'] - FeaturesAll_pd['avg_Tot_Drug_Cst']

In [44]:
FeaturesAll_pd.tail(3)

Unnamed: 0,NPI,State,Speciality,max_Tot_Drug_Cst,sum_Tot_Drug_Cst,avg_Tot_Drug_Cst,max_Total_claims,sum_Total_claims,avg_Total_claims,max_Tot_Day_Suply,sum_Tot_Day_Suply,avg_Tot_Day_Suply,max_Tot_30day_Fills,sum_Tot_30day_Fills,avg_Tot_30day_Fills,Total_payments,FRAUD,claim_max-mean,supply_max-mean,drug_max-mean
965053,1972655934,va,medical oncology,18080.71,24226.07,3460.8672,87,263,37.571429,6262,17353,2479.0,208.7,591.7,84.52857,,0,49.428571,3783.0,14619.8428
965054,1972662203,ms,dentist,72.16,118.12,59.06,13,24,12.0,83,115,57.5,13.0,24.0,12.0,,0,1.0,25.5,13.1
965055,1972669075,ok,dentist,100.68,100.68,100.68,34,34,34.0,165,165,165.0,34.0,34.0,34.0,13.79,0,0.0,0.0,0.0


## Adding columns to main dataframe

In [45]:

Feature_DrugWeighted = []
new_col_all =[]

for i, p005x in enumerate(Prob_005):
    #drug name
    drug_name = p005x[0][0]
    #Column name
    cat_name = p005x[0][1] 
    
    #Adding two strings and appending it to list
    new_col = drug_name+'_'+cat_name
    new_col_all.append(new_col)
    
    #Getting columns for fraud and non fraud
    drug_0 = partD_drug_Group.get_group((drug_name,0.0))[['NPI', cat_name]]
    drug_1 = partD_drug_Group.get_group((drug_name,1.0))[['NPI', cat_name]]
    
    #Row wise
    drug_01 = pd.concat([drug_0, drug_1])
    #print("drug_01 : ",drug_01)
    
    #Chances of Single NPI making double claims for similar medicines
    drug_02 = drug_01.groupby("NPI").sum()
    drug_02["NPI"] = drug_02.index
    drug_02.index.names = ["index"]
    
    
    #Renaming column name to new column name
    drug_02.rename(columns={cat_name: new_col}, inplace=True)
    
    #Appending dataframe to list
    Feature_DrugWeighted.append(drug_02)

In [46]:
#List with all required drug names and there values wrt NPI
Feature_DrugWeighted[0]

Unnamed: 0_level_0,Methocarbamol_Tot_Drug_Cst,NPI
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1003001363,832.49,1003001363
1003004201,136.93,1003004201
1003008095,686.76,1003008095
1003012428,272.20,1003012428
1003013608,155.89,1003013608
...,...,...
1972663144,79.24,1972663144
1972665321,752.56,1972665321
1972665594,188.39,1972665594
1972668416,382.96,1972668416


In [47]:
TEST = Feature_DrugWeighted[0]

In [48]:
#Without groupby
TEST[TEST.NPI == 1003013616]

Unnamed: 0_level_0,Methocarbamol_Tot_Drug_Cst,NPI
index,Unnamed: 1_level_1,Unnamed: 2_level_1


In [49]:
#Chances of Single NPI making double claims for similar medicines
TEST[TEST.NPI == 1003013616]

Unnamed: 0_level_0,Methocarbamol_Tot_Drug_Cst,NPI
index,Unnamed: 1_level_1,Unnamed: 2_level_1


In [50]:
#Merging all columns into final dataframe on NPI

npi_col = FeaturesAll_pd[['NPI']]

w_npi = []

for n, nx in enumerate(Feature_DrugWeighted):
      nggx = pd.merge(npi_col, nx.drop_duplicates(['NPI']), on='NPI', how='left')
      w_npi.append(nggx)

In [51]:
#list w_npi has all columns with values
w_npi[10]

Unnamed: 0,NPI,Risperidone Microspheres_Tot_Drug_Cst
0,1003000126,
1,1003000423,
2,1003000720,
3,1003001785,
4,1003001884,
...,...,...
965051,1972641801,
965052,1972643476,
965053,1972655934,
965054,1972662203,


In [52]:
FeaturesAll_pd1 = FeaturesAll_pd

In [53]:
for wx in w_npi:
    
    #Selecting drugname_category
    col_n = wx.columns[1]
    
    #adding new column with values to dataframe
    FeaturesAll_pd1[col_n] = wx[col_n].values
    FeaturesAll_pd1[col_n].fillna(0,inplace=True)

  FeaturesAll_pd1[col_n] = wx[col_n].values
  FeaturesAll_pd1[col_n] = wx[col_n].values
  FeaturesAll_pd1[col_n] = wx[col_n].values
  FeaturesAll_pd1[col_n] = wx[col_n].values
  FeaturesAll_pd1[col_n] = wx[col_n].values


In [54]:
FeaturesAll_pd1.columns

Index(['NPI', 'State', 'Speciality', 'max_Tot_Drug_Cst', 'sum_Tot_Drug_Cst',
       'avg_Tot_Drug_Cst', 'max_Total_claims', 'sum_Total_claims',
       'avg_Total_claims', 'max_Tot_Day_Suply',
       ...
       'Quetiapine Fumarate_Tot_Day_Suply',
       'Lisinopril/Hydrochlorothiazide_Tot_Clms',
       'Lisinopril/Hydrochlorothiazide_Tot_Day_Suply', 'Diazepam_Tot_Clms',
       'Diazepam_Tot_Day_Suply', 'Pen Needle, Diabetic_Tot_Drug_Cst',
       'Pen Needle, Diabetic_Tot_Day_Suply', 'Tramadol Hcl_Tot_Drug_Cst',
       'Tramadol Hcl_Tot_Clms', 'Tramadol Hcl_Tot_Day_Suply'],
      dtype='object', length=122)

In [55]:
FeaturesAll_pd1.isna().sum()

NPI                                   0
State                                 0
Speciality                            1
max_Tot_Drug_Cst                      0
sum_Tot_Drug_Cst                      0
                                     ..
Pen Needle, Diabetic_Tot_Drug_Cst     0
Pen Needle, Diabetic_Tot_Day_Suply    0
Tramadol Hcl_Tot_Drug_Cst             0
Tramadol Hcl_Tot_Clms                 0
Tramadol Hcl_Tot_Day_Suply            0
Length: 122, dtype: int64

In [56]:
FeaturesAll_pd1.columns

Index(['NPI', 'State', 'Speciality', 'max_Tot_Drug_Cst', 'sum_Tot_Drug_Cst',
       'avg_Tot_Drug_Cst', 'max_Total_claims', 'sum_Total_claims',
       'avg_Total_claims', 'max_Tot_Day_Suply',
       ...
       'Quetiapine Fumarate_Tot_Day_Suply',
       'Lisinopril/Hydrochlorothiazide_Tot_Clms',
       'Lisinopril/Hydrochlorothiazide_Tot_Day_Suply', 'Diazepam_Tot_Clms',
       'Diazepam_Tot_Day_Suply', 'Pen Needle, Diabetic_Tot_Drug_Cst',
       'Pen Needle, Diabetic_Tot_Day_Suply', 'Tramadol Hcl_Tot_Drug_Cst',
       'Tramadol Hcl_Tot_Clms', 'Tramadol Hcl_Tot_Day_Suply'],
      dtype='object', length=122)

In [57]:
FeaturesAll_pd1.to_csv("../data/hyp_0.1_data.csv")

In [None]:
df_ml = pd.read_csv("../data/hyp_final_data.csv")
df_ml.drop("Unnamed: 0",axis =1,inplace= True)
df_ml.round(2)

### Missing values treatment

In [None]:
df_ml.isna().sum()

In [None]:
print("Total values in Payments : ",df_ml.index.max())
print("Missing values in Payments : ",df_ml["Total_payments"].isna().sum())
print("percentage of missing values",df_ml["Total_payments"].isna().sum()/df_ml.index.max()*100,"%")

#Over 60% of payments  data is missing in total dataframe

In [None]:
print("Total values in Fraud Payments : ",df_ml[df_ml.FRAUD == 1].count().head(1))
print("Missing values in Payments : ",df_ml[df_ml["FRAUD"] == 1]["Total_payments"].isna().sum()) 
print("percentage of missing values",df_ml[df_ml["FRAUD"] == 1]["Total_payments"].isna().sum()/df_ml[df_ml.FRAUD == 1].count().head(1)*100)

#Over 50% of payments data is missing for fraud =1

In [None]:
#Dropping payments attribute as more than 50% data is missing.

#df_ml = test.drop("Total_payments",axis =1)
#df_ml = test.copy()

In [None]:
#Only one missing value in Speciality

df_ml.isnull().sum().to_dict()

In [None]:
#FFilled missing value with most occuring speciality

df_ml["Speciality"].fillna(df_ml.Speciality.mode,inplace = True)
df_ml["Speciality"].isnull().sum()

### Encoding

In [None]:
print("Unique speciality throughout dataset:" )
print(df_ml["Speciality"].value_counts().count())

#speciality_cat = pd.DataFrame.from_dict(df_ml["Speciality"].value_counts().to_dict(),orient='index',columns=["count"])
#print(df_ml[df_ml["FRAUD"] == 1]["Speciality"].value_counts().to_dict())

#df_ml["Speciality"].value_counts().sort_values(ascending=False).head(60)

In [None]:
df_ml["Speciality"].value_counts().to_dict()

#req_cat = ['nurse practitioner','internal medicine','family practice','dentist','general practice','physician assistant','emergency medicine',
          #'psychiatry']

In [None]:
# make list with top 30 variables
top_30 = [x for x in df_ml.Speciality.value_counts().sort_values(ascending=False).head(30).index]
#top_30.append('anesthesiology')
top_30

In [None]:
# for all categorical variables we selected
def top_x(df2,variable,top_x_labels):
    for label in top_x_labels:
        df2[variable+'_'+label] = np.where(df_ml[variable]==label,1,0)

In [None]:
#encode Nighborhood into the 10 most frequent categories
top_x(df_ml,'Speciality',top_30)
# display data
df_ml.tail()

In [None]:
df_ml["Speciality_nurse practitioner"].value_counts()

In [None]:
df_ml["fraud"] = df_ml["FRAUD"]
df_ml.drop("FRAUD",axis = 1 ,inplace = True)

In [None]:
#Total columns 226 , fraud column at the end

df_ml.columns

In [None]:
df_ml.drop(["Speciality","Total_payments"],axis =1,inplace=True)

### Model building

### Splitting dataframe : stratified

In [None]:
ss = StratifiedShuffleSplit(n_splits=1,
                            test_size=0.2,
                            train_size=0.8,
                            random_state=42)

In [None]:
X = df_ml.drop('fraud', axis=1)
y = df_ml['fraud']

In [None]:
for train_index, test_index in ss.split(X, y):
    train_df = df_ml.iloc[train_index]
    test_df = df_ml.iloc[test_index]