In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

#import functions
from pyspark.sql import functions

from pyspark.sql.types import StructField, StringType, IntegerType, StructType, DoubleType, LongType,FloatType

import pickle

from pyspark.sql.functions import col,isnan, when, count

In [2]:
spark = SparkSession.builder.appName("Datamerge").getOrCreate()

## Loading data

### provider_drug data

In [3]:
data_schema = [StructField("NPI", IntegerType(),True),
               StructField("First_name_drug", StringType(),True),
               StructField("City", StringType(),True),
               StructField("State", StringType(),True),
               StructField("Speciality", StringType(),True),
               StructField("max_Tot_Drug_Cst", FloatType(),True),
               StructField("sum_Tot_Drug_Cst", FloatType(),True),
               StructField("avg_Tot_Drug_Cst", FloatType(),True),
               StructField("max_Total_claims", IntegerType(),True),
               StructField("sum_Total_claims", LongType(),True),
               StructField("avg_Total_claims", DoubleType(),True),
               StructField("max_Tot_Day_Suply", IntegerType(),True),
               StructField("sum_Tot_Day_Suply", LongType(),True),
               StructField("avg_Tot_Day_Suply", FloatType(),True),
               StructField("max_Tot_30day_Fills", FloatType(),True),
               StructField("sum_Tot_30day_Fills", FloatType(),True),
               StructField("avg_Tot_30day_Fills", FloatType(),True)
              ]

In [4]:
final_schema = StructType(fields=data_schema)

In [5]:
df_drug = spark.read.csv("../data/spark_csv/drug_data_final.csv",schema=final_schema)

In [6]:
df_drug.printSchema()

root
 |-- NPI: integer (nullable = true)
 |-- First_name_drug: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Speciality: string (nullable = true)
 |-- max_Tot_Drug_Cst: float (nullable = true)
 |-- sum_Tot_Drug_Cst: float (nullable = true)
 |-- avg_Tot_Drug_Cst: float (nullable = true)
 |-- max_Total_claims: integer (nullable = true)
 |-- sum_Total_claims: long (nullable = true)
 |-- avg_Total_claims: double (nullable = true)
 |-- max_Tot_Day_Suply: integer (nullable = true)
 |-- sum_Tot_Day_Suply: long (nullable = true)
 |-- avg_Tot_Day_Suply: float (nullable = true)
 |-- max_Tot_30day_Fills: float (nullable = true)
 |-- sum_Tot_30day_Fills: float (nullable = true)
 |-- avg_Tot_30day_Fills: float (nullable = true)



### Payment data

In [7]:
# pay_schema = [StructField("NPI", IntegerType(),True),
#                StructField("First_name", StringType(),True),
#                StructField("Last_name", StringType(),True),
#                StructField("Total_payments", FloatType(),True),
#               ]

In [8]:
# final_pay_schema = StructType(fields=pay_schema)

In [9]:
# df_pay = spark.read.csv("../data/spark_csv/payment_npi_name.csv",schema=final_pay_schema)

In [None]:
# pay_schema1 = [StructField("NPI", IntegerType(),True),
#                StructField("Total_payments", FloatType(),True)
#               ]

In [None]:
# final_pay_schema1 = StructType(fields=pay_schema1)

In [None]:
df_pay = spark.read.csv("../data/spark_csv/payment_npi.csv",schema=final_pay_schema1)

In [None]:
df_pay.printSchema()

In [None]:
df_pay.show()

### Joining data drug and payment 

### With spark

In [12]:
#pay_data_cols = df_pay.select("NPI","Total_payments")

In [11]:
pay_data_cols = df_pay.withColumnRenamed("NPI","NPI_1")

In [12]:
df_drug_pay = df_drug.join(pay_data_cols,df_drug.NPI == pay_data_cols.NPI_1,"left")

In [13]:
df_drug_pay.printSchema()

root
 |-- NPI: integer (nullable = true)
 |-- First_name_drug: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Speciality: string (nullable = true)
 |-- max_Tot_Drug_Cst: float (nullable = true)
 |-- sum_Tot_Drug_Cst: float (nullable = true)
 |-- avg_Tot_Drug_Cst: float (nullable = true)
 |-- max_Total_claims: integer (nullable = true)
 |-- sum_Total_claims: long (nullable = true)
 |-- avg_Total_claims: double (nullable = true)
 |-- max_Tot_Day_Suply: integer (nullable = true)
 |-- sum_Tot_Day_Suply: long (nullable = true)
 |-- avg_Tot_Day_Suply: float (nullable = true)
 |-- max_Tot_30day_Fills: float (nullable = true)
 |-- sum_Tot_30day_Fills: float (nullable = true)
 |-- avg_Tot_30day_Fills: float (nullable = true)
 |-- NPI_1: integer (nullable = true)
 |-- First_name: string (nullable = true)
 |-- Last_name: string (nullable = true)
 |-- Total_payments: float (nullable = true)



In [20]:
df_drug_pay.show()

+----------+---------------+--------------+-----+--------------------+----------------+----------------+----------------+----------------+----------------+------------------+-----------------+-----------------+-----------------+-------------------+-------------------+-------------------+----------+----------+------------+--------------+
|       NPI|First_name_drug|          City|State|          Speciality|max_Tot_Drug_Cst|sum_Tot_Drug_Cst|avg_Tot_Drug_Cst|max_Total_claims|sum_Total_claims|  avg_Total_claims|max_Tot_Day_Suply|sum_Tot_Day_Suply|avg_Tot_Day_Suply|max_Tot_30day_Fills|sum_Tot_30day_Fills|avg_Tot_30day_Fills|     NPI_1|First_name|   Last_name|Total_payments|
+----------+---------------+--------------+-----+--------------------+----------------+----------------+----------------+----------------+----------------+------------------+-----------------+-----------------+-----------------+-------------------+-------------------+-------------------+----------+----------+------------

In [16]:
#drug data and payment data total rows

df_drug_pay.count()

959546

In [17]:
#Payment data rows

pay_data_cols.count()

487110

In [18]:
#954546

df_drug.count()

959546

In [44]:
#Dropping duplicate column

df_drug_pay = df_drug_pay.drop("NPI_1")

In [45]:
df_drug_pay = df_drug_pay.drop("Name")

In [None]:
#df_drug_pay = df_drug_pay.drop("First_name")

In [14]:
#Checking missing values in total dataframe

df_drug_pay.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_drug_pay.columns]
   ).show()

+---+---------------+----+-----+----------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+-----------------+-------------------+-------------------+-------------------+------+----------+---------+--------------+
|NPI|First_name_drug|City|State|Speciality|max_Tot_Drug_Cst|sum_Tot_Drug_Cst|avg_Tot_Drug_Cst|max_Total_claims|sum_Total_claims|avg_Total_claims|max_Tot_Day_Suply|sum_Tot_Day_Suply|avg_Tot_Day_Suply|max_Tot_30day_Fills|sum_Tot_30day_Fills|avg_Tot_30day_Fills| NPI_1|First_name|Last_name|Total_payments|
+---+---------------+----+-----+----------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+-----------------+-------------------+-------------------+-------------------+------+----------+---------+--------------+
|  0|             40|   0|    0|         1|               0|               0|              

### Converting main data to pandas

In [15]:
df_drug_panda = df_drug_pay.toPandas()

In [16]:
df_drug_panda.isnull().sum()

NPI                         0
First_name_drug            13
City                        0
State                       0
Speciality                  1
max_Tot_Drug_Cst            0
sum_Tot_Drug_Cst            0
avg_Tot_Drug_Cst            0
max_Total_claims            0
sum_Total_claims            0
avg_Total_claims            0
max_Tot_Day_Suply           0
sum_Tot_Day_Suply           0
avg_Tot_Day_Suply           0
max_Tot_30day_Fills         0
sum_Tot_30day_Fills         0
avg_Tot_30day_Fills         0
NPI_1                  624631
First_name             624631
Last_name              624631
Total_payments         624633
dtype: int64

In [17]:
df_drug_panda["City"] =  df_drug_panda["City"].str.lower()

In [18]:
df_drug_panda["State"] =  df_drug_panda["State"].str.lower()

In [19]:
df_drug_panda.head()

Unnamed: 0,NPI,First_name_drug,City,State,Speciality,max_Tot_Drug_Cst,sum_Tot_Drug_Cst,avg_Tot_Drug_Cst,max_Total_claims,sum_Total_claims,...,max_Tot_Day_Suply,sum_Tot_Day_Suply,avg_Tot_Day_Suply,max_Tot_30day_Fills,sum_Tot_30day_Fills,avg_Tot_30day_Fills,NPI_1,First_name,Last_name,Total_payments
0,1003000126,ardalan,bethesda,md,Internal Medicine,4792.850098,5979.069824,747.383728,30,124,...,937,3721,465.125,32.400002,138.199997,17.275,1003000000.0,ardalan,enkeshafi,20.48
1,1003000423,jennifer,cleveland,oh,Obstetrics & Gynecology,8276.44043,15389.69043,2564.948242,39,122,...,2412,5217,869.5,82.0,213.699997,35.616665,1003000000.0,jennifer,velotta,171.289993
2,1003000720,otniel,clemmons,nc,Nurse Practitioner,657.01001,3052.840088,254.403336,51,230,...,337,2166,180.5,51.0,230.0,19.166666,,,,
3,1003001785,jaclyn,tulsa,ok,Orthopedic Surgery,2141.23999,7812.279785,781.228027,126,546,...,2610,8250,825.0,126.0,569.0,56.900002,1003002000.0,jaclyn,jones,969.070007
4,1003001884,lucille,flint,mi,Family Practice,1324.26001,2449.090088,349.869995,31,118,...,1560,4979,711.285706,53.0,199.0,28.428572,,,,


In [21]:
df_drug_panda.drop(["First_name_drug","NPI_1"],axis=1,inplace=True)

In [22]:
df_drug_panda.head(1)

Unnamed: 0,NPI,City,State,Speciality,max_Tot_Drug_Cst,sum_Tot_Drug_Cst,avg_Tot_Drug_Cst,max_Total_claims,sum_Total_claims,avg_Total_claims,max_Tot_Day_Suply,sum_Tot_Day_Suply,avg_Tot_Day_Suply,max_Tot_30day_Fills,sum_Tot_30day_Fills,avg_Tot_30day_Fills,First_name,Last_name,Total_payments
0,1003000126,bethesda,md,Internal Medicine,4792.850098,5979.069824,747.383728,30,124,15.5,937,3721,465.125,32.400002,138.199997,17.275,ardalan,enkeshafi,20.48


In [None]:
#df_drug_panda.drop(["Total_payments","Total_pay"],axis=1,inplace=True)

### Saving dataframe

In [24]:
# drug pay data with first ,last and payment

df_drug_panda.to_csv("../data/spark_csv/drug_pay_final.csv")