In [1]:
import pandas as pd
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import Row
#from pyspark.sql.functions import *
from pyspark.ml.feature import *
import pickle
#import functions
from pyspark.sql import functions
import warnings
warnings.filterwarnings("ignore")

from pyspark.sql.functions import col,isnan, when, count,sequence

In [2]:
import findspark
findspark.init()
findspark.find()

'C:\\Spark\\spark3'

In [3]:
input_uri="mongodb://127.0.0.1:27017/"
output_uri="mongodb://127.0.0.1:27017/"

In [4]:
spark=SparkSession.builder.appName("payment_data").config("spark.mongodb.input.uri",input_uri) \
        .config("spark.mongodb.output.uri",output_uri) \
        .config("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:2.4.2") \
        .getOrCreate()

In [3]:
spark = SparkSession.builder.appName("Dataprocess").getOrCreate()

### Loading payments data

In [4]:
df_payment= spark.read.csv("../data/OP_DTL_GNRL_PGYR2020_P06302022.csv",inferSchema=True,header = True)


#PaymentRawData = "c:\\FIP\\OP_DTL_GNRL_PGYR2017_P01172020.csv"

In [5]:
df_payment.printSchema()

root
 |-- Change_Type: string (nullable = true)
 |-- Covered_Recipient_Type: string (nullable = true)
 |-- Teaching_Hospital_CCN: integer (nullable = true)
 |-- Teaching_Hospital_ID: integer (nullable = true)
 |-- Teaching_Hospital_Name: string (nullable = true)
 |-- Covered_Recipient_Profile_ID: integer (nullable = true)
 |-- Covered_Recipient_NPI: integer (nullable = true)
 |-- Covered_Recipient_First_Name: string (nullable = true)
 |-- Covered_Recipient_Middle_Name: string (nullable = true)
 |-- Covered_Recipient_Last_Name: string (nullable = true)
 |-- Covered_Recipient_Name_Suffix: string (nullable = true)
 |-- Recipient_Primary_Business_Street_Address_Line1: string (nullable = true)
 |-- Recipient_Primary_Business_Street_Address_Line2: string (nullable = true)
 |-- Recipient_City: string (nullable = true)
 |-- Recipient_State: string (nullable = true)
 |-- Recipient_Zip_Code: string (nullable = true)
 |-- Recipient_Country: string (nullable = true)
 |-- Recipient_Province: string

In [8]:
df_payment.show(5)

+-----------+----------------------+---------------------+--------------------+----------------------+----------------------------+---------------------+----------------------------+-----------------------------+---------------------------+-----------------------------+-----------------------------------------------+-----------------------------------------------+--------------+---------------+------------------+-----------------+------------------+---------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------------+-------------------------------------+-------------------------------------+---------------------------------

In [6]:
df_payment.createOrReplaceTempView("df_sql2")

In [7]:
df_pay_req_cols = spark.sql("SELECT Covered_Recipient_NPI,lower(Covered_Recipient_First_Name) FirstName,lower(Covered_Recipient_Last_Name) LastName,lower(Recipient_City) city,lower(Recipient_State) state,float(Total_Amount_of_Payment_USDollars) from df_sql2")

In [8]:
df_pay_req_cols.printSchema()

root
 |-- Covered_Recipient_NPI: integer (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- Total_Amount_of_Payment_USDollars: float (nullable = true)



In [8]:
#Name,city,state are not required

df_pay_req_cols.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_pay_req_cols.columns]
   ).show()

+---------------------+---------+--------+----+-----+---------------------------------+
|Covered_Recipient_NPI|FirstName|LastName|city|state|Total_Amount_of_Payment_USDollars|
+---------------------+---------+--------+----+-----+---------------------------------+
|                35838|    32637|   32542|   0|  242|                                3|
+---------------------+---------+--------+----+-----+---------------------------------+



In [14]:
#filtering dataframe and selecting rows which have NPI. 
#NPI is the only valid column to on which we can join drug data and payment data

df_not_null = df_pay_req_cols.filter(df_pay_req_cols.Covered_Recipient_NPI.isNotNull())

In [15]:
df_not_null.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_not_null.columns]
   ).show()

+---------------------+---------+--------+----+-----+---------------------------------+
|Covered_Recipient_NPI|FirstName|LastName|city|state|Total_Amount_of_Payment_USDollars|
+---------------------+---------+--------+----+-----+---------------------------------+
|                    0|      187|      92|   0|  239|                                3|
+---------------------+---------+--------+----+-----+---------------------------------+



In [18]:
df_not_null = df_not_null.drop("city","state")

In [19]:
df_not_null.printSchema()

root
 |-- Covered_Recipient_NPI: integer (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Total_Amount_of_Payment_USDollars: float (nullable = true)



In [22]:
#Grouping by NPI, name,city,state
#A another option
df_npi_name_agg = df_not_null.groupby("Covered_Recipient_NPI","FirstName","LastName").agg(functions.sum('Total_Amount_of_Payment_USDollars').astype("float"))

In [9]:
df_name_panda = df_name_agg.toPandas()

In [10]:
df_name_panda.isnull().sum()

Covered_Recipient_NPI                                    1814
Namee                                                     797
city                                                        0
state                                                      68
CAST(sum(Total_Amount_of_Payment_USDollars) AS FLOAT)       3
dtype: int64

In [11]:
df_name_panda.count()

Covered_Recipient_NPI                                    614761
Namee                                                    615778
city                                                     616575
state                                                    616507
CAST(sum(Total_Amount_of_Payment_USDollars) AS FLOAT)    616572
dtype: int64

In [37]:
#df_name_panda.drop(["Covered_Recipient_NPI"],axis=1,inplace = True)

In [39]:
#df_name_panda.dropna(inplace=True)


In [40]:
df_name_panda["Total_pay"] = df_name_panda["CAST(sum(Total_Amount_of_Payment_USDollars) AS FLOAT)"]

In [41]:
df_name_panda.drop(["CAST(sum(Total_Amount_of_Payment_USDollars) AS FLOAT)"],axis = 1, inplace=True)

In [42]:
df_name_panda.head()

Unnamed: 0,Namee,city,state,Total_pay
0,robert glover,buffalo,ny,4706.819824
1,john cattaneo,tulsa,ok,17.68
2,mark mostovych,jacksonville,fl,447.209991
3,connie taylor,minneapolis,mn,26.139999
4,satoshi furukawa,philadelphia,pa,89.139999


In [21]:
#Grouping with NPI

df_group = df_not_null.groupby("Covered_Recipient_NPI").agg(functions.sum('Total_Amount_of_Payment_USDollars').astype("float"))

In [22]:
df_group.printSchema()

root
 |-- Covered_Recipient_NPI: integer (nullable = true)
 |-- CAST(sum(Total_Amount_of_Payment_USDollars) AS FLOAT): float (nullable = true)



In [23]:
df_group.show()

+---------------------+-----------------------------------------------------+
|Covered_Recipient_NPI|CAST(sum(Total_Amount_of_Payment_USDollars) AS FLOAT)|
+---------------------+-----------------------------------------------------+
|           1174523807|                                                23.88|
|           1477545572|                                              6446.39|
|           1063472678|                                              3904.13|
|           1407880198|                                                65.85|
|           1538555099|                                               626.33|
|           1942389515|                                                12.21|
|           1649444266|                                              2218.95|
|           1760563407|                                              2512.32|
|           1194871780|                                                20.48|
|           1023188463|                                         

In [25]:
df_group = df_group.withColumnRenamed("CAST(sum(Total_Amount_of_Payment_USDollars) AS FLOAT)","Payments")

In [26]:
df_group.printSchema()

root
 |-- Covered_Recipient_NPI: integer (nullable = true)
 |-- Payments: float (nullable = true)



In [27]:
#Npi
#saving file with dataframe grouped on NPI
df_group.write.csv("../data/spark_csv/payment_npi.csv")

In [24]:
#NPI,name

# df_npi_name_agg.write.csv("../data/spark_csv/payment_npi_name.csv")