Import all Spark-related libraries and settings here

In [1]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Reuse the running SparkContext when this cell is executed again: constructing
# a second SparkContext in the same kernel raises an error, which made the cell
# fail on any re-run. getOrCreate() only builds a new context (named "Pi") when
# none exists yet.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Pi"))
sqlcontext = SQLContext(sc)
# Use 10 partitions for Spark SQL shuffles (joins and aggregations).
sqlcontext.sql("set spark.sql.shuffle.partitions=10")
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import *
from pyspark.ml.feature import *
from pyspark.sql import Row
from pyspark.sql.window import Window

In [2]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import _infer_schema

In [3]:
# Obtain the SparkSession for this notebook; getOrCreate() hands back the
# session that already exists, if any, instead of building a second one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Pi")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

Import all other Python libraries here

In [4]:
import numpy as np
import pickle as pk
import csv
import json

Begin Code

In [5]:
# Load the xref dataset from Parquet into a DataFrame.
# NOTE(review): the path '_with_address' looks like a truncated/redacted name
# (compare 'npi_with_address1' used for the NPI data) - confirm the real path.
xref = sqlcontext.read.parquet('_with_address')

In [1]:
# Display the column names of the xref DataFrame.
xref.columns

In [None]:
# Load the NPI dataset from the Parquet path 'npi_with_address1' into a DataFrame.
npiDB = sqlcontext.read.parquet('npi_with_address1')

In [None]:
# Display the column names of the NPI DataFrame.
npiDB.columns

In [None]:
# Keep only the NPI rows whose business ZIP is 33125, ordered by the business
# address string.
npiFiltered = (
    npiDB
    .select('*')
    .where("business_zip==33125")
    .orderBy('business_address_string')
)

In [None]:
# Keep only the xref rows whose business ZIP is 33125, ordered by the business
# address string (same filter as applied to the NPI data).
xrefFiltered = (
    xref
    .select('*')
    .where("business_zip==33125")
    .orderBy('business_address_string')
)

In [None]:
# Remove every space character from both address-string columns so that
# addresses differing only in spacing compare equal.
for address_col in ('business_address_string', 'practice_address_string'):
    npiFiltered = npiFiltered.withColumn(
        address_col, F.regexp_replace(address_col, ' ', '')
    )

In [None]:
# Fields of each NPI record that are bundled into one struct per provider.
provider_fields = [
    'npi', 'provider_last_name_legal_name', 'provider_first_name',
    'business_address1', 'business_address2', 'business_city',
    'business_state', 'business_zip',
    'practice_address1', 'practice_address2',
    'practice_city', 'practice_state', 'practice_zip',
]

# One row per (zip, address) business location; the 'struct' column holds the
# set of distinct provider structs found at that location.
addresses = (
    npiFiltered
    .groupby(
        F.col('business_zip').alias('zip'),
        F.col('business_address_string').alias('address'),
    )
    .agg(F.collect_set(F.struct(*provider_fields)).alias('struct'))
)


#.select(struct('npi','provider_last_name_legal_name','provider_first_name',\
#              'business_address1','business_address2','business_city',\
#               'business_state','business_zip','practice_address1','practice_address2',\
#              'practice_city','practice_state','practice_zip'))

The cell above groups the NPI records by business ZIP and business address, and collects the relevant NPI fields for each address into a set of structs.

The schema is shown below

In [None]:
# Print the schema of the grouped addresses DataFrame.
addresses.printSchema()

The code below may be ignored for now.

In [None]:
# For every (zip, address) pair in the filtered xref data, collect the set of
# distinct rem_act_id values into the 'r_id' column.
# NOTE(review): the filter on xref uses 'business_zip' while this grouping uses
# a 'zip' column - confirm that xref really has both.
xref_group = (
    xrefFiltered
    .groupby(
        F.col('zip').alias('zip'),
        F.col('business_address_string').alias('address'),
    )
    .agg(F.collect_set('rem_act_id').alias('r_id'))
)

In [None]:
# Full outer join: keep every (zip, address) that appears on either side.
# NOTE(review): spaces are stripped from the NPI address strings before
# grouping, but no such step is visible for the xref addresses - confirm the
# two 'address' keys are normalised the same way, otherwise few rows will match.
addresses = addresses.join(
    xref_group,
    on=["zip", "address"],
    how='full',
)

In [None]:
#addresses.show(truncate=False)

# Show the customer name(s) recorded for a single rem_act_id.
# NOTE(review): `xxxx` looks like a placeholder for a real id - as written it
# is parsed as a column name; substitute the actual value before running.
cust_lookup = xref.select('cust_name').where('rem_act_id==xxxx')
cust_lookup.show(truncate=False)

In [None]:
# For every (zip, address) practice location, collect the set of distinct NPIs
# practising there into the 'practice_npis' column.
npi_practice = (
    npiFiltered
    .groupby(
        F.col('practice_zip').alias('zip'),
        F.col('practice_address_string').alias('address'),
    )
    .agg(F.collect_set('npi').alias('practice_npis'))
)

In [None]:
# Preview the first 5 practice-location groups.
npi_practice.show(5)

In [None]:
# Full outer join with the practice-location groups so that addresses used
# only as a practice location are kept as well.
addresses = addresses.join(
    npi_practice,
    on=["zip", "address"],
    how='full',
)

In [None]:
# Preview the first 5 rows of the joined addresses DataFrame.
addresses.show(5)

In [None]:
def _combine_npis(business_npis, practice_npis):
    """Merge the business and practice NPI lists of one address row.

    If either list is missing (None), return `business_npis or practice_npis`,
    i.e. the one that is present; otherwise return the business list followed
    by the practice list (duplicates are kept).
    """
    if business_npis is None or practice_npis is None:
        return business_npis or practice_npis
    return business_npis + practice_npis


# Compute All_npis directly as a column of `addresses`.
# The previous version built a second DataFrame through the RDD API, tagged
# both frames with monotonically_increasing_id() and joined them on that id.
# Those ids are only guaranteed to be unique and increasing within one
# DataFrame, not to line up between two separately computed ones, so the join
# could pair a row with another row's NPIs or silently drop rows.
# The UDF returns the same array type as the existing 'practice_npis' column.
# NOTE(review): no cell shown above creates a 'business_npis' column (the
# business-side aggregation is aliased 'struct') - confirm where it comes from.
combine_npis = F.udf(_combine_npis, addresses.schema['practice_npis'].dataType)
addresses = addresses\
    .withColumn('All_npis', combine_npis('business_npis', 'practice_npis'))\
    .drop('business_npis', 'practice_npis')

In [None]:
# Sort by address, then show only the rows that have xref ids attached.
# The xref aggregation aliases its collected rem_act_id set as 'r_id'; the
# previous column name 'remedy_id' is not produced by any cell shown, so the
# filter failed with an unresolved-column error.
addresses = addresses.orderBy('address')
addresses.where(F.col('r_id').isNotNull()).show()

In [None]:
#dx = addresses.select('r_id').head(20)[0].r_id
#print(dx)

# Flatten the per-address id sets into one id per row and show the first 20.
# Uses 'r_id', the alias given to the collected rem_act_id set in the xref
# aggregation; 'remedy_id' is not created by any cell shown.
addresses.select(F.explode('r_id')).show(20)


In [None]:
# Distinct NPIs per business address string.
# NOTE(review): `BsPracAddr` is not assigned in any cell shown above - on a
# fresh kernel this raises NameError; confirm which DataFrame it should be.
baddr = (
    BsPracAddr
    .groupby('business_address_string')
    .agg(F.collect_set('npi').alias("npis"))
)

In [None]:
# Preview the NPIs grouped by business address.
baddr.show()

In [None]:
# Distinct NPIs per practice address string.
# NOTE(review): `BsPracAddr` is not assigned in any cell shown above - confirm
# which DataFrame it should be.
paddr = (
    BsPracAddr
    .groupby('practice_address_string')
    .agg(F.collect_set('npi').alias("npis2"))
)

In [None]:
# Tag every row of both frames with a generated id in column 'No'.
# The ids are unique and increasing within a frame but not consecutive.
# NOTE(review): nothing ties the id of a business-address row to the id of the
# practice-address row for the same address - confirm this pairing is intended.
baddr, paddr = (
    frame.withColumn('No', F.monotonically_increasing_id())
    for frame in (baddr, paddr)
)

In [None]:
# Attach the business-side NPI sets to the practice-side rows that carry the
# same generated 'No' value (inner join; unmatched rows are dropped).
baddr_npis = baddr["No","npis"]
paddr = paddr.join(baddr_npis, on=["No"])

In [None]:
# Preview the joined practice/business NPI sets.
paddr.show()

In [None]:
def _concat_npis(row):
    """Return a 1-tuple holding row.npis2 followed by row.npis (duplicates kept)."""
    return (row.npis2 + row.npis,)


# Build a single-column DataFrame ('combin') with the concatenated NPI lists.
paddr1 = (
    paddr.select('npis2', 'npis')
    .rdd.map(_concat_npis)
    .toDF(['combin'])
)

In [None]:
# Show paddr without truncating long cell values.
# NOTE(review): the combined frame `paddr1` built just before is never
# displayed - confirm whether this was meant to be paddr1.show(...).
paddr.show(truncate=False)