In [0]:
# Configure ABFS access to the storage account with a fixed SAS token.
# The token is a credential, so it is fetched from a Databricks secret scope at
# run time instead of being embedded in the notebook source.
# NOTE(review): the scope/key names below are placeholders -- point them at the
# secret that stores the container SAS token, and revoke the token that was
# previously committed in this cell.
storage_host = "frauddetectiondata.dfs.core.windows.net"
sas_token = dbutils.secrets.get(scope="fraud-detection", key="frauddetectiondata-sas-token")

spark.conf.set(f"fs.azure.account.auth.type.{storage_host}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{storage_host}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{storage_host}", sas_token)

In [0]:
# List the contents of the Spark output directory that holds the transformed data.
transformed_dir = "abfss://transformed@frauddetectiondata.dfs.core.windows.net/final_data.csv"
dir_listing = dbutils.fs.ls(transformed_dir)
display(dir_listing)

path,name,size,modificationTime
abfss://transformed@frauddetectiondata.dfs.core.windows.net/final_data.csv/_committed_2907636837832767909,_committed_2907636837832767909,197,1702685658000
abfss://transformed@frauddetectiondata.dfs.core.windows.net/final_data.csv/_committed_99951080079569082,_committed_99951080079569082,208,1702681206000
abfss://transformed@frauddetectiondata.dfs.core.windows.net/final_data.csv/_started_2907636837832767909,_started_2907636837832767909,0,1702685654000
abfss://transformed@frauddetectiondata.dfs.core.windows.net/final_data.csv/part-00000-tid-2907636837832767909-51efab35-5fbf-4c1a-a7bb-17ffcb78f1e2-94-1-c000.csv,part-00000-tid-2907636837832767909-51efab35-5fbf-4c1a-a7bb-17ffcb78f1e2-94-1-c000.csv,62984953,1702685657000


In [0]:
# Load the transformed training data with a header row and inferred column types.
# `final_data.csv` is a directory written by Spark. Reading the directory instead
# of one specific `part-...-tid-...` file keeps this cell working when the upstream
# write is re-run (the part-file name embeds the id of the write that produced it)
# and picks up every part file if the output ever has more than one.
# Spark ignores the `_committed_*` / `_started_*` marker files when reading.
df_training_final = spark.read.csv(
    path="abfss://transformed@frauddetectiondata.dfs.core.windows.net/final_data.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Spark ML building blocks used by the modelling cells of this notebook.
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession

# Obtain the Spark session (getOrCreate hands back the active one if it exists).
spark = SparkSession.builder.appName("Model_building").getOrCreate()

In [0]:
df_training_final.printSchema()

root
 |-- BeneID: string (nullable = true)
 |-- Provider: string (nullable = true)
 |-- InscClaimAmtReimbursed: integer (nullable = true)
 |-- DeductibleAmtPaid: integer (nullable = true)
 |-- Admitted: integer (nullable = true)
 |-- ClaimStartDt: date (nullable = true)
 |-- ClaimEndDt: date (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Race: integer (nullable = true)
 |-- RenalDiseaseIndicator: integer (nullable = true)
 |-- State: integer (nullable = true)
 |-- NoOfMonths_PartACov: integer (nullable = true)
 |-- NoOfMonths_PartBCov: integer (nullable = true)
 |-- ChronicCond_Alzheimer: integer (nullable = true)
 |-- ChronicCond_Heartfailure: integer (nullable = true)
 |-- ChronicCond_KidneyDisease: integer (nullable = true)
 |-- ChronicCond_Cancer: integer (nullable = true)
 |-- ChronicCond_ObstrPulmonary: integer (nullable = true)
 |-- ChronicCond_Depression: integer (nullable = true)
 |-- ChronicCond_Diabetes: integer (nullable = true)
 |-- ChronicCond_IschemicHeart

In [0]:
df_training_final.select('DeductibleAmtPaid').distinct().show()

+-----------------+
|DeductibleAmtPaid|
+-----------------+
|              876|
|             1068|
|               20|
|               40|
|              100|
|               10|
|               50|
|               80|
|               70|
|               60|
|               90|
|              200|
|              865|
|               30|
|                0|
|              886|
|              897|
+-----------------+



In [0]:
# Columns kept for modelling, in the order they are selected. 'PotentialFraud'
# is the label; every other entry is used as a model input further down.
finalColumns = [
    'InscClaimAmtReimbursed',
    'DeductibleAmtPaid',
    'Admitted',
    'Gender',
    'Race',
    'RenalDiseaseIndicator',
    'State',
    'NoOfMonths_PartACov',
    'NoOfMonths_PartBCov',
    'ChronicCond_Alzheimer',
    'ChronicCond_Heartfailure',
    'ChronicCond_KidneyDisease',
    'ChronicCond_Cancer',
    'ChronicCond_ObstrPulmonary',
    'ChronicCond_Depression',
    'ChronicCond_Diabetes',
    'ChronicCond_IschemicHeart',
    'ChronicCond_Osteoporasis',
    'ChronicCond_rheumatoidarthritis',
    'ChronicCond_stroke',
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
    'PotentialFraud',
    'DaystoSettleClaim',
    'DaysHospitalized',
    'NoOfProceduresDone',
    'NoOfPhyciansAttended',
    'NoOfDiagnosisDone',
]

# Narrow the frame to exactly those columns (drops ids, dates and anything else).
df_training_final = df_training_final.select(finalColumns)



In [0]:
# Feature pipeline: assemble every selected column except the label into one
# vector column, then standardise it.
feature_columns = [name for name in finalColumns if name != 'PotentialFraud']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
pipeline = Pipeline(stages=[assembler, scaler])

# Split BEFORE fitting so the scaler's statistics come from the training rows
# only; fitting on the full frame leaks test-set information into the features.
train_raw, test_raw = df_training_final.randomSplit([0.8, 0.2], seed=42)
pipeline_model = pipeline.fit(train_raw)
train_data = pipeline_model.transform(train_raw)
test_data = pipeline_model.transform(test_raw)

# Kept so any later cell that reads the fully transformed frame still works;
# the transform is lazy, so this costs nothing unless it is actually used.
df_transformed = pipeline_model.transform(df_training_final)

In [0]:
# Gradient-boosted trees on the scaled feature vector: 200 boosting iterations,
# maximum tree depth 5, trained on the training split.
gbt = GBTClassifier(
    featuresCol="scaledFeatures",
    labelCol="PotentialFraud",
    maxDepth=5,
    maxIter=200,
)
gbt_model = gbt.fit(train_data)

In [0]:
# Score the test split with the fitted model; the result is what the evaluator
# reads its `prediction` column from.
predictions = gbt_model.transform(dataset=test_data)

In [0]:
# Evaluate on the test split. Accuracy alone can look acceptable on a skewed
# label distribution while the model still misses many positive cases, so F1 and
# the weighted precision/recall are reported next to it.
evaluator = MulticlassClassificationEvaluator(labelCol="PotentialFraud", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Reuse the same evaluator: the param map overrides metricName for one call only,
# so `evaluator` itself stays configured for accuracy.
for metric_name in ("f1", "weightedPrecision", "weightedRecall"):
    metric_value = evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    print(f"{metric_name}: {metric_value}")

Accuracy: 0.6601236337573911
