## Spark


### Import libraries

In [90]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col
import time


### Initialize Spark

In [91]:
# Obtain the SparkSession for this notebook under the app name
# "DecisionTreeExample" (getOrCreate hands back an existing session if any).
spark = (
    SparkSession.builder
    .appName("DecisionTreeExample")
    .getOrCreate()
)

In [92]:
# Echo the session object so the notebook renders it as this cell's output.
spark

### Pre-processing data

In [93]:
# Read the cirrhosis CSV, taking column names from the header row.
# No schema is inferred, so every column is loaded as a string (the preview
# below shows quoted values); numeric columns are cast in a later cell.
# The path is a raw string: the previous plain literal only worked because
# "\c" is not a recognised escape sequence, which Python now warns about.
data = spark.read.format("csv").option("header", "true").load(r"C:\compare\compare\cirrhosis.csv")
# Preview the first five rows.
data.head(5)

[Row(ID='1', N_Days='400', Status='D', Drug='D-penicillamine', Age='21464', Sex='F', Ascites='Y', Hepatomegaly='Y', Spiders='Y', Edema='Y', Bilirubin='14.5', Cholesterol='261', Albumin='2.6', Copper='156', Alk_Phos='1718', SGOT='137.95', Tryglicerides='172', Platelets='190', Prothrombin='12.2', Stage='4'),
 Row(ID='2', N_Days='4500', Status='C', Drug='D-penicillamine', Age='20617', Sex='F', Ascites='N', Hepatomegaly='Y', Spiders='Y', Edema='N', Bilirubin='1.1', Cholesterol='302', Albumin='4.14', Copper='54', Alk_Phos='7394.8', SGOT='113.52', Tryglicerides='88', Platelets='221', Prothrombin='10.6', Stage='3'),
 Row(ID='3', N_Days='1012', Status='D', Drug='D-penicillamine', Age='25594', Sex='M', Ascites='N', Hepatomegaly='N', Spiders='N', Edema='S', Bilirubin='1.4', Cholesterol='176', Albumin='3.48', Copper='210', Alk_Phos='516', SGOT='96.1', Tryglicerides='55', Platelets='151', Prothrombin='12', Stage='4'),
 Row(ID='4', N_Days='1925', Status='D', Drug='D-penicillamine', Age='19994', Sex

In [94]:
# Columns that were read as strings but hold numeric values.
columns_to_convert = [
    "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos",
    "SGOT", "Tryglicerides", "Platelets", "Prothrombin", "Stage",
]

# Replace each listed column, one at a time, with its double-typed version.
for numeric_name in columns_to_convert:
    as_double = col(numeric_name).cast(DoubleType())
    data = data.withColumn(numeric_name, as_double)

In [95]:
# Encode the Stage column as a label index column named StageIndex.
# handleInvalid="skip" is requested so rows the indexer cannot handle are
# left out instead of raising an error.
indexer = StringIndexer(
    inputCol="Stage",
    outputCol="StageIndex",
    handleInvalid="skip",
)
# Learn the Stage -> index mapping from the data, then apply it to the same data.
stage_indexer_model = indexer.fit(data)
indexed_data = stage_indexer_model.transform(data)

In [96]:
# Discard every row that has a NULL in any column ("any" is the default mode).
indexed_data = indexed_data.na.drop(how="any")

In [97]:
# Numeric predictors to pack into the model's input vector.
# Stage is left out here because its indexed form is the label.
feature_columns = [
    "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper",
    "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin",
]

# Combine the predictor columns into a single vector column called "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
output = assembler.transform(indexed_data)

In [98]:
# Start the wall-clock timer; the elapsed time is printed after the metrics
# in the evaluation cell.
start_time = time.time()

### Split train/test data

In [99]:
# Memilih kolom yang diperlukan untuk melatih model
train_data = output.select("features", "StageIndex")

### train model

In [100]:
# Membuat model Decision Tree
dt = DecisionTreeClassifier(labelCol="StageIndex", featuresCol="features")
model = dt.fit(train_data)

### evaluate model

In [101]:
# Membuat prediksi menggunakan data validasi
predictions = model.transform(output)

In [102]:
# One evaluator is shared by all metrics; only its metric name changes.
evaluator = MulticlassClassificationEvaluator(labelCol="StageIndex", predictionCol="prediction")


def _score_predictions(metric_name):
    """Return the value of `metric_name` computed over `predictions`."""
    return evaluator.setMetricName(metric_name).evaluate(predictions)


# Accuracy
accuracy = _score_predictions("accuracy")
print("Accuracy:", accuracy)

# Precision (weighted across classes)
precision = _score_predictions("weightedPrecision")
print("Precision:", precision)

# Recall (weighted across classes)
recall = _score_predictions("weightedRecall")
print("Recall:", recall)

# F1-score
f1_score = _score_predictions("f1")
print("F1-score:", f1_score)

# Stop the timer started before the training cells and report the elapsed
# seconds (the printed message is in Indonesian: "computation time ... seconds").
end_time = time.time()
elapsed_seconds = end_time - start_time

print("Waktu komputasi: {:.2f} detik".format(elapsed_seconds))

Accuracy: 0.7137681159420289
Precision: 0.7260032099087304
Recall: 0.713768115942029
F1-score: 0.7027288381633572
Waktu komputasi: 0.67 detik
