In [3]:
# 1. Import Libraries & Start Spark Session
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Start Spark: getOrCreate() returns the already-running session when there is
# one, so re-running this cell does not spin up a second session.
spark = (
    SparkSession.builder
    .appName("Homework-MLlib-Classification")
    .getOrCreate()
)

In [10]:
# 2. Load Dataset
# Location of the iris CSV. The default is the copy that ships inside the
# scikit-learn package of the original author's conda environment; on any other
# machine set the IRIS_CSV_PATH environment variable instead of editing this cell.
IRIS_CSV_PATH = os.environ.get(
    "IRIS_CSV_PATH",
    "file:///home/ilyasa/anaconda3/envs/Ilyasa/lib/python3.13/site-packages/sklearn/datasets/data/iris.csv",
)

# header=True makes Spark take the first line of the file as column names.
# In this file that line is not a descriptive header (it produces the columns
# "150", "4", "setosa", "versicolor", "virginica" seen in the output), so the
# columns have to be renamed before they are used.
# inferSchema=True lets Spark detect the numeric column types.
df = spark.read.csv(
    IRIS_CSV_PATH,
    header=True,
    inferSchema=True
)

# Quick sanity check of the raw load: first rows and inferred types.
df.show(5)
df.printSchema()

+---+---+------+----------+---------+
|150|  4|setosa|versicolor|virginica|
+---+---+------+----------+---------+
|5.1|3.5|   1.4|       0.2|        0|
|4.9|3.0|   1.4|       0.2|        0|
|4.7|3.2|   1.3|       0.2|        0|
|4.6|3.1|   1.5|       0.2|        0|
|5.0|3.6|   1.4|       0.2|        0|
+---+---+------+----------+---------+
only showing top 5 rows

root
 |-- 150: double (nullable = true)
 |-- 4: double (nullable = true)
 |-- setosa: double (nullable = true)
 |-- versicolor: double (nullable = true)
 |-- virginica: integer (nullable = true)



In [11]:
# Give the columns descriptive names.
# toDF() assigns names by position, so this list must follow the column order
# of the loaded file (four measurements, then the class code).
iris_columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "species",
]
df = df.toDF(*iris_columns)

df.show(5)
df.printSchema()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2|      0|
|         4.9|        3.0|         1.4|        0.2|      0|
|         4.7|        3.2|         1.3|        0.2|      0|
|         4.6|        3.1|         1.5|        0.2|      0|
|         5.0|        3.6|         1.4|        0.2|      0|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: integer (nullable = true)



In [12]:
# Convert the species code into the double-typed "label" column expected by MLlib.
# StringIndexer numbers categories by descending frequency unless told otherwise,
# which ties the species -> label mapping to the class counts in the data.
# stringOrderType="alphabetAsc" fixes the order by value instead, so for the
# single-digit species codes used here label k always corresponds to species k.
label_indexer = StringIndexer(
    inputCol="species",
    outputCol="label",
    stringOrderType="alphabetAsc",
)
df = label_indexer.fit(df).transform(df)

# Assemble the four numeric measurements into a single "features" vector column
assembler = VectorAssembler(
    inputCols=['sepal_length','sepal_width','petal_length','petal_width'],
    outputCol='features'
)
# Keep only the two columns the classifier needs.
df = assembler.transform(df).select("features", "label")

df.show(5, truncate=False)

+-----------------+-----+
|features         |label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|0.0  |
|[4.9,3.0,1.4,0.2]|0.0  |
|[4.7,3.2,1.3,0.2]|0.0  |
|[4.6,3.1,1.5,0.2]|0.0  |
|[5.0,3.6,1.4,0.2]|0.0  |
+-----------------+-----+
only showing top 5 rows



In [19]:
# Train-Test Split: roughly 80% of the rows for training and 20% for testing.
# The fixed seed keeps the split repeatable between runs.
split_weights = [0.8, 0.2]
train_data, test_data = df.randomSplit(split_weights, seed=42)

In [20]:
# Logistic Regression: fit a classifier on the training split using the
# assembled "features" vector and the numeric "label" column.
lr = LogisticRegression(labelCol='label', featuresCol='features')
model = lr.fit(train_data)

# Show the fitted parameters, each on the line after its heading.
print("Coefficient Matrix:", model.coefficientMatrix, sep="\n")
print("\nIntercept Vector:", model.interceptVector, sep="\n")

Coefficient Matrix:
DenseMatrix([[ -6.37538421,  32.85085741, -11.0779855 , -24.10633376],
             [  4.27600176, -13.03274831,   1.15984727,   3.80712874],
             [  2.09938245, -19.8181091 ,   9.91813823,  20.29920502]])

Intercept Vector:
[3.5534453166814455,17.04030323121303,-20.593748547894478]


In [22]:
# Model evaluation: score the held-out test split with the fitted model
predictions = model.transform(test_data)

# Preview the first predictions next to the true labels.
preview_columns = ["features", "label", "prediction"]
predictions.select(*preview_columns).show(10)

# Evaluator that compares "prediction" against "label" and reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

accuracy = evaluator.evaluate(predictions)
print(f"Akurasi model sebelum tuning: {accuracy:.4f}")

+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[4.4,3.0,1.3,0.2]|  0.0|       0.0|
|[4.6,3.2,1.4,0.2]|  0.0|       0.0|
|[4.6,3.6,1.0,0.2]|  0.0|       0.0|
|[4.8,3.1,1.6,0.2]|  0.0|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|       0.0|
|[5.0,2.3,3.3,1.0]|  1.0|       1.0|
|[5.0,3.5,1.3,0.3]|  0.0|       0.0|
|[5.1,3.5,1.4,0.2]|  0.0|       0.0|
|[5.3,3.7,1.5,0.2]|  0.0|       0.0|
|[5.4,3.0,4.5,1.5]|  1.0|       1.0|
+-----------------+-----+----------+
only showing top 10 rows

Akurasi model sebelum tuning: 1.0000


In [24]:
# Define the hyper-parameter grid: every combination of the regParam and
# maxIter values below is tried (3 x 3 = 9 candidates).
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 1.0])
             .addGrid(lr.maxIter, [10, 50, 100])
             .build())

# Define the cross-validator.
# The seed is fixed so the assignment of rows to folds -- and therefore the
# selected model -- is reproducible across runs, in line with the seeded
# train/test split.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5,
                    seed=42)

# Train with cross-validation (fits on the training split only)
cv_model = cv.fit(train_data)

# Evaluate the best model found by cross-validation on the held-out test split
best_predictions = cv_model.transform(test_data)
best_accuracy = evaluator.evaluate(best_predictions)

print(f"Model akurasi terbaik setelah tuning: {best_accuracy:.4f}")

Model akurasi terbaik setelah tuning: 1.0000
