In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import PipelineModel



# Obtain the Spark session for this notebook (application name "IrisDataset").
session_builder = SparkSession.builder.appName("IrisDataset")
spark = session_builder.getOrCreate()

# Load the iris CSV: the first row is treated as the header and column
# types are inferred from the file contents.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv("/FileStore/tables/iris_2_.csv")


In [0]:
# Encode the string label column "species" as the numeric index column
# "species_numeric", which the classifiers below use as their label.
string_indexer = StringIndexer(inputCol="species", outputCol="species_numeric")

# This cell rebinds `data`, so on a re-run `data` already carries
# "species_numeric" and indexing would fail because the output column exists.
# DataFrame.drop is a no-op when the column is absent, so dropping it first
# makes the cell safe to execute repeatedly.
unindexed_data = data.drop("species_numeric")

# Fit and transform the data
data = string_indexer.fit(unindexed_data).transform(unindexed_data)

# Show the transformed data
data.show()





+------------+-----------+------------+-----------+-------+---------------+
|sepal_length|sepal_width|petal_length|petal_width|species|species_numeric|
+------------+-----------+------------+-----------+-------+---------------+
|         5.1|        3.5|         1.4|        0.2| setosa|            0.0|
|         4.9|        3.0|         1.4|        0.2| setosa|            0.0|
|         4.7|        3.2|         1.3|        0.2| setosa|            0.0|
|         4.6|        3.1|         1.5|        0.2| setosa|            0.0|
|         5.0|        3.6|         1.4|        0.2| setosa|            0.0|
|         5.4|        3.9|         1.7|        0.4| setosa|            0.0|
|         4.6|        3.4|         1.4|        0.3| setosa|            0.0|
|         5.0|        3.4|         1.5|        0.2| setosa|            0.0|
|         4.4|        2.9|         1.4|        0.2| setosa|            0.0|
|         4.9|        3.1|         1.5|        0.1| setosa|            0.0|
|         5.

In [0]:
# Columns that are combined into the model's input vector.
feature_cols = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# Assembler that packs the columns above into a single "features" column.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_cols,
)

# Append the assembled "features" column to the indexed data.
assembled_data = assembler.transform(data)


In [0]:
# Split the assembled data with 0.8 / 0.2 weights, using a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = assembled_data.randomSplit(split_weights, seed=42)

# Count the rows in each split, then report both sizes.
train_rows = train_data.count()
test_rows = test_data.count()
print("Train data size:", train_rows)
print("Test data size:", test_rows)


Train data size: 126
Test data size: 24


In [0]:
# Logistic regression on the assembled features, capped at 10 iterations.
lr = LogisticRegression(
    maxIter=10,
    labelCol="species_numeric",
    featuresCol="features",
)

# Fit the classifier on the training split.
lr_model = lr.fit(train_data)

# Apply the fitted model to the test split to obtain predictions.
predictions = lr_model.transform(test_data)

In [0]:
# Accuracy evaluator comparing the "prediction" column with the
# "species_numeric" label column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="species_numeric",
)

# Compute the accuracy of the predictions and report it.
accuracy = evaluator.evaluate(predictions)

print("Logistic Regression Accuracy:", accuracy)

Logistic Regression Accuracy: 1.0


In [0]:
# Train a Random Forest model on the same train/test split.
# The forest is a randomized learner, so a fixed seed (the same value used
# for randomSplit) is passed to make the run reproducible; without it the
# reported accuracy may vary between runs.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="species_numeric",
    numTrees=100,
    seed=42,
)

# Train the model
rf_model = rf.fit(train_data)

# Make predictions on the test data
predictions = rf_model.transform(test_data)

# Evaluate the model's performance (accuracy of "prediction" against
# the "species_numeric" label column)
evaluator = MulticlassClassificationEvaluator(
    labelCol="species_numeric", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)

print("Random Forest Accuracy:", accuracy)





Random Forest Accuracy: 0.9583333333333334
