In [6]:
# Install pyspark and findspark
!pip install --ignore-install -q pyspark
# Install findspark library
!pip install --ignore-install -q findspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [7]:
# Initialise findspark before any pyspark import is attempted.
# NOTE(review): pyspark is pip-installed in the first cell, so it should already
# be importable and this step may be redundant — confirm before removing.
import findspark
findspark.init()

In [8]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True remounts
# even if the drive is already mounted.
# NOTE(review): no visible cell reads from /content/drive — the dataset is loaded
# from scikit-learn — so confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [9]:
# Record the Python version of the kernel so the run environment is documented
# alongside the results.
import sys
sys.version_info

sys.version_info(major=3, minor=10, micro=12, releaselevel='final', serial=0)

### 1. Import libraries and create a Spark session

In [21]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from sklearn.datasets import load_iris
from pyspark.ml.feature import StringIndexer

# Obtain the Spark session for this notebook: an existing session is reused,
# otherwise a new one is started under the application name below.
spark = (
    SparkSession.builder
    .appName("MLPipeline_IrisData")
    .getOrCreate()
)


### 2. Load dataset

In [22]:
# Fetch the Iris dataset bundled with scikit-learn and keep the feature matrix
# and the class targets under separate names.
iris = load_iris()
iris_data, iris_target = iris.data, iris.target

### 3. Define the schema and convert the data into a Spark DataFrame

In [23]:
# Schema: four nullable double-valued measurement columns followed by a
# nullable integer class label.
schema = StructType(
    [
        StructField(column_name, DoubleType(), True)
        for column_name in ("sepal_length", "sepal_width", "petal_length", "petal_width")
    ]
    + [StructField("label", IntegerType(), True)]
)

# Build one row per sample. The values are cast to plain Python float/int
# because the source arrays hold NumPy scalars.
data = spark.createDataFrame(
    [
        (float(s_len), float(s_wid), float(p_len), float(p_wid), int(species))
        for (s_len, s_wid, p_len, p_wid), species in zip(iris_data, iris_target)
    ],
    schema=schema,
)

In [24]:
# Print a preview of the DataFrame to confirm the column names and values
# match the schema defined above.
data.show()

+------------+-----------+------------+-----------+-----+
|sepal_length|sepal_width|petal_length|petal_width|label|
+------------+-----------+------------+-----------+-----+
|         5.1|        3.5|         1.4|        0.2|    0|
|         4.9|        3.0|         1.4|        0.2|    0|
|         4.7|        3.2|         1.3|        0.2|    0|
|         4.6|        3.1|         1.5|        0.2|    0|
|         5.0|        3.6|         1.4|        0.2|    0|
|         5.4|        3.9|         1.7|        0.4|    0|
|         4.6|        3.4|         1.4|        0.3|    0|
|         5.0|        3.4|         1.5|        0.2|    0|
|         4.4|        2.9|         1.4|        0.2|    0|
|         4.9|        3.1|         1.5|        0.1|    0|
|         5.4|        3.7|         1.5|        0.2|    0|
|         4.8|        3.4|         1.6|        0.2|    0|
|         4.8|        3.0|         1.4|        0.1|    0|
|         4.3|        3.0|         1.1|        0.1|    0|
|         5.8|

### 4. Split into training and testing sets

In [25]:
# Hold out roughly 20% of the rows for testing; the remaining ~80% are used for
# training and cross-validation. The seed is fixed so the split can be repeated.
trainingData, testData = data.randomSplit(weights=[0.8, 0.2], seed=1234)

### 5. Assemble feature columns and index labels

In [26]:
# Every column except the target is treated as a model input.
feature_columns = [name for name in data.columns if name != "label"]

# Stage that packs the individual feature columns into one "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Stage that maps the raw "label" values to indices in a new "indexedLabel" column.
indexer = StringIndexer(outputCol="indexedLabel", inputCol="label")

### 6. Create model

In [27]:
# Random forest classifier that learns "indexedLabel" from the assembled
# "features" vector. The explicit seed makes the randomised tree construction
# repeatable across kernel restarts, so the reported accuracy can be reproduced.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    seed=1234,
)

### 7. Create Pipeline

In [28]:
# Chain the stages in the order they must run: the classifier needs both the
# assembled feature vector and the indexed label to exist first.
pipeline = Pipeline(
    stages=[
        assembler,  # feature columns -> "features"
        indexer,    # "label" -> "indexedLabel"
        rf,         # random forest on the two columns above
    ]
)

### 8. Parameter searching

In [29]:
# Hyperparameter grid for the random forest: 3 tree depths x 3 forest sizes,
# i.e. 9 candidate configurations in total.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(rf.maxDepth, [5, 10, 15])
grid_builder.addGrid(rf.numTrees, [20, 50, 100])
paramGrid = grid_builder.build()

### 9. Cross-validation and evaluation

In [30]:
# Accuracy evaluator comparing the "prediction" column with the indexed label
# column produced by the pipeline.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)

# 3-fold cross-validation over every configuration in paramGrid.
# The explicit seed fixes how rows are assigned to folds, so the selected
# hyperparameters (and therefore the final score) are repeatable between runs.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=1234,
)

# Model selection uses the training split only; the test split stays unseen.
cvModel = crossval.fit(trainingData)

# Score the held-out test split with the model chosen by cross-validation.
predictions = cvModel.transform(testData)

# Report test-set accuracy.
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)


Accuracy = 0.972973


In [31]:

# Stop the Spark session now that the analysis is complete. Cells above that use
# `spark` cannot be re-run after this without recreating the session first.
spark.stop()