In [2]:
# Install pyspark and findspark
!pip install --ignore-install -q pyspark
# Install findspark library
!pip install --ignore-install -q findspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
# Import findspark
import findspark
findspark.init()

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### 1. Set up spark context and SparkSession

In [5]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("PySpark-RandomForestClassifier_Iris") \
    .getOrCreate()

### 2. Load dataset

In [7]:
# Load the Iris dataset (assuming you have it in a CSV format)
iris_data = spark.read.csv("/content/drive/MyDrive/iris-data.csv", header=True, inferSchema=True)

In [8]:
# Assuming the target variable is "class" and other columns are features
feature_cols = iris_data.columns[:-1]

In [9]:
# Convert string labels into numerical labels
indexer = StringIndexer(inputCol="class", outputCol="label")
iris_data = indexer.fit(iris_data).transform(iris_data)

In [10]:
# Create a feature vector by assembling the feature columns
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(iris_data)

In [11]:
# Split the data into training and testing sets
(training_data, testing_data) = data.randomSplit([0.8, 0.2], seed=123)

In [12]:
# Customized parameters
num_trees = 10
max_depth = 5

In [13]:
# Create and train a RandomForestClassifier with customized parameters
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=num_trees,
    maxDepth=max_depth
)

In [14]:
model = rf.fit(training_data)

In [15]:
# Make predictions on the testing data
predictions = model.transform(testing_data)

In [16]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

In [17]:
# Print the accuracy
print("Accuracy: {:.2f}".format(accuracy))


Accuracy: 0.97


In [18]:
# Show the feature importances
print("Feature Importances: ", model.featureImportances)

Feature Importances:  (4,[0,1,2,3],[0.10971043638291314,0.027259903033869593,0.5388647415001908,0.3241649190830265])


In [19]:
# Stop the Spark session
spark.stop()