In [1]:
# Install pyspark and findspark
!pip install --ignore-install -q pyspark
# Install findspark library
!pip install --ignore-install -q findspark

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 2.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 200.5/200.5 kB 14.3 MB/s eta 0:00:00
  Building wheel for pyspark (setup.py) ... done


In [2]:
# Import findspark and run its init() hook before any pyspark import below.
# NOTE(review): pyspark was pip-installed in the first cell, so this step is
# presumably redundant — confirm before removing.
import findspark
findspark.init()

In [3]:
# Print the version string of the Python interpreter running this kernel.
import sys

print(sys.version)

3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]


In [4]:
# Mount Google Drive inside the Colab VM; the dataset is read from a path
# under this mount point further down.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


### 1. Set up Spark context and SparkSession

In [5]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [6]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's SparkSession; the dataset below is read
# through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("PySpark-DecisitonTreeClassifier_Iris")
    .getOrCreate()
)

### 2. Load dataset

In [7]:
# Read the Iris CSV from the mounted Drive. The first row supplies the column
# names and column types are inferred from the data.
iris_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/iris-data.csv")
)

In [8]:
# "class" is the target column and "label" is its numeric encoding, which the
# StringIndexer cell appends to iris_data. Every other column is a feature.
# Excluding both by name (instead of slicing off the last column) keeps this
# cell correct when it is re-run after "label" has been appended; a positional
# slice would then drop "label" and wrongly keep the string column "class".
feature_cols = [col for col in iris_data.columns if col not in ("class", "label")]

In [9]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Convert string labels into numerical labels: fit the indexer on the "class"
# column, then append the encoded values to iris_data as the "label" column.
indexer = StringIndexer(inputCol="class", outputCol="label")
indexer_model = indexer.fit(iris_data)
iris_data = indexer_model.transform(iris_data)


In [10]:
# Pack the individual feature columns into the single vector column
# ("features") that the classifier is configured to read.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)
data = assembler.transform(iris_data)

In [11]:
# 80/20 train/test split with the seed fixed at 123.
training_data, testing_data = data.randomSplit([0.8, 0.2], seed=123)

### 3. Create DecisionTree Classifier

In [12]:
# Configure the decision tree classifier; training happens in the next cell.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=5,
    minInfoGain=0.001,
    impurity="entropy",
)

In [13]:
# Train the model: fit the decision tree configured above on the training split.
model = dt.fit(training_data)

In [14]:
# Make predictions on the testing data. The evaluation cell reads the "label"
# and "prediction" columns of the resulting DataFrame.
predictions = model.transform(testing_data)

### 3. Evaluation

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the test-set predictions with the "accuracy" metric, comparing the
# "prediction" column against "label".
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.93


### 4. Feature importance

In [16]:
# Importance scores of the fitted tree, as a plain array.
feature_importance = model.featureImportances.toArray()

# Show feature importance, pairing each assembler input column with the score
# at the same position.
for column, importance in zip(assembler.getInputCols(), feature_importance):
    print(f"Feature '{column}': {importance:.2f}")

Feature 'sepal length': 0.03
Feature 'sepal width': 0.00
Feature 'petal length': 0.63
Feature 'petal width': 0.34


### 5. Visualize the Decision Tree

In [17]:
# Print the fitted tree as nested if/else rules (toDebugString is read as an
# attribute here, not called).
# NOTE(review): "feature N" in the output presumably indexes into
# assembler.getInputCols() — confirm.
print(model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_9314f6319ad3, depth=4, numNodes=15, numClasses=3, numFeatures=4
  If (feature 2 <= 2.45)
   Predict: 0.0
  Else (feature 2 > 2.45)
   If (feature 3 <= 1.75)
    If (feature 2 <= 4.95)
     If (feature 3 <= 1.65)
      Predict: 1.0
     Else (feature 3 > 1.65)
      Predict: 2.0
    Else (feature 2 > 4.95)
     If (feature 0 <= 6.35)
      Predict: 2.0
     Else (feature 0 > 6.35)
      Predict: 1.0
   Else (feature 3 > 1.75)
    If (feature 2 <= 4.85)
     If (feature 0 <= 5.95)
      Predict: 1.0
     Else (feature 0 > 5.95)
      Predict: 2.0
    Else (feature 2 > 4.85)
     Predict: 2.0



In [18]:
# Stop the Spark session created by SparkSession.builder...getOrCreate() above.
# NOTE(review): cells that use `spark` presumably need the session-creation
# cell re-run after this — confirm.
spark.stop()