# Random Forest

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import pandas as pd

In [2]:
data_pd = pd.read_csv('/Users/halilergul/Desktop/master/fall-23_24/leaf-dataset/leaf.csv', header=None)
# column names are like numbers 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 ,15 
# I want to give them features names: feature1, etc.
data_pd.columns = ['class', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 'feature7', 'feature8',
                'feature9', 'feature10', 'feature11', 'feature12', 'feature13', 'feature14', 'feature15', 'feature16']
spark = SparkSession.builder.appName("LeafClassification").getOrCreate()
data = spark.createDataFrame(data_pd) # by this, I convert pandas dataframe to spark dataframe
# Vectorizing features
columns = data.columns # result is: ['class', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 'feature7', 'feature8', 'feature9', 'feature10', 'feature11', 'feature12', 'feature13', 'feature14', 'feature15', 'feature16']
assembler = VectorAssembler(inputCols=columns[1:], outputCol="features")
data = assembler.transform(data)
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)# Split the data into training and test sets


23/12/15 15:21:59 WARN Utils: Your hostname, Halils-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.51.47.94 instead (on interface en0)
23/12/15 15:21:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/15 15:22:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# see whether assembler took the right columns
data.select("features").show(truncate=False)

23/12/15 15:22:03 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 0:>                                                          (0 + 1) / 1]

+---------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                         |
+---------------------------------------------------------------------------------------------------------------------------------+
|[1.0,0.72694,1.4742,0.32396,0.98535,1.0,0.83592,0.0046566,0.0039465,0.04779,0.12795,0.016108,0.0052323,2.7477E-4,1.1756]         |
|[2.0,0.74173,1.5257,0.36116,0.98152,0.99825,0.79867,0.0052423,0.0050016,0.02416,0.090476,0.0081195,0.002708,7.4846E-5,0.69659]   |
|[3.0,0.76722,1.5725,0.38998,0.97755,1.0,0.80812,0.0074573,0.010121,0.011897,0.057445,0.0032891,9.2068E-4,3.7886E-5,0.44348]      |
|[4.0,0.73797,1.4597,0.35376,0.97566,1.0,0.81697,0.0068768,0.0086068,0.01595,0.065491,0.0042707,0.0011544,6.6272E-5,0.58785]      |
|[5.0,0.82301,1.7707,0.44462,0.97698,1.0,0.75493,0.007428,0.010042,0.0079379

                                                                                

In [5]:
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR") # I do this to see only errors, not warnings
rf = RandomForestClassifier(labelCol="class", featuresCol="features") # classifier
#parameter grid for tuning
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 15, 20, 25, 30, 40, 50, 60, 70]) \
    .addGrid(rf.maxDepth, [5, 10, 15, 20, 25, 30]) \
    .build()

evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy") # multiclass evaluator
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)
cvModel = crossval.fit(train_data) # this will run the classifier with all the parameters in the grid
predictions = cvModel.transform(test_data) #the best parameters to make predictions on the test set
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy)) # also print best models parameters
print("Best model parameters: numTrees: {}, maxDepth: {}".format(cvModel.bestModel.getNumTrees,
                                                                 cvModel.bestModel.getMaxDepth))

Test set accuracy = 0.69
Best model parameters: numTrees: 60, maxDepth: <bound method _DecisionTreeParams.getMaxDepth of RandomForestClassificationModel: uid=RandomForestClassifier_d4346c77ae8e, numTrees=60, numClasses=37, numFeatures=15>


# Logistic Regression

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
spark = SparkSession.builder.appName("Leaf Classification").getOrCreate()
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

# Now I will define a schema to load the data and give the columns names as feature1, feature2, etc.
schema = StructType([StructField("label", IntegerType(), True)])
for i in range(1, 16):
    schema.add(StructField(f"feature{i}", DoubleType(), True))
data = spark.read.csv("leaf.csv", schema=schema, header=False) # this is my data

feature_columns = data.columns[1:]  # excluding the first column which is the label
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)
train_data, test_data = data.randomSplit([0.7, 0.3])

# now classifier
lr = LogisticRegression(featuresCol="features", labelCol="label")
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.maxIter, [10, 20]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)
cvModel = crossval.fit(train_data)
prediction = cvModel.transform(test_data)

#now model evaluation
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))
print("Best model parameters: regParam: {}, maxIter: {}, elasticNetParam: {}".format(cvModel.bestModel.getRegParam(),
                                                                                     cvModel.bestModel.getMaxIter(),
                                                                                     cvModel.bestModel.getElasticNetParam()))
spark.stop()


Test set accuracy = 0.6789891046771781
Best model parameters: regParam: 0.01, maxIter: 20, elasticNetParam: 0.0


# DecisionTreeClassifier

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
spark = SparkSession.builder.appName("Leaf Classification").getOrCreate()
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

# I use schema logic to define the columns names similar to previous examples
schema = StructType([StructField("label", IntegerType(), True)])
for i in range(1, 16):
    schema.add(StructField(f"feature{i}", DoubleType(), True))

data = spark.read.csv("leaf.csv", schema=schema, header=False)

feature_columns = data.columns[1:]  # excluding the first column which is the label
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

train_data, test_data = data.randomSplit([0.7, 0.3])

dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")

# add a lot of maxDepth and maxBins parameters to the grid to get the best model
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15, 20, 25, 30]) \
    .addGrid(dt.maxBins, [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) \
    .build()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") # usual evaluator as multiclass

# CrossValidator
crossval = CrossValidator(estimator=dt,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)
cvModel = crossval.fit(train_data)
prediction = cvModel.transform(test_data)
accuracy = evaluator.evaluate(prediction)

print("Test set accuracy = " + str(accuracy))
print("Best model parameters: maxDepth: {}, maxBins: {}".format(cvModel.bestModel.getMaxDepth(),
                                                                cvModel.bestModel.getMaxBins()))
spark.stop()


Test set accuracy = 0.5118523581681477
Best model parameters: maxDepth: 15, maxBins: 40


# One-vs-Rest classifier (a.k.a. One-vs-All)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.appName("Leaf Classification").getOrCreate()
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

schema = StructType([StructField("label", IntegerType(), True)])
for i in range(1, 16):
    schema.add(StructField(f"feature{i}", DoubleType(), True))

data = spark.read.csv("leaf.csv", schema=schema, header=False)

feature_columns = data.columns[1:]  # excluding the first column which is the label
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

train_data, test_data = data.randomSplit([0.7, 0.3])

lr = LogisticRegression(featuresCol="features", labelCol="label")

ovr = OneVsRest(classifier=lr, featuresCol="features", labelCol="label") # this is the one vs rest classifier from spark
paramGrid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [10, 100, 200]) \
    .addGrid(lr.regParam, [0.1, 0.01, 0.001]) \
    .build()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") # the same evaluator as before
crossval = CrossValidator(estimator=ovr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cvModel = crossval.fit(train_data)
prediction = cvModel.transform(test_data)
accuracy = evaluator.evaluate(prediction)

print("Test set accuracy = " + str(accuracy))
bestModel = cvModel.bestModel.getClassifier()
print("Best model parameters: maxIter: {}, regParam: {}".format(bestModel.getMaxIter(),
                                                                bestModel.getRegParam()))
# Stop Spark session
spark.stop()


                                                                                

Test set accuracy = 0.7856254856254857
Best model parameters: maxIter: 10, regParam: 0.001


# Results Table

In [6]:
import pandas as pd

# Create a DataFrame to hold the classification results
classification_results = pd.DataFrame({
    "Method": [
        "Random Forest Classifier",
        "Logistic Regression",
        "Decision Tree Classifier",
        "One-vs-Rest Classifier"
    ],
    "Parameters": [
        "numTrees: 25, maxDepth: 5",
        "regParam: 0.01, maxIter: 20, elasticNetParam: 0.0",
        "maxDepth: 15, maxBins: 40",
        "maxIter: 10, regParam: 0.001"
    ],
    "Accuracy": [
        0.67,
        0.6789891046771781,
        0.5938716227707053,
        0.7856254856254857
    ]
})

# Set the display format for the accuracy to be more readable
pd.options.display.float_format = '{:,.4f}'.format

# Print out the DataFrame
classification_results

Unnamed: 0,Method,Parameters,Accuracy
0,Random Forest Classifier,"numTrees: 25, maxDepth: 5",0.67
1,Logistic Regression,"regParam: 0.01, maxIter: 20, elasticNetParam: 0.0",0.679
2,Decision Tree Classifier,"maxDepth: 15, maxBins: 40",0.5939
3,One-vs-Rest Classifier,"maxIter: 10, regParam: 0.001",0.7856
