In [1]:
# Bootstrap step: findspark.init() runs before any pyspark import in this
# notebook (the first pyspark import is in the next cell).
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by every later cell.
#
# The master is "local[4]", i.e. Spark runs inside a single local JVM with
# four worker threads. In that mode the executor shares the driver JVM, so
# "spark.executor.memory" on its own does not size the heap; the setting that
# actually takes effect is "spark.driver.memory". It is only honoured when it
# is supplied before the JVM is launched, which is the case for the first
# getOrCreate() call of a fresh kernel. The executor setting is kept so the
# configuration still carries over if the master is later pointed at a cluster.
spark = (
    SparkSession.builder
    .appName("ClassificationWithIrıs")
    .master("local[4]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [3]:
# Location of the Iris CSV file. The original absolute path is kept as the
# default so existing runs behave the same, but it can be overridden with the
# IRIS_CSV_PATH environment variable so the notebook is not tied to one
# user's directory layout.
import os

IRIS_CSV_PATH = os.environ.get(
    "IRIS_CSV_PATH",
    "C:/Users/htcso/OneDrive/Masaüstü/pySpark/dataset/iris.csv",
)

# Load the CSV: first row is the header, fields are comma-separated, and
# column types are inferred from the data instead of being read as strings.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load(IRIS_CSV_PATH)
)

In [4]:
# Peek at the first five rows; limit(5) already caps the result, so the
# pandas frame can be displayed as-is.
df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# rendered as a pandas frame. The summary has five rows, so it is shown whole.
summary_pdf = df.describe().toPandas()
summary_pdf

Unnamed: 0,summary,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,count,150.0,150.0,150.0,150.0,150
1,mean,5.843333333333335,3.0540000000000007,3.758666666666669,1.1986666666666672,
2,stddev,0.8280661279778637,0.4335943113621737,1.764420419952262,0.7631607417008414,
3,min,4.3,2.0,1.0,0.1,Iris-setosa
4,max,7.9,4.4,6.9,2.5,Iris-virginica


In [6]:
import pyspark.sql.functions as f

# Number of rows per species, to check how the classes are distributed.
species_counts = df.groupBy("Species").agg(f.count("*").alias("Count"))
species_counts.show()

+---------------+-----+
|        Species|Count|
+---------------+-----+
| Iris-virginica|   50|
|    Iris-setosa|   50|
|Iris-versicolor|   50|
+---------------+-----+



In [7]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [8]:
# List the column names; the four measurement columns are the ones passed to
# the VectorAssembler further down, and "Species" is the column being indexed.
df.columns

['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

In [9]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Encode the string "Species" column as a numeric "label" column.
# handleInvalid="skip" is the same setting the setter chain applied.
indexer = StringIndexer(
    inputCol="Species",
    outputCol="label",
    handleInvalid="skip",
)

In [10]:
# Fit the indexer on the data, then use the fitted model to add the numeric
# "label" column.
indexer_model = indexer.fit(df)
indexer_df = indexer_model.transform(df)

In [11]:
# Preview the indexed data. limit(5) is applied on the Spark side so only five
# rows are collected to the driver, instead of converting the whole DataFrame
# to pandas and slicing it afterwards with head().
indexer_df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label
0,5.1,3.5,1.4,0.2,Iris-setosa,0.0
1,4.9,3.0,1.4,0.2,Iris-setosa,0.0
2,4.7,3.2,1.3,0.2,Iris-setosa,0.0
3,4.6,3.1,1.5,0.2,Iris-setosa,0.0
4,5.0,3.6,1.4,0.2,Iris-setosa,0.0


In [12]:
# Combine the four numeric measurement columns into one "features" vector
# column, which is the column the classifier below is configured to read.
assembler = VectorAssembler(
    inputCols=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    outputCol="features",
)

In [13]:
# Add the "features" vector column to the already-indexed data; the result
# carries both "label" and "features" and is what gets split for training.
assembler_df = assembler.transform(indexer_df)

In [14]:
# Preview the assembled data. limit(5) is applied on the Spark side so only
# five rows are collected to the driver, instead of converting the whole
# DataFrame to pandas and slicing it afterwards with head().
assembler_df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label,features
0,5.1,3.5,1.4,0.2,Iris-setosa,0.0,"[5.1, 3.5, 1.4, 0.2]"
1,4.9,3.0,1.4,0.2,Iris-setosa,0.0,"[4.9, 3.0, 1.4, 0.2]"
2,4.7,3.2,1.3,0.2,Iris-setosa,0.0,"[4.7, 3.2, 1.3, 0.2]"
3,4.6,3.1,1.5,0.2,Iris-setosa,0.0,"[4.6, 3.1, 1.5, 0.2]"
4,5.0,3.6,1.4,0.2,Iris-setosa,0.0,"[5.0, 3.6, 1.4, 0.2]"


In [15]:
# Random 80% / 20% split into training and test sets, sampled with a fixed
# seed of 142.
split_weights = [0.8, 0.2]
train_df, test_df = assembler_df.randomSplit(split_weights, seed=142)

In [16]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression estimator reading the "features" vector and the numeric
# "label" column; every other hyper-parameter is left at its default.
logreg_obj = LogisticRegression(labelCol="label", featuresCol="features")

In [17]:
# Train the classifier on the training split only; the test split is kept
# aside for the evaluation cells below.
logreg_model = logreg_obj.fit(train_df)

In [18]:
# Score the held-out test split; the output shown below gains the
# rawPrediction, probability and prediction columns.
transformed_df = logreg_model.transform(test_df)

# Show five scored rows; limit(5) already caps the size of the pandas frame.
transformed_df.limit(5).toPandas()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label,features,rawPrediction,probability,prediction
0,4.4,3.2,1.3,0.2,Iris-setosa,0.0,"[4.4, 3.2, 1.3, 0.2]","[205.65022251774505, 100.14313964375161, -305....","[1.0, 1.5095799053355713e-46, 7.63613097812419...",0.0
1,4.6,3.6,1.0,0.2,Iris-setosa,0.0,"[4.6, 3.6, 1.0, 0.2]","[231.55263484273888, 110.23467969214714, -341....","[1.0, 2.0524922328379425e-53, 1.00374577584111...",0.0
2,4.7,3.2,1.6,0.2,Iris-setosa,0.0,"[4.7, 3.2, 1.6, 0.2]","[182.58723676115665, 106.49218597825107, -289....","[1.0, 8.960646504393038e-34, 1.438045015735978...",0.0
3,4.8,3.4,1.6,0.2,Iris-setosa,0.0,"[4.8, 3.4, 1.6, 0.2]","[190.4982207679723, 107.58811142353348, -298.0...","[1.0, 9.83098464355822e-37, 6.462859379400751e...",0.0
4,5.0,3.0,1.6,0.2,Iris-setosa,0.0,"[5.0, 3.0, 1.6, 0.2]","[157.36619749403974, 124.3945745227422, -281.7...","[0.9999999999999951, 4.7929851581900556e-15, 1...",0.0


In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that compares the "prediction" column with the "label" column and
# reports the "accuracy" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Accuracy on the scored test split; the bare name on the last line displays it.
accuracy = evaluator.evaluate(transformed_df)
accuracy

0.8888888888888888

In [20]:
# Stop the Spark session now that the analysis is finished. Cells that use
# `spark` or DataFrames derived from it need the session cell re-run first.
spark.stop()