# Iris

### Introduction:

This exercise may seem a little bit strange, but keep doing it.

### Step 1. Import the necessary libraries

In [49]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

spark = SparkSession.builder.appName("Iris").getOrCreate()

### Step 2. Import the dataset from this [address](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data). 

### Step 3. Assign it to a variable called iris

In [50]:
schema = T.StructType([
    T.StructField('sepal_length', T.DoubleType()),
    T.StructField('sepal_width', T.DoubleType()),
    T.StructField('petal_length', T.DoubleType()),
    T.StructField('petal_width', T.DoubleType()),
    T.StructField('class', T.StringType())
])

In [51]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

from pyspark import SparkFiles
spark.sparkContext.addFile(url)

iris = spark.read.csv(SparkFiles.get("iris.data"), schema=schema)

### Step 4. Create columns for the dataset

In [52]:
# 1. sepal_length (in cm)
# 2. sepal_width (in cm)
# 3. petal_length (in cm)
# 4. petal_width (in cm)
# 5. class

In [53]:
iris.head(5)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=4.9, sepal_width=3.0, petal_length=1.4, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=4.7, sepal_width=3.2, petal_length=1.3, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=4.6, sepal_width=3.1, petal_length=1.5, petal_width=0.2, class='Iris-setosa'),
 Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, class='Iris-setosa')]

### Step 5.  Is there any missing value in the dataframe?

In [54]:
iris.select([
    F.count(F.when(F.isnan(c), c)).alias(c) for c in iris.columns
]).show()

+------------+-----------+------------+-----------+-----+
|sepal_length|sepal_width|petal_length|petal_width|class|
+------------+-----------+------------+-----------+-----+
|           0|          0|           0|          0|    0|
+------------+-----------+------------+-----------+-----+



### Step 6.  Lets set the values of the rows 10 to 29 of the column 'petal_length' to NaN

In [55]:
iris = iris.withColumn('id', F.monotonically_increasing_id())

iris.show()

+------------+-----------+------------+-----------+-----------+---+
|sepal_length|sepal_width|petal_length|petal_width|      class| id|
+------------+-----------+------------+-----------+-----------+---+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|  0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|  1|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|  2|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|  3|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|  4|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|  5|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|  6|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|  7|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|  8|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|  9|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa| 10|
|         4.8|        3.4|         1.6|        0

In [56]:
import numpy as np

In [57]:
iris_na = iris.withColumn("petal_length", 
               F.when((F.col('id') > 10) & (F.col('id') < 30), F.lit(np.nan))
               .otherwise(F.col("petal_length")))

In [59]:
iris_na.head(20)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, class='Iris-setosa', id=0),
 Row(sepal_length=4.9, sepal_width=3.0, petal_length=1.4, petal_width=0.2, class='Iris-setosa', id=1),
 Row(sepal_length=4.7, sepal_width=3.2, petal_length=1.3, petal_width=0.2, class='Iris-setosa', id=2),
 Row(sepal_length=4.6, sepal_width=3.1, petal_length=1.5, petal_width=0.2, class='Iris-setosa', id=3),
 Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, class='Iris-setosa', id=4),
 Row(sepal_length=5.4, sepal_width=3.9, petal_length=1.7, petal_width=0.4, class='Iris-setosa', id=5),
 Row(sepal_length=4.6, sepal_width=3.4, petal_length=1.4, petal_width=0.3, class='Iris-setosa', id=6),
 Row(sepal_length=5.0, sepal_width=3.4, petal_length=1.5, petal_width=0.2, class='Iris-setosa', id=7),
 Row(sepal_length=4.4, sepal_width=2.9, petal_length=1.4, petal_width=0.2, class='Iris-setosa', id=8),
 Row(sepal_length=4.9, sepal_width=3.1, petal_length=1.5, petal_width=0.1

### Step 7. Good, now lets substitute the NaN values to 1.0

In [62]:
iris_1 = iris_na.withColumn("petal_length", F.when(F.isnan('petal_length'), 1).otherwise(F.col("petal_length")))

In [65]:
iris_1.head(31)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, class='Iris-setosa', id=0),
 Row(sepal_length=4.9, sepal_width=3.0, petal_length=1.4, petal_width=0.2, class='Iris-setosa', id=1),
 Row(sepal_length=4.7, sepal_width=3.2, petal_length=1.3, petal_width=0.2, class='Iris-setosa', id=2),
 Row(sepal_length=4.6, sepal_width=3.1, petal_length=1.5, petal_width=0.2, class='Iris-setosa', id=3),
 Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, class='Iris-setosa', id=4),
 Row(sepal_length=5.4, sepal_width=3.9, petal_length=1.7, petal_width=0.4, class='Iris-setosa', id=5),
 Row(sepal_length=4.6, sepal_width=3.4, petal_length=1.4, petal_width=0.3, class='Iris-setosa', id=6),
 Row(sepal_length=5.0, sepal_width=3.4, petal_length=1.5, petal_width=0.2, class='Iris-setosa', id=7),
 Row(sepal_length=4.4, sepal_width=2.9, petal_length=1.4, petal_width=0.2, class='Iris-setosa', id=8),
 Row(sepal_length=4.9, sepal_width=3.1, petal_length=1.5, petal_width=0.1

### Step 8. Now let's delete the column class

In [67]:
iris_no_class = iris_1.drop('class')

In [68]:
iris_no_class.show()

+------------+-----------+------------+-----------+---+
|sepal_length|sepal_width|petal_length|petal_width| id|
+------------+-----------+------------+-----------+---+
|         5.1|        3.5|         1.4|        0.2|  0|
|         4.9|        3.0|         1.4|        0.2|  1|
|         4.7|        3.2|         1.3|        0.2|  2|
|         4.6|        3.1|         1.5|        0.2|  3|
|         5.0|        3.6|         1.4|        0.2|  4|
|         5.4|        3.9|         1.7|        0.4|  5|
|         4.6|        3.4|         1.4|        0.3|  6|
|         5.0|        3.4|         1.5|        0.2|  7|
|         4.4|        2.9|         1.4|        0.2|  8|
|         4.9|        3.1|         1.5|        0.1|  9|
|         5.4|        3.7|         1.5|        0.2| 10|
|         4.8|        3.4|         1.0|        0.2| 11|
|         4.8|        3.0|         1.0|        0.1| 12|
|         4.3|        3.0|         1.0|        0.1| 13|
|         5.8|        4.0|         1.0|        0

### Step 9.  Set the first 3 rows as NaN

In [72]:
wo_first_3_iris = iris_no_class.\
withColumn("sepal_length", F.when(F.col('id') < 4, F.lit(np.nan)).otherwise(F.col("sepal_length"))).\
withColumn("sepal_width", F.when(F.col('id') < 4, F.lit(np.nan)).otherwise(F.col("sepal_width"))).\
withColumn("petal_length", F.when(F.col('id') < 4, F.lit(np.nan)).otherwise(F.col("petal_length"))).\
withColumn("petal_width", F.when(F.col('id') < 4, F.lit(np.nan)).otherwise(F.col("petal_width")))

In [73]:
wo_first_3_iris.head(5)

[Row(sepal_length=nan, sepal_width=nan, petal_length=nan, petal_width=nan, id=0),
 Row(sepal_length=nan, sepal_width=nan, petal_length=nan, petal_width=nan, id=1),
 Row(sepal_length=nan, sepal_width=nan, petal_length=nan, petal_width=nan, id=2),
 Row(sepal_length=nan, sepal_width=nan, petal_length=nan, petal_width=nan, id=3),
 Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, id=4)]

### Step 10.  Delete the rows that have NaN

In [82]:
no_na_iris = wo_first_3_iris.dropna()

In [85]:
no_na_iris.head(5)

[Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, id=4),
 Row(sepal_length=5.4, sepal_width=3.9, petal_length=1.7, petal_width=0.4, id=5),
 Row(sepal_length=4.6, sepal_width=3.4, petal_length=1.4, petal_width=0.3, id=6),
 Row(sepal_length=5.0, sepal_width=3.4, petal_length=1.5, petal_width=0.2, id=7),
 Row(sepal_length=4.4, sepal_width=2.9, petal_length=1.4, petal_width=0.2, id=8)]

### Step 11. Reset the index so it begins with 0 again

In [86]:
reset_iris =  no_na_iris.withColumn('id', F.monotonically_increasing_id())

In [88]:
reset_iris.head(5)

[Row(sepal_length=5.0, sepal_width=3.6, petal_length=1.4, petal_width=0.2, id=0),
 Row(sepal_length=5.4, sepal_width=3.9, petal_length=1.7, petal_width=0.4, id=1),
 Row(sepal_length=4.6, sepal_width=3.4, petal_length=1.4, petal_width=0.3, id=2),
 Row(sepal_length=5.0, sepal_width=3.4, petal_length=1.5, petal_width=0.2, id=3),
 Row(sepal_length=4.4, sepal_width=2.9, petal_length=1.4, petal_width=0.2, id=4)]

### BONUS: Create your own question and answer it.