# Spark Preparation
We check if we are in Google Colab.  If this is the case, install all necessary packages.

To run spark in Colab, we need to first install all the dependencies in Colab environment i.e. Apache Spark 3.2.1 with hadoop 3.2, Java 8 and Findspark to locate the spark in the system. The tools installation can be carried out inside the Jupyter Notebook of the Colab.
Learn more from [A Must-Read Guide on How to Work with PySpark on Google Colab for Data Scientists!](https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/)

credit: Natawut Nupairoj

In [30]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

In [31]:
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
    !tar xf spark-3.2.1-bin-hadoop3.2.tgz
    !mv spark-3.2.1-bin-hadoop3.2 spark
    !pip install -q findspark

In [32]:
if IN_COLAB:
  import os
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["SPARK_HOME"] = "/content/spark"

# Pyspark_Basic_DataFrame

In [33]:
import findspark
findspark.init()

In [34]:
#1 - import module
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy
import pandas

In [35]:
#2 - Create SparkContext
sc = SparkContext.getOrCreate()

sc

In [36]:
#3 - Setup SparkSession(SparkSQL)
spark = (SparkSession
         .builder
         .appName("DataFrameHandOn")
         .getOrCreate())
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f4d73a08f10>


In [37]:
!wget https://github.com/kaopanboonyuen/GISTDA2022/raw/main/dataset/iris.csv

--2022-03-21 03:35:07--  https://github.com/kaopanboonyuen/GISTDA2022/raw/main/dataset/iris.csv
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/kaopanboonyuen/GISTDA2022/main/dataset/iris.csv [following]
--2022-03-21 03:35:08--  https://raw.githubusercontent.com/kaopanboonyuen/GISTDA2022/main/dataset/iris.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4550 (4.4K) [text/plain]
Saving to: ‘iris.csv.1’


2022-03-21 03:35:08 (52.3 MB/s) - ‘iris.csv.1’ saved [4550/4550]



In [38]:
#4 - Read file to spark DataFrame
df = (spark
        .read
        .option("header","false")
        .option("inferSchema", "true")
        .csv("iris.csv"))
# If the path don't have file:/// -> it will call hdfs instead of local file system
df.cache()
print("finish caching data")

# Attribute Information:
# 1. sepal length in cm 
# 2. sepal width in cm 
# 3. petal length in cm 
# 4. petal width in cm 
# 5. class: 
# -- Iris Setosa 
# -- Iris Versicolour 
# -- Iris Virginica


finish caching data


In [39]:
#5 - Print sample 5 rows of all variables and schema
df.show(5)

print("\n")

df.printSchema()

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



root
 |-- _c0: double (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: string (nullable = true)



In [40]:
df.sample(False, 0.05, 1234).toPandas()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4
0,4.8,3.1,1.6,0.2,Iris-setosa
1,5.1,3.8,1.6,0.2,Iris-setosa
2,7.0,3.2,4.7,1.4,Iris-versicolor
3,6.5,2.8,4.6,1.5,Iris-versicolor
4,5.0,2.0,3.5,1.0,Iris-versicolor
5,6.0,2.2,4.0,1.0,Iris-versicolor
6,6.3,2.3,4.4,1.3,Iris-versicolor
7,6.2,2.9,4.3,1.3,Iris-versicolor
8,6.9,3.2,5.7,2.3,Iris-virginica


In [41]:
df.sort('_c0').toPandas()


Unnamed: 0,_c0,_c1,_c2,_c3,_c4
0,4.3,3.0,1.1,0.1,Iris-setosa
1,4.4,2.9,1.4,0.2,Iris-setosa
2,4.4,3.0,1.3,0.2,Iris-setosa
3,4.4,3.2,1.3,0.2,Iris-setosa
4,4.5,2.3,1.3,0.3,Iris-setosa
...,...,...,...,...,...
145,7.7,3.8,6.7,2.2,Iris-virginica
146,7.7,2.6,6.9,2.3,Iris-virginica
147,7.7,2.8,6.7,2.0,Iris-virginica
148,7.7,3.0,6.1,2.3,Iris-virginica


In [42]:
df.sort(desc('_c0')).toPandas()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4
0,7.9,3.8,6.4,2.0,Iris-virginica
1,7.7,3.8,6.7,2.2,Iris-virginica
2,7.7,2.6,6.9,2.3,Iris-virginica
3,7.7,2.8,6.7,2.0,Iris-virginica
4,7.7,3.0,6.1,2.3,Iris-virginica
...,...,...,...,...,...
145,4.5,2.3,1.3,0.3,Iris-setosa
146,4.4,2.9,1.4,0.2,Iris-setosa
147,4.4,3.0,1.3,0.2,Iris-setosa
148,4.4,3.2,1.3,0.2,Iris-setosa


In [43]:
#6 - change column name
renamed_df = df.selectExpr("_c0 as sepal_length", "_c1 as sepal_width", "_c2 as petal_length","_c3 as petal_width","_c4")
renamed_df = renamed_df.withColumnRenamed("_c4","label")
#6 - Print sample 5 rows of all variables
renamed_df.show(5)

print("\n")

renamed_df.printSchema()

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      label|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- label: string (nullable = true)



In [44]:
(renamed_df
    .sample(False, 0.05, 1234)
    .toPandas())

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,4.8,3.1,1.6,0.2,Iris-setosa
1,5.1,3.8,1.6,0.2,Iris-setosa
2,7.0,3.2,4.7,1.4,Iris-versicolor
3,6.5,2.8,4.6,1.5,Iris-versicolor
4,5.0,2.0,3.5,1.0,Iris-versicolor
5,6.0,2.2,4.0,1.0,Iris-versicolor
6,6.3,2.3,4.4,1.3,Iris-versicolor
7,6.2,2.9,4.3,1.3,Iris-versicolor
8,6.9,3.2,5.7,2.3,Iris-virginica


In [45]:
#7 - create unique id
(renamed_df
    .withColumn("id",monotonically_increasing_id())
    .sample(False, 0.05, 1234) 
    .toPandas())

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label,id
0,4.8,3.1,1.6,0.2,Iris-setosa,30
1,5.1,3.8,1.6,0.2,Iris-setosa,46
2,7.0,3.2,4.7,1.4,Iris-versicolor,50
3,6.5,2.8,4.6,1.5,Iris-versicolor,54
4,5.0,2.0,3.5,1.0,Iris-versicolor,60
5,6.0,2.2,4.0,1.0,Iris-versicolor,62
6,6.3,2.3,4.4,1.3,Iris-versicolor,87
7,6.2,2.9,4.3,1.3,Iris-versicolor,97
8,6.9,3.2,5.7,2.3,Iris-virginica,120


In [46]:
#8 - sample data
sample_df = renamed_df.sample(withReplacement=False, fraction=0.5, seed=50)
print("sample_df count : " + str(sample_df.count()))

sample_df count : 71


In [47]:
#9 - union and intersect
sample1_df = renamed_df.sample(withReplacement=False, fraction=0.5, seed=25)
sample2_df = renamed_df.sample(withReplacement=False, fraction=0.5, seed=50)
union_df = sample1_df.union(sample2_df)
intersected_df = sample1_df.intersect(sample2_df)

print("sample1_df count : " + str(sample1_df.count()))
print("sample2_df count : " + str(sample2_df.count()))
print("union_df count : " + str(union_df.count()))
print("intersected_df count : " + str(intersected_df.count()))

sample1_df count : 84
sample2_df count : 71
union_df count : 155
intersected_df count : 43


In [48]:
#10 - groupBy with count
renamed_df.groupBy("label").count().toPandas()

Unnamed: 0,label,count
0,Iris-virginica,50
1,Iris-setosa,50
2,Iris-versicolor,50


In [49]:
#11 - groupBy with average
avg_df = renamed_df.groupBy("label").avg("sepal_length")
avg_df.toPandas()


Unnamed: 0,label,avg(sepal_length)
0,Iris-virginica,6.588
1,Iris-setosa,5.006
2,Iris-versicolor,5.936


In [50]:
avg_df = renamed_df.groupBy("label").avg("sepal_length","sepal_width","petal_length","petal_width")
avg_df.toPandas()

Unnamed: 0,label,avg(sepal_length),avg(sepal_width),avg(petal_length),avg(petal_width)
0,Iris-virginica,6.588,2.974,5.552,2.026
1,Iris-setosa,5.006,3.418,1.464,0.244
2,Iris-versicolor,5.936,2.77,4.26,1.326


In [51]:
#12 - compute dataframe using sql command via string
renamed_df.createOrReplaceTempView("iris")
all_df = spark.sql("select * from iris")
(all_df
    .sample(False, 0.05, 1234)
    .toPandas())

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,4.8,3.1,1.6,0.2,Iris-setosa
1,5.1,3.8,1.6,0.2,Iris-setosa
2,7.0,3.2,4.7,1.4,Iris-versicolor
3,6.5,2.8,4.6,1.5,Iris-versicolor
4,5.0,2.0,3.5,1.0,Iris-versicolor
5,6.0,2.2,4.0,1.0,Iris-versicolor
6,6.3,2.3,4.4,1.3,Iris-versicolor
7,6.2,2.9,4.3,1.3,Iris-versicolor
8,6.9,3.2,5.7,2.3,Iris-virginica


In [52]:
avg_df2 = spark.sql("select label,avg(sepal_length),avg(sepal_width),avg(petal_length),avg(petal_width) from iris group by label")
avg_df2.toPandas()

Unnamed: 0,label,avg(sepal_length),avg(sepal_width),avg(petal_length),avg(petal_width)
0,Iris-virginica,6.588,2.974,5.552,2.026
1,Iris-setosa,5.006,3.418,1.464,0.244
2,Iris-versicolor,5.936,2.77,4.26,1.326


In [53]:
#13 - collect dataframe
avg_row_list = avg_df2.collect()
for row in avg_row_list :
    print(row)

Row(label='Iris-virginica', avg(sepal_length)=6.587999999999998, avg(sepal_width)=2.9739999999999998, avg(petal_length)=5.552, avg(petal_width)=2.026)
Row(label='Iris-setosa', avg(sepal_length)=5.005999999999999, avg(sepal_width)=3.4180000000000006, avg(petal_length)=1.464, avg(petal_width)=0.2439999999999999)
Row(label='Iris-versicolor', avg(sepal_length)=5.936, avg(sepal_width)=2.7700000000000005, avg(petal_length)=4.26, avg(petal_width)=1.3259999999999998)


In [54]:
#14 - row operations and properties
temp_row = avg_row_list[0]
print(row.label)
print(row["label"])
print("label" in row)
print("wrong label" in row)
print("all keys : " + str(list(row.asDict().keys())))

Iris-versicolor
Iris-versicolor
True
False
all keys : ['label', 'avg(sepal_length)', 'avg(sepal_width)', 'avg(petal_length)', 'avg(petal_width)']


In [55]:
#15 - collect dataframe as rdd
avg_row_rdd = avg_df2.rdd
for row in avg_row_rdd.collect() :
    print(row)

Row(label='Iris-virginica', avg(sepal_length)=6.587999999999998, avg(sepal_width)=2.9739999999999998, avg(petal_length)=5.552, avg(petal_width)=2.026)
Row(label='Iris-setosa', avg(sepal_length)=5.005999999999999, avg(sepal_width)=3.4180000000000006, avg(petal_length)=1.464, avg(petal_width)=0.2439999999999999)
Row(label='Iris-versicolor', avg(sepal_length)=5.936, avg(sepal_width)=2.7700000000000005, avg(petal_length)=4.26, avg(petal_width)=1.3259999999999998)


In [56]:
#16 - Extract row in rdd 1
avg_rdd = avg_row_rdd.map(lambda row : (row["label"],row["avg(sepal_length)"],row["avg(sepal_width)"],row["avg(petal_length)"],row["avg(petal_width)"]) )
for row in avg_rdd.collect() :
    print(row)

('Iris-virginica', 6.587999999999998, 2.9739999999999998, 5.552, 2.026)
('Iris-setosa', 5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999)
('Iris-versicolor', 5.936, 2.7700000000000005, 4.26, 1.3259999999999998)


In [57]:
#17 - Extract row in rdd 2
keys = ["label","avg(sepal_length)","avg(sepal_width)","avg(petal_length)","avg(petal_width)"]
avg_rdd = avg_row_rdd.map(lambda row : [row[key] for key in keys] )
for row in avg_rdd.collect() :
    print(row)

['Iris-virginica', 6.587999999999998, 2.9739999999999998, 5.552, 2.026]
['Iris-setosa', 5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999]
['Iris-versicolor', 5.936, 2.7700000000000005, 4.26, 1.3259999999999998]


In [58]:
#18 - Extract row in rdd 3
avg_rdd = avg_row_rdd.map(lambda row : [row[key] for key in row.asDict().keys()] )
for row in avg_rdd.collect() :
    print(row)

['Iris-virginica', 6.587999999999998, 2.9739999999999998, 5.552, 2.026]
['Iris-setosa', 5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999]
['Iris-versicolor', 5.936, 2.7700000000000005, 4.26, 1.3259999999999998]
