# Spark Preparation
We first check whether the notebook is running in Google Colab. If it is, we install all the necessary packages.

To run Spark in Colab, we first need to install all the dependencies in the Colab environment, i.e. Apache Spark 3.2.1 with Hadoop 3.2, Java 8, and findspark (used to locate Spark on the system). The installation can be carried out inside the Colab Jupyter notebook itself.
Learn more from [A Must-Read Guide on How to Work with PySpark on Google Colab for Data Scientists!](https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/)

credit: Natawut Nupairoj

In [None]:
# Detect whether the notebook runs inside Google Colab by probing for the
# `google.colab` package. Only an import failure means "not in Colab"; any
# other exception (e.g. KeyboardInterrupt) is allowed to propagate.
try:
    import google.colab  # imported only to test for its presence
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

In [None]:
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
    !tar xf spark-3.2.1-bin-hadoop3.2.tgz
    !mv spark-3.2.1-bin-hadoop3.2 spark
    !pip install -q findspark

In [None]:
# In Colab, export the locations of the Java 8 JDK and of the Spark directory
# through the JAVA_HOME / SPARK_HOME environment variables.
if IN_COLAB:
    import os

    os.environ.update(
        {
            "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
            "SPARK_HOME": "/content/spark",
        }
    )

# Pyspark_Basic_DataFrame

In [None]:
import findspark
# findspark is used to locate the Spark installation on this system.
findspark.init()

In [None]:
#1 - import module
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy
import pandas

In [None]:
#2 - Create SparkContext
# Reuse the existing SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()

# Bare expression: the notebook shows the context as the cell output.
sc

In [None]:
#3 - Set up the SparkSession (Spark SQL) under the app name "DataFrameHandOn"
session_builder = SparkSession.builder.appName("DataFrameHandOn")
spark = session_builder.getOrCreate()
print(spark)

In [None]:
# Download the iris dataset; it is read as "iris.csv" in the next step.
!wget https://github.com/kaopanboonyuen/GISTDA2022/raw/main/dataset/iris.csv

In [None]:
#4 - Read the CSV file into a Spark DataFrame
# The file is read without a header row (columns are referred to as
# _c0 ... _c4 further down) and with schema inference enabled.
csv_reader = spark.read.option("header", "false").option("inferSchema", "true")
# NOTE: if the path has no file:/// prefix, it may be resolved against HDFS
# instead of the local file system.
df = csv_reader.csv("iris.csv")
df.cache()
print("finish caching data")

# Attribute information (columns in file order):
#   1. sepal length in cm
#   2. sepal width in cm
#   3. petal length in cm
#   4. petal width in cm
#   5. class:
#      -- Iris Setosa
#      -- Iris Versicolour
#      -- Iris Virginica


In [None]:
#5 - Print sample 5 rows of all variables and schema
# Show the first 5 rows of the raw DataFrame.
df.show(5)

# Blank lines to separate the two outputs.
print("\n")

# Print the column names and types of the DataFrame.
df.printSchema()

In [None]:
# Take a 5% sample without replacement (fixed seed) and show it as a pandas DataFrame.
df.sample(withReplacement=False, fraction=0.05, seed=1234).toPandas()

In [None]:
# Sort by the first column (_c0) and show the result as a pandas DataFrame.
df.sort('_c0').toPandas()


In [None]:
# Same sort on _c0, but in descending order.
df.sort(desc('_c0')).toPandas()

In [None]:
#6 - Rename the columns
# The four measurement columns are aliased via SQL expressions; _c4 is passed
# through unchanged and renamed to "label" in a second step.
column_aliases = [
    "_c0 as sepal_length",
    "_c1 as sepal_width",
    "_c2 as petal_length",
    "_c3 as petal_width",
    "_c4",
]
renamed_df = df.selectExpr(*column_aliases).withColumnRenamed("_c4", "label")

# Print the first 5 rows of the renamed DataFrame, then its schema.
renamed_df.show(5)

print("\n")

renamed_df.printSchema()

In [None]:
# 5% sample of the renamed DataFrame (no replacement, fixed seed), shown via pandas.
renamed_df.sample(withReplacement=False, fraction=0.05, seed=1234).toPandas()

In [None]:
#7 - Create a unique id
# Add an "id" column, then preview a 5% sample of the result via pandas.
with_id_df = renamed_df.withColumn("id", monotonically_increasing_id())
with_id_df.sample(withReplacement=False, fraction=0.05, seed=1234).toPandas()

In [None]:
#8 - Sample data
# Draw a 50% sample without replacement (seed 50) and report its row count.
sample_df = renamed_df.sample(
    withReplacement=False,
    fraction=0.5,
    seed=50,
)
print(f"sample_df count : {sample_df.count()}")

In [None]:
#9 - Union and intersect
# Two 50% samples drawn with different seeds, combined both ways.
sample1_df = renamed_df.sample(withReplacement=False, fraction=0.5, seed=25)
sample2_df = renamed_df.sample(withReplacement=False, fraction=0.5, seed=50)
union_df = sample1_df.union(sample2_df)
intersected_df = sample1_df.intersect(sample2_df)

# Report the row count of every DataFrame, in the order they were built.
labelled_frames = (
    ("sample1_df", sample1_df),
    ("sample2_df", sample2_df),
    ("union_df", union_df),
    ("intersected_df", intersected_df),
)
for frame_name, frame in labelled_frames:
    print(f"{frame_name} count : {frame.count()}")

In [None]:
#10 - groupBy with count
# Count the rows per label and show the result as a pandas DataFrame.
renamed_df.groupBy("label").count().toPandas()

In [None]:
#11 - groupBy with average
# Average sepal_length per label.
avg_df = renamed_df.groupBy("label").avg("sepal_length")
avg_df.toPandas()


In [None]:
avg_df = renamed_df.groupBy("label").avg("sepal_length","sepal_width","petal_length","petal_width")
avg_df.toPandas()

In [None]:
#12 - Compute a DataFrame from a SQL command given as a string
# Register the DataFrame as the temp view "iris" so SQL can refer to it.
renamed_df.createOrReplaceTempView("iris")
all_df = spark.sql("select * from iris")
# Preview a 5% sample of the query result via pandas.
all_df.sample(withReplacement=False, fraction=0.05, seed=1234).toPandas()

In [None]:
# Per-label averages of the four measurement columns, computed with SQL on the "iris" view.
avg_df2 = spark.sql("select label,avg(sepal_length),avg(sepal_width),avg(petal_length),avg(petal_width) from iris group by label")
avg_df2.toPandas()

In [None]:
#13 - collect dataframe
# collect() returns the rows of avg_df2 as a Python list.
avg_row_list = avg_df2.collect()
for row in avg_row_list :
    print(row)

In [None]:
#14 - Row operations and properties
# Inspect the first collected row explicitly. The earlier version assigned
# temp_row but then read `row`, a loop variable left over from another cell.
temp_row = avg_row_list[0]
# A field can be read as an attribute or by key.
print(temp_row.label)
print(temp_row["label"])
# `in` on a Row searches the row's field names.
print("label" in temp_row)
print("wrong label" in temp_row)
print("all keys : " + str(list(temp_row.asDict())))

In [None]:
#15 - collect dataframe as rdd
# .rdd exposes the DataFrame as an RDD; collect() brings its rows to the driver.
avg_row_rdd = avg_df2.rdd
for row in avg_row_rdd.collect() :
    print(row)

In [None]:
#16 - Extract row fields in the RDD (1): build a tuple field by field
avg_rdd = avg_row_rdd.map(
    lambda rec: (
        rec["label"],
        rec["avg(sepal_length)"],
        rec["avg(sepal_width)"],
        rec["avg(petal_length)"],
        rec["avg(petal_width)"],
    )
)
for row in avg_rdd.collect():
    print(row)

In [None]:
#17 - Extract row fields in the RDD (2): look fields up from a list of keys
keys = [
    "label",
    "avg(sepal_length)",
    "avg(sepal_width)",
    "avg(petal_length)",
    "avg(petal_width)",
]
avg_rdd = avg_row_rdd.map(lambda rec: [rec[field] for field in keys])
for row in avg_rdd.collect():
    print(row)

In [None]:
#18 - Extract row fields in the RDD (3): take the keys from the row itself
# Iterating a dict yields its keys, so no explicit .keys() call is needed.
avg_rdd = avg_row_rdd.map(lambda rec: [rec[field] for field in rec.asDict()])
for row in avg_rdd.collect():
    print(row)