# Spark Preparation
First, we check whether the notebook is running in Google Colab. If it is, we install all the necessary packages.

To run Spark in Colab, we first need to install all the dependencies in the Colab environment, i.e. Apache Spark 3.2.1 with Hadoop 3.2, Java 8, and Findspark (used to locate Spark on the system). The installation can be carried out directly inside the Colab Jupyter notebook.
Learn more from [A Must-Read Guide on How to Work with PySpark on Google Colab for Data Scientists!](https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/)

In [None]:
# Detect whether this notebook is running inside Google Colab.
# The `google.colab` package is imported only as a probe; when the import
# fails we are running somewhere else.
try:
  import google.colab  # noqa: F401 -- imported only to test availability
  IN_COLAB = True
except ImportError:
  # Catch only the "package not installed" case so that unrelated errors
  # (or a KeyboardInterrupt) are not silently swallowed.
  IN_COLAB = False

In [None]:
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
    !tar xf spark-3.2.1-bin-hadoop3.2.tgz
    !mv spark-3.2.1-bin-hadoop3.2 spark
    !pip install -q findspark

In [None]:
if IN_COLAB:
  import os

  # Point the session at the Java runtime and at the unpacked Spark
  # distribution installed by the Colab setup cell.
  os.environ.update({
      "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
      "SPARK_HOME": "/content/spark",
  })

In [None]:
import findspark
# Locate the Spark installation on this system so that `pyspark` can be
# imported in the cells below (presumably via SPARK_HOME when it is set --
# confirm against the findspark documentation).
findspark.init()

# Pyspark_Basic_RDD

In [None]:
#1 - import module
from pyspark import SparkContext

In [None]:
#2 - Create SparkContext
# getOrCreate() reuses the already-running context when there is one, so
# this cell can be re-run without creating a second SparkContext.
sc = SparkContext.getOrCreate()
# Last expression of the cell: displays the context summary in the notebook.
sc

In [None]:
import multiprocessing

# Number of CPUs reported for this machine; shown as the cell output.
multiprocessing.cpu_count()

In [None]:
#rdd.getNumPartitions

In [None]:
#3 - Helper: print the first `num` rows of an RDD
def printRDD(data,num):
    """Print the first ``num`` elements of ``data``, one element per line.

    Args:
        data: Object exposing ``take(n)`` that returns an iterable of
            elements (a Spark RDD in this notebook).
        num: Number of elements to request from ``data``.

    Returns:
        None. The elements are written to standard output.
    """
    rows = data.take(num)
    for row in rows:
        print(row)

In [None]:
# Download the iris dataset into the current working directory; the next
# cell reads it by its relative name "iris.csv".
!wget https://github.com/kaopanboonyuen/GISTDA2022/raw/main/dataset/iris.csv

In [None]:
#4 - Read file to spark RDD
# Every element of the RDD is one raw text line of the CSV file. cache()
# returns the same RDD, marked to be kept for reuse by the following cells.
rdd = sc.textFile("iris.csv").cache()

# Attribute Information (columns of each line, in order):
#   1. sepal length in cm
#   2. sepal width in cm
#   3. petal length in cm
#   4. petal width in cm
#   5. class:
#      -- Iris Setosa
#      -- Iris Versicolour
#      -- Iris Virginica

printRDD(rdd, 5)

In [None]:
#5 - map
# map() yields exactly one output element per input element: each CSV line
# becomes a single list holding its comma-separated fields.
mapped_rdd = rdd.map(lambda row: row.split(","))

printRDD(mapped_rdd, 5)

In [None]:
#6 - flatMap
# Same split as in the map example, but flatMap() flattens the per-line
# lists, so every individual field becomes its own RDD element.
flatMaped_rdd = rdd.flatMap(lambda row: row.split(","))

printRDD(flatMaped_rdd, 25)

In [None]:
#7 - create unique id
# Turn every line into a (line, id) pair, where id is a unique integer.
zipedWithUniqueId_rdd = rdd.zipWithUniqueId()

print(f"zipedWithUniqueId_rdd count : {zipedWithUniqueId_rdd.count()}")
printRDD(zipedWithUniqueId_rdd, 5)

In [None]:
#8 - sample data
# Draw a random subset without replacement; the fixed seed makes the
# selection reproducible between runs.
sampled_rdd = zipedWithUniqueId_rdd.sample(
    withReplacement=False,
    fraction=0.5,
    seed=50,
)

# Compare the size of the full RDD with the size of the sample.
print(f"rdd count : {zipedWithUniqueId_rdd.count()}")
print(f"sampled_rdd count : {sampled_rdd.count()}")
printRDD(sampled_rdd, 5)

In [None]:
#9 - union and intersect
# Two samples of the same RDD that differ only in their random seed.
sampled1_rdd = zipedWithUniqueId_rdd.sample(
    withReplacement=False, fraction=0.5, seed=25
)
sampled2_rdd = zipedWithUniqueId_rdd.sample(
    withReplacement=False, fraction=0.5, seed=50
)

# Combine the two samples in both ways so their sizes can be compared.
unioned_rdd = sampled1_rdd.union(sampled2_rdd)
intersected_rdd = sampled1_rdd.intersection(sampled2_rdd)

print(f"sampled1_rdd count : {sampled1_rdd.count()}")
print(f"sampled2_rdd count : {sampled2_rdd.count()}")
print(f"unioned_rdd count : {unioned_rdd.count()}")
print(f"intersected_rdd count : {intersected_rdd.count()}")

In [None]:
#10 - distinct
# Keep only the last field of every split line (the class column).
label_rdd = mapped_rdd.map(lambda fields: fields[-1])
printRDD(label_rdd, 5)

print("\n")

# Reduce the labels to their unique values and bring them to the driver.
label_list = label_rdd.distinct().collect()
print(label_list)

In [None]:
#11 - zip 2 rdd together
# Every field except the last one (the class label) is a feature.
feature_rdd = mapped_rdd.map(lambda line : line[0:-1])
printRDD(feature_rdd,5)

print("\n")

# zip() pairs the elements of the two RDDs position by position, giving
# (features, label) tuples. Both RDDs are map() results of mapped_rdd.
zip_rdd = feature_rdd.zip(label_rdd)
printRDD(zip_rdd,5)

print("\n")

# Merge each (features, label) tuple back into a single flat list.
# The element is a tuple, so it has to be unpacked by index: the previous
# lambda referenced an undefined name `label` and added a list to a tuple,
# which would raise as soon as an action was run on zip_rdd.
zip_rdd = zip_rdd.map(lambda pair : pair[0] + [pair[1]])

In [None]:
#12 - collect
# collect() brings every element of the RDD back as one Python list.
# Too many results => not a good method when dealing with big data.
data_list = rdd.collect()
print(f"data_list size : {len(data_list)}")
for data in data_list:
    print(data)

In [None]:
#13 - take
# take(n) returns only the first n rows as a Python list.
data_list = rdd.take(5)
print(f"data_list size : {len(data_list)}")
for data in data_list:
    print(data)

In [None]:
#14 - top
# top(n) returns the n largest elements in descending order -- not the first
# n rows. The elements here are text lines, so they are compared as strings.
data_list = rdd.top(5)
print(f"data_list size : {len(data_list)}")
for data in data_list:
    print(data)