# Spark Preparation
We first check whether the notebook is running in Google Colab. If it is, we install all the necessary packages.

To run Spark in Colab, we first need to install its dependencies in the Colab environment, i.e. Apache Spark 3.2.1 built for Hadoop 3.2, Java 8, and findspark (used to locate the Spark installation on the system). The installation can be carried out from inside the Colab Jupyter notebook.
Learn more from [A Must-Read Guide on How to Work with PySpark on Google Colab for Data Scientists!](https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/)

In [1]:
# Detect whether the notebook runs inside Google Colab by probing for the
# `google.colab` package; the result is stored in IN_COLAB.
try:
  import google.colab  # imported only as a probe; the module itself is unused
  IN_COLAB = True
except ImportError:
  # Catch only the "package not available" case. A bare `except:` would also
  # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
  IN_COLAB = False

In [2]:
# Install the Spark toolchain, but only when running in Colab (IN_COLAB).
if IN_COLAB:
    # Java 8 (headless JDK); -qq and the redirect keep apt output quiet.
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    # Download the Spark 3.2.1 / Hadoop 3.2 binary distribution.
    # NOTE(review): dlcdn.apache.org presumably serves only current releases;
    # if this download fails, the copy under archive.apache.org/dist/spark/
    # may be needed instead -- confirm.
    !wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
    # Unpack the archive and rename the extracted directory to `spark`.
    !tar xf spark-3.2.1-bin-hadoop3.2.tgz
    !mv spark-3.2.1-bin-hadoop3.2 spark
    # findspark is imported by a later cell to locate the Spark installation.
    !pip install -q findspark

In [3]:
if IN_COLAB:
  import os
  # Point the process environment at the Java and Spark installations
  # set up for Colab.
  os.environ.update({
      "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
      "SPARK_HOME": "/content/spark",
  })

In [4]:
# findspark locates the Spark installation on this system so that the
# `pyspark` imports in the following cells work.
import findspark
findspark.init()

# Pyspark_Basic_RDD

In [5]:
#1 - import module
from pyspark import SparkContext

In [6]:
#2 - Create SparkContext
# getOrCreate() reuses an already-running SparkContext if there is one, so
# this cell can be re-run safely. (The original `sc = sc = ...` repeated the
# assignment target; a single assignment is all that is needed.)
sc = SparkContext.getOrCreate()
# Bare expression so the notebook displays the context summary.
sc

In [7]:
import multiprocessing

# Show how many CPUs this machine reports (displayed as the cell result).
multiprocessing.cpu_count()

2

In [8]:
# Placeholder: once an RDD exists, rdd.getNumPartitions() reports how many partitions it has.

In [9]:
#3 - Helper: print the first rows of an RDD
def printRDD(data,num):
    """Print the first `num` elements of `data`, one element per line.

    `data` must provide a `take(n)` method returning an iterable of
    elements (as a Spark RDD does); nothing is printed when it is empty.
    """
    first_rows = data.take(num)
    for row in first_rows:
        print(row)

In [10]:
# Download the Iris dataset (CSV) into the current working directory.
!wget https://github.com/kaopanboonyuen/GISTDA2022/raw/main/dataset/iris.csv

--2022-03-21 03:30:50--  https://github.com/kaopanboonyuen/GISTDA2022/raw/main/dataset/iris.csv
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/kaopanboonyuen/GISTDA2022/main/dataset/iris.csv [following]
--2022-03-21 03:30:50--  https://raw.githubusercontent.com/kaopanboonyuen/GISTDA2022/main/dataset/iris.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4550 (4.4K) [text/plain]
Saving to: ‘iris.csv’


2022-03-21 03:30:50 (40.1 MB/s) - ‘iris.csv’ saved [4550/4550]



In [11]:
#4 - Read file to spark RDD
# Load iris.csv as an RDD of text lines and mark it to be cached;
# cache() returns the same RDD, so the calls can be chained.
rdd = sc.textFile("iris.csv").cache()

# Columns of each line, in order:
#   1. sepal length (cm)
#   2. sepal width (cm)
#   3. petal length (cm)
#   4. petal width (cm)
#   5. class -- one of:
#        Iris Setosa
#        Iris Versicolour
#        Iris Virginica

# Preview the first five raw lines.
printRDD(rdd, 5)

5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa


In [12]:
#5 - map
# map() produces exactly one output element per input line: here each CSV
# line becomes a list of its comma-separated fields.
mapped_rdd = rdd.map(lambda row: row.split(","))

printRDD(mapped_rdd, 5)

['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
['4.9', '3.0', '1.4', '0.2', 'Iris-setosa']
['4.7', '3.2', '1.3', '0.2', 'Iris-setosa']
['4.6', '3.1', '1.5', '0.2', 'Iris-setosa']
['5.0', '3.6', '1.4', '0.2', 'Iris-setosa']


In [13]:
#6 - flatMap
# flatMap() splits each line like map() does, but flattens the resulting
# lists, so every field becomes its own RDD element.
flatMaped_rdd = rdd.flatMap(lambda row: row.split(","))

printRDD(flatMaped_rdd, 25)

5.1
3.5
1.4
0.2
Iris-setosa
4.9
3.0
1.4
0.2
Iris-setosa
4.7
3.2
1.3
0.2
Iris-setosa
4.6
3.1
1.5
0.2
Iris-setosa
5.0
3.6
1.4
0.2
Iris-setosa


In [14]:
#7 - create unique id
# Pair every line with an id: elements become (line, id) tuples. The ids
# are unique but not necessarily consecutive.
zipedWithUniqueId_rdd = rdd.zipWithUniqueId()

print(f"zipedWithUniqueId_rdd count : {zipedWithUniqueId_rdd.count()}")
printRDD(zipedWithUniqueId_rdd, 5)

zipedWithUniqueId_rdd count : 150
('5.1,3.5,1.4,0.2,Iris-setosa', 0)
('4.9,3.0,1.4,0.2,Iris-setosa', 2)
('4.7,3.2,1.3,0.2,Iris-setosa', 4)
('4.6,3.1,1.5,0.2,Iris-setosa', 6)
('5.0,3.6,1.4,0.2,Iris-setosa', 8)


In [15]:
#8 - sample data
# Draw a random sample without replacement. `fraction` is the expected
# share of elements, not an exact size; the fixed seed makes the sample
# reproducible.
sampled_rdd = zipedWithUniqueId_rdd.sample(
    withReplacement=False,
    fraction=0.5,
    seed=50,
)

print(f"rdd count : {zipedWithUniqueId_rdd.count()}")
print(f"sampled_rdd count : {sampled_rdd.count()}")
printRDD(sampled_rdd, 5)

rdd count : 150
sampled_rdd count : 61
('4.9,3.0,1.4,0.2,Iris-setosa', 2)
('4.7,3.2,1.3,0.2,Iris-setosa', 4)
('4.6,3.1,1.5,0.2,Iris-setosa', 6)
('5.0,3.6,1.4,0.2,Iris-setosa', 8)
('5.4,3.9,1.7,0.4,Iris-setosa', 10)


In [16]:
#9 - union and intersect
# Two samples of the same RDD that differ only in their seed.
sampled1_rdd = zipedWithUniqueId_rdd.sample(
    withReplacement=False, fraction=0.5, seed=25
)
sampled2_rdd = zipedWithUniqueId_rdd.sample(
    withReplacement=False, fraction=0.5, seed=50
)

# union() concatenates both samples (elements present in both appear
# twice); intersection() keeps only the elements found in both.
unioned_rdd = sampled1_rdd.union(sampled2_rdd)
intersected_rdd = sampled1_rdd.intersection(sampled2_rdd)

print(f"sampled1_rdd count : {sampled1_rdd.count()}")
print(f"sampled2_rdd count : {sampled2_rdd.count()}")
print(f"unioned_rdd count : {unioned_rdd.count()}")
print(f"intersected_rdd count : {intersected_rdd.count()}")

sampled1_rdd count : 64
sampled2_rdd count : 61
unioned_rdd count : 125
intersected_rdd count : 26


In [17]:
#10 - distinct
# The class label is the last field of every parsed row.
label_rdd = mapped_rdd.map(lambda fields: fields[-1])
printRDD(label_rdd, 5)

print("\n")

# distinct() drops duplicates; collect() brings the remaining labels back
# to the driver as a plain Python list.
label_list = label_rdd.distinct().collect()
print(label_list)

Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa


['Iris-setosa', 'Iris-virginica', 'Iris-versicolor']


In [18]:
#11 - zip 2 rdd together
# Features: every field of a parsed row except the last one (the label).
feature_rdd = mapped_rdd.map(lambda fields: fields[:-1])
printRDD(feature_rdd, 5)

print("\n")

# zip() pairs the two RDDs element by element -> (features, label) tuples.
zip_rdd = feature_rdd.zip(label_rdd)
printRDD(zip_rdd, 5)

print("\n")

# Flatten each (features, label) tuple back into a single list.
# The original lambda took the whole tuple as `features` and referenced an
# undefined name `label`, so it would have failed as soon as an action ran
# on this RDD; index into the pair explicitly instead.
zip_rdd = zip_rdd.map(lambda pair: pair[0] + [pair[1]])

['5.1', '3.5', '1.4', '0.2']
['4.9', '3.0', '1.4', '0.2']
['4.7', '3.2', '1.3', '0.2']
['4.6', '3.1', '1.5', '0.2']
['5.0', '3.6', '1.4', '0.2']


(['5.1', '3.5', '1.4', '0.2'], 'Iris-setosa')
(['4.9', '3.0', '1.4', '0.2'], 'Iris-setosa')
(['4.7', '3.2', '1.3', '0.2'], 'Iris-setosa')
(['4.6', '3.1', '1.5', '0.2'], 'Iris-setosa')
(['5.0', '3.6', '1.4', '0.2'], 'Iris-setosa')




In [19]:
#12 - collect
# collect() returns every element of the RDD to the driver as a list.
# That is fine for this small file, but not a good approach for big data:
# the whole result has to be held by the driver.
data_list = rdd.collect()
print(f"data_list size : {len(data_list)}")
for data in data_list:
    print(data)

data_list size : 150
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa

In [20]:
#13 - take
# take(n) returns only the first n elements of the RDD as a list.
data_list = rdd.take(5)
print(f"data_list size : {len(data_list)}")
for data in data_list:
    print(data)

data_list size : 5
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa


In [21]:
#14 - top
# top(n) returns the n largest elements in descending order. The elements
# here are raw text lines, so they are compared as strings.
data_list = rdd.top(5)
print(f"data_list size : {len(data_list)}")
for data in data_list:
    print(data)

data_list size : 5
7.9,3.8,6.4,2.0,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
