<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_rdd_and_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# create or get spark session
spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Spark_App1") \
        .getOrCreate()

# Spark 2.x does not give direct sparkcontext so we need to get it from SparkSession
sc = spark.sparkContext

In [3]:
# Read a file and create a RDD - RDD Started in Spark 1.x
collection_rdd = sc.textFile("sample_data/anscombe.json")
#collection_rdd = sc.textFile("sample_data/*.json") # multiple files

collection_rdd

sample_data/anscombe.json MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [4]:
collection_rdd.collect() # this is like a collection

['[',
 '  {"Series":"I", "X":10.0, "Y":8.04},',
 '  {"Series":"I", "X":8.0, "Y":6.95},',
 '  {"Series":"I", "X":13.0, "Y":7.58},',
 '  {"Series":"I", "X":9.0, "Y":8.81},',
 '  {"Series":"I", "X":11.0, "Y":8.33},',
 '  {"Series":"I", "X":14.0, "Y":9.96},',
 '  {"Series":"I", "X":6.0, "Y":7.24},',
 '  {"Series":"I", "X":4.0, "Y":4.26},',
 '  {"Series":"I", "X":12.0, "Y":10.84},',
 '  {"Series":"I", "X":7.0, "Y":4.81},',
 '  {"Series":"I", "X":5.0, "Y":5.68},',
 '',
 '  {"Series":"II", "X":10.0, "Y":9.14},',
 '  {"Series":"II", "X":8.0, "Y":8.14},',
 '  {"Series":"II", "X":13.0, "Y":8.74},',
 '  {"Series":"II", "X":9.0, "Y":8.77},',
 '  {"Series":"II", "X":11.0, "Y":9.26},',
 '  {"Series":"II", "X":14.0, "Y":8.10},',
 '  {"Series":"II", "X":6.0, "Y":6.13},',
 '  {"Series":"II", "X":4.0, "Y":3.10},',
 '  {"Series":"II", "X":12.0, "Y":9.13},',
 '  {"Series":"II", "X":7.0, "Y":7.26},',
 '  {"Series":"II", "X":5.0, "Y":4.74},',
 '',
 '  {"Series":"III", "X":10.0, "Y":7.46},',
 '  {"Series":"I

In [5]:
# Read a file and create a DataFrame - DataFrame Started in Spark 2.x but on top of the RDD.
json_df = spark.read.json('sample_data/anscombe.json') # first row is always a columns row.
json_df

DataFrame[Series: string, X: double, Y: double, _corrupt_record: string]

In [6]:
json_df.printSchema()

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [7]:
json_df.collect()

[Row(Series=None, X=None, Y=None, _corrupt_record='['),
 Row(Series='I', X=10.0, Y=8.04, _corrupt_record=None),
 Row(Series='I', X=8.0, Y=6.95, _corrupt_record=None),
 Row(Series='I', X=13.0, Y=7.58, _corrupt_record=None),
 Row(Series='I', X=9.0, Y=8.81, _corrupt_record=None),
 Row(Series='I', X=11.0, Y=8.33, _corrupt_record=None),
 Row(Series='I', X=14.0, Y=9.96, _corrupt_record=None),
 Row(Series='I', X=6.0, Y=7.24, _corrupt_record=None),
 Row(Series='I', X=4.0, Y=4.26, _corrupt_record=None),
 Row(Series='I', X=12.0, Y=10.84, _corrupt_record=None),
 Row(Series='I', X=7.0, Y=4.81, _corrupt_record=None),
 Row(Series='I', X=5.0, Y=5.68, _corrupt_record=None),
 Row(Series='II', X=10.0, Y=9.14, _corrupt_record=None),
 Row(Series='II', X=8.0, Y=8.14, _corrupt_record=None),
 Row(Series='II', X=13.0, Y=8.74, _corrupt_record=None),
 Row(Series='II', X=9.0, Y=8.77, _corrupt_record=None),
 Row(Series='II', X=11.0, Y=9.26, _corrupt_record=None),
 Row(Series='II', X=14.0, Y=8.1, _corrupt_record=N