<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_rdd_and_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import Row

# create or get spark session
spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Spark_App1") \
        .getOrCreate()

# Spark 2.x does not give direct sparkcontext so we need to get it from SparkSession
sc = spark.sparkContext
#sc.appName #Spark_App1
#sc.uiWebUrl # WebUI
#sc.applicationId # application_xxxxxxxxxx_xxxx

""" 
#sc: SparkContext
1. play with RDD.
2. It comes from the Spark-Core
3. One only per application
#spark: SparkSession
1. play with DataFrame. 
2. It comes from the Spark-SQL.
3. It merges SQLContext and HiveContext into one object.
4. We can have multiple spark session objects in single application
5. It allows us to access the SparkContext
"""

' \n#sc: SparkContext\n1. play with RDD.\n2. It comes from the Spark-Core\n3. One only per application\n#spark: SparkSession\n1. play with DataFrame. \n2. It comes from the Spark-SQL.\n3. It merges SQLContext and HiveContext into one object.\n4. We can have multiple spark session objects in single application\n5. It allows us to access the SparkContext\n'

### RDD

In [118]:
# There are three way we can create a RDD. 1. Using the parallelize method 2. Reading a file 3. From the another RDD
# Read a file and create a RDD - RDD Started in Spark 1.x
collection_rdd = sc.textFile("sample_data/anscombe.json")+
#collection_rdd = sc.textFile("sample_data/*.json") # multiple files

# collection_rdd # sample_data/anscombe.json MapPartitionsRDD[260] at textFile at NativeMethodAccessorImpl.java:0
collection_rdd.setName("anscombe_rdd") # The Spark gives a internal name but we can specify the way we want. Even it helps in the SparkUI to identify specific RDD.
collection_rdd

anscombe_rdd MapPartitionsRDD[262] at textFile at NativeMethodAccessorImpl.java:0

In [None]:
#collection_rdd.count() #49
#collection_rdd.take(5) # top 5 entities

In [None]:
collection_rdd.collect() # this is like a collection

['[',
 '  {"Series":"I", "X":10.0, "Y":8.04},',
 '  {"Series":"I", "X":8.0, "Y":6.95},',
 '  {"Series":"I", "X":13.0, "Y":7.58},',
 '  {"Series":"I", "X":9.0, "Y":8.81},',
 '  {"Series":"I", "X":11.0, "Y":8.33},',
 '  {"Series":"I", "X":14.0, "Y":9.96},',
 '  {"Series":"I", "X":6.0, "Y":7.24},',
 '  {"Series":"I", "X":4.0, "Y":4.26},',
 '  {"Series":"I", "X":12.0, "Y":10.84},',
 '  {"Series":"I", "X":7.0, "Y":4.81},',
 '  {"Series":"I", "X":5.0, "Y":5.68},',
 '',
 '  {"Series":"II", "X":10.0, "Y":9.14},',
 '  {"Series":"II", "X":8.0, "Y":8.14},',
 '  {"Series":"II", "X":13.0, "Y":8.74},',
 '  {"Series":"II", "X":9.0, "Y":8.77},',
 '  {"Series":"II", "X":11.0, "Y":9.26},',
 '  {"Series":"II", "X":14.0, "Y":8.10},',
 '  {"Series":"II", "X":6.0, "Y":6.13},',
 '  {"Series":"II", "X":4.0, "Y":3.10},',
 '  {"Series":"II", "X":12.0, "Y":9.13},',
 '  {"Series":"II", "X":7.0, "Y":7.26},',
 '  {"Series":"II", "X":5.0, "Y":4.74},',
 '',
 '  {"Series":"III", "X":10.0, "Y":7.46},',
 '  {"Series":"I

### DataFrame

In [None]:
# Read a file and create a DataFrame - DataFrame Started in Spark 2.x but on top of the RDD.
json_df = spark.read.json('sample_data/anscombe.json') # first row is always a columns row.
json_df

DataFrame[Series: string, X: double, Y: double, _corrupt_record: string]

In [None]:
json_df.printSchema()

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [None]:
json_df.collect()

[Row(Series=None, X=None, Y=None, _corrupt_record='['),
 Row(Series='I', X=10.0, Y=8.04, _corrupt_record=None),
 Row(Series='I', X=8.0, Y=6.95, _corrupt_record=None),
 Row(Series='I', X=13.0, Y=7.58, _corrupt_record=None),
 Row(Series='I', X=9.0, Y=8.81, _corrupt_record=None),
 Row(Series='I', X=11.0, Y=8.33, _corrupt_record=None),
 Row(Series='I', X=14.0, Y=9.96, _corrupt_record=None),
 Row(Series='I', X=6.0, Y=7.24, _corrupt_record=None),
 Row(Series='I', X=4.0, Y=4.26, _corrupt_record=None),
 Row(Series='I', X=12.0, Y=10.84, _corrupt_record=None),
 Row(Series='I', X=7.0, Y=4.81, _corrupt_record=None),
 Row(Series='I', X=5.0, Y=5.68, _corrupt_record=None),
 Row(Series='II', X=10.0, Y=9.14, _corrupt_record=None),
 Row(Series='II', X=8.0, Y=8.14, _corrupt_record=None),
 Row(Series='II', X=13.0, Y=8.74, _corrupt_record=None),
 Row(Series='II', X=9.0, Y=8.77, _corrupt_record=None),
 Row(Series='II', X=11.0, Y=9.26, _corrupt_record=None),
 Row(Series='II', X=14.0, Y=8.1, _corrupt_record=N

### RDD to DataFrame and vice versa

In [None]:
generic_data = sc.parallelize([1, 'Alice', 50, 'Bob', 'Canton']) # RDD collection supports different type of items even in a nested colletion.
#generic_data.toDF() # It does not work because it contains object with differnt types.

In [None]:
people_list = [('Alice', 25), ('Bob', 20), ('Charly', 30), ('Don', 15)]
people_rdd_pair = sc.parallelize(people_list) # create a RDD but with Tuple/Pair objects
#people_rdd_pair # ParallelCollectionRDD[2] at readRDDFromFile

people_rdd_pair.collect()

[('Alice', 25), ('Bob', 20), ('Charly', 30), ('Don', 15)]

In [None]:
people_df = people_rdd_pair.toDF() # Column names have been automatically generated and assigned. It infers data types too.
#people_df # DataFrame[_1: string, _2: bigint]
people_df.collect() # shows data with its class

[Row(_1='Alice', _2=25),
 Row(_1='Bob', _2=20),
 Row(_1='Charly', _2=30),
 Row(_1='Don', _2=15)]

In [None]:
people_df.show() # shows in tabular format

+------+---+
|    _1| _2|
+------+---+
| Alice| 25|
|   Bob| 20|
|Charly| 30|
|   Don| 15|
+------+---+



In [None]:
from pyspark import Row
# There are two way to get custom column names.
# 1. Create the row class object while creating a RDD
data_rdd_row = sc.parallelize([Row(name='Alice', age=50), 
                               Row(name='Bob', age=20)])
data_rdd_row.collect() # [Row(name='Alice', age=50), Row(name='Bob', age=20)]

[Row(name='Alice', age=50), Row(name='Bob', age=20)]

In [None]:
# 2. Transform existing RDD to new RDD
people_rdd_row = people_rdd_pair.map(lambda x: Row(name=x[0], age=int(x[1]))) # create a RDD with Row objects
people_rdd_row # PythonRDD[63] at RDD
people_rdd_row.collect()

[Row(name='Alice', age=25),
 Row(name='Bob', age=20),
 Row(name='Charly', age=30),
 Row(name='Don', age=15)]

In [None]:
data_rdd_row.toDF().show()
people_rdd_row.toDF().show()

+-----+---+
| name|age|
+-----+---+
|Alice| 50|
|  Bob| 20|
+-----+---+

+------+---+
|  name|age|
+------+---+
| Alice| 25|
|   Bob| 20|
|Charly| 30|
|   Don| 15|
+------+---+



In [None]:
people_df.rdd # Every DataFrame contains a RDD and we can get the dataframe's RDD easily.

MapPartitionsRDD[204] at javaToPython at NativeMethodAccessorImpl.java:0

In [None]:
people_df = data_rdd_row.toDF()

### Transformation on DataFrame
####FlatMap, Map, ReduceByKey

In [None]:
# map operation on RDD
people_rdd = people_df.rdd.map(lambda x: (x.name, x.age + 5))
#people_rdd # PythonRDD[78] at RDD
people_rdd.collect()

[('Alice', 55), ('Bob', 25)]

In [None]:
# Map: the map operation performs a transformation on every element in the RDD. 
# Similar kind of method is not available for the DataFrame, 
# so we compute and assign computed value in new column within the DataFrame. 

people_df_new_age = people_df.select('name', 'age').withColumn('new_age', people_df.age + 10) # we can type people_df.age to mention column name.
people_df_new_age.show()

people_df_new_age = people_df_new_age.withColumnRenamed('new_age', 'after_ten_years')
people_df_new_age = people_df_new_age.drop('age')
people_df_new_age.show()

+-----+---+-------+
| name|age|new_age|
+-----+---+-------+
|Alice| 50|     60|
|  Bob| 20|     30|
+-----+---+-------+

+-----+---------------+
| name|after_ten_years|
+-----+---------------+
|Alice|             60|
|  Bob|             30|
+-----+---------------+



In [None]:
people_df.first()

Row(name='Alice', age=50)

In [None]:
people_df.take(2)

[Row(name='Alice', age=50), Row(name='Bob', age=20)]

In [None]:
people_df.collect()[1][0] # It is a tabular format so can access specific cell.
# It always creates a clone then return so if the cell has any class type and we modify then it won't change in the dataframe.

'Bob'

In [None]:
"""
Text in the datasets/simple_titles.txt:
How can I use DataFrames in 2.0
What is an RDD and Schema RDD
How do I group by a field
Can I use Hive from HUE
"""

# read lines
lines_rdd = sc.textFile("datasets/simple_titles.txt")
print('lines', lines_rdd.collect())

# flat all words
words_rdd = lines_rdd.flatMap(lambda line: line.split(' '))
print('words', words_rdd.collect())

# map each word with 1 value
word_and_count_rdd = words_rdd.map(lambda x: (x,1))
print('word_for_count', word_and_count_rdd.collect())

# count each word by key
word_with_count_rdd = word_and_count_rdd.reduceByKey(lambda x,y: x + y)
print('word_with_count_rdd', word_with_count_rdd.collect())

lines ['How can I use DataFrames in 2.0', 'What is an RDD and Schema RDD', 'How do I group by a field', 'Can I use Hive from HUE']
words ['How', 'can', 'I', 'use', 'DataFrames', 'in', '2.0', 'What', 'is', 'an', 'RDD', 'and', 'Schema', 'RDD', 'How', 'do', 'I', 'group', 'by', 'a', 'field', 'Can', 'I', 'use', 'Hive', 'from', 'HUE']
word_for_count [('How', 1), ('can', 1), ('I', 1), ('use', 1), ('DataFrames', 1), ('in', 1), ('2.0', 1), ('What', 1), ('is', 1), ('an', 1), ('RDD', 1), ('and', 1), ('Schema', 1), ('RDD', 1), ('How', 1), ('do', 1), ('I', 1), ('group', 1), ('by', 1), ('a', 1), ('field', 1), ('Can', 1), ('I', 1), ('use', 1), ('Hive', 1), ('from', 1), ('HUE', 1)]
word_with_count_rdd [('use', 2), ('DataFrames', 1), ('in', 1), ('2.0', 1), ('What', 1), ('is', 1), ('an', 1), ('do', 1), ('group', 1), ('field', 1), ('Hive', 1), ('How', 2), ('can', 1), ('I', 3), ('RDD', 2), ('and', 1), ('Schema', 1), ('by', 1), ('a', 1), ('Can', 1), ('from', 1), ('HUE', 1)]


In [None]:
dir(people_df) # check all the methods which can be used with the RDD.

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_collect_as_arrow',
 '_jcols',
 '_jdf',
 '_jmap',
 '_jseq',
 '_lazy_rdd',
 '_repr_html_',
 '_sc',
 '_schema',
 '_sort_cols',
 '_support_repr_html',
 '_to_corrected_pandas_type',
 'agg',
 'alias',
 'approxQuantile',
 'cache',
 'checkpoint',
 'coalesce',
 'colRegex',
 'collect',
 'columns',
 'corr',
 'count',
 'cov',
 'createGlobalTempView',
 'createOrReplaceGlobalTempView',
 'createOrReplaceTempView',
 'createTempView',
 'crossJoin',
 'crosstab',
 'cube',
 'describe',
 'distinct',
 'drop',
 'dropDuplicates',
 'drop_duplicates',
 'dropna',
 'dtypes',
 'exceptAll',
 'explain',
 'fillna',
 'fi

In [None]:
help(people_df) # even better with example

Help on DataFrame in module pyspark.sql.dataframe object:

class DataFrame(pyspark.sql.pandas.map_ops.PandasMapOpsMixin, pyspark.sql.pandas.conversion.PandasConversionMixin)
 |  DataFrame(jdf, sql_ctx)
 |  
 |  A distributed collection of data grouped into named columns.
 |  
 |  A :class:`DataFrame` is equivalent to a relational table in Spark SQL,
 |  and can be created using various functions in :class:`SparkSession`::
 |  
 |      people = spark.read.parquet("...")
 |  
 |  Once created, it can be manipulated using the various domain-specific-language
 |  (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.
 |  
 |  To select a column from the :class:`DataFrame`, use the apply method::
 |  
 |      ageCol = people.age
 |  
 |  A more concrete example::
 |  
 |      # To create DataFrame using SparkSession
 |      people = spark.read.parquet("...")
 |      department = spark.read.parquet("...")
 |  
 |      people.filter(people.age > 30).join(department, people.deptId ==

###Partition

In [None]:
list_one_to_five_rdd = sc.parallelize([1, 2, 3, 4, 5, 6])
#list_one_to_five_rdd # ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:274
#list_one_to_five_rdd.collect() # [1, 2, 3, 4, 5, 6]
#list_one_to_five_rdd.getNumPartitions() # 2 - even though it is small data it has created 2 partition.
partitions_array = list_one_to_five_rdd.glom() # gluam method bring all the partition as individual partition. don't try with big valume data.
partitions_array.collect()

[[1, 2, 3], [4, 5, 6]]

In [None]:
list_one_to_five_rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3) # specify partition explicitly
list_one_to_five_rdd.getNumPartitions() # 3
partitions_array = list_one_to_five_rdd.glom()
partitions_array.collect()

[[1, 2], [3, 4], [5, 6]]

###Inferred Schemas

In [None]:
lines_rdd = sc.textFile("datasets/students.txt")
lines_rdd.collect()

['Emily,44,55,78', 'Andy,47,34,89', 'Rick,55,78,55', 'Aaron,66,34,98']

In [None]:
parts_rdd = lines_rdd.map(lambda l: l.split(","))
parts_rdd.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [None]:
students_rdd = parts_rdd.map(lambda p: Row(name=p[0], math=int(p[1]), english=int(p[2]), science=int(p[3])))
students_rdd.collect()

[Row(name='Emily', math=44, english=55, science=78),
 Row(name='Andy', math=47, english=34, science=89),
 Row(name='Rick', math=55, english=78, science=55),
 Row(name='Aaron', math=66, english=34, science=98)]

In [None]:
students_df = spark.createDataFrame(students_rdd)
students_df.createOrReplaceTempView("students_tbl")
students_df.columns

['name', 'math', 'english', 'science']

In [None]:
students_df.schema # Spark generates schema automatically.

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))

In [None]:
spark.sql("SELECT * FROM students_tbl").show()

+-----+----+-------+-------+
| name|math|english|science|
+-----+----+-------+-------+
|Emily|  44|     55|     78|
| Andy|  47|     34|     89|
| Rick|  55|     78|     55|
|Aaron|  66|     34|     98|
+-----+----+-------+-------+



 ### Explicit Schemas

In [None]:
parts_rdd.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

fields = [StructField('name', StringType(), True),
          StructField('math', LongType(), True),
          StructField('english', LongType(), True),
          StructField('science', LongType(), True),
]
student_schema = StructType(fields)

students_explicit_df = spark.createDataFrame(parts_rdd, student_schema)
students_explicit_df.columns

['name', 'math', 'english', 'science']

In [None]:
students_explicit_df.schema

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))