<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_rdd_and_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import Row

# create or get spark session
spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Spark_App1") \
        .getOrCreate()

# Spark 2.x does not give direct sparkcontext so we need to get it from SparkSession
sc = spark.sparkContext
#sc.appName #Spark_App1
#sc.uiWebUrl # WebUI
#sc.applicationId # application_xxxxxxxxxx_xxxx


#sc: SparkContext
  #1. play with RDD.
  #2. It comes from the Spark-Core
  #3. One only per application
#spark: SparkSession
  #1. play with DataFrame. 
  #2. It comes from the Spark-SQL.
  #3. It merges SQLContext and HiveContext into one object.
  #4. We can have multiple spark session objects in single application
  #5. It allows us to access the SparkContext


### RDD

In [3]:
# A RDD is a collection of object of any type. It's schemaless.
# There are three way we can create a RDD. 1. Using the parallelize method 2. Reading a file 3. From the another RDD
# Read a file and create a RDD - RDD Started in Spark 1.x
collection_rdd = sc.textFile("sample_data/anscombe.json")
#collection_rdd = sc.textFile("sample_data/*.json") # multiple files

# collection_rdd # sample_data/anscombe.json MapPartitionsRDD[260] at textFile at NativeMethodAccessorImpl.java:0
collection_rdd.setName("anscombe_rdd") # The Spark gives a internal name but we can specify the way we want. Even it helps in the SparkUI to identify specific RDD.
collection_rdd

anscombe_rdd MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [4]:
#collection_rdd.count() #49
#collection_rdd.take(5) # top 5 entities

In [5]:
collection_rdd.collect() # this is like a collection

['[',
 '  {"Series":"I", "X":10.0, "Y":8.04},',
 '  {"Series":"I", "X":8.0, "Y":6.95},',
 '  {"Series":"I", "X":13.0, "Y":7.58},',
 '  {"Series":"I", "X":9.0, "Y":8.81},',
 '  {"Series":"I", "X":11.0, "Y":8.33},',
 '  {"Series":"I", "X":14.0, "Y":9.96},',
 '  {"Series":"I", "X":6.0, "Y":7.24},',
 '  {"Series":"I", "X":4.0, "Y":4.26},',
 '  {"Series":"I", "X":12.0, "Y":10.84},',
 '  {"Series":"I", "X":7.0, "Y":4.81},',
 '  {"Series":"I", "X":5.0, "Y":5.68},',
 '',
 '  {"Series":"II", "X":10.0, "Y":9.14},',
 '  {"Series":"II", "X":8.0, "Y":8.14},',
 '  {"Series":"II", "X":13.0, "Y":8.74},',
 '  {"Series":"II", "X":9.0, "Y":8.77},',
 '  {"Series":"II", "X":11.0, "Y":9.26},',
 '  {"Series":"II", "X":14.0, "Y":8.10},',
 '  {"Series":"II", "X":6.0, "Y":6.13},',
 '  {"Series":"II", "X":4.0, "Y":3.10},',
 '  {"Series":"II", "X":12.0, "Y":9.13},',
 '  {"Series":"II", "X":7.0, "Y":7.26},',
 '  {"Series":"II", "X":5.0, "Y":4.74},',
 '',
 '  {"Series":"III", "X":10.0, "Y":7.46},',
 '  {"Series":"I

### DataFrame

In [6]:
# Read a file and create a DataFrame - DataFrame Started in Spark 2.x but on top of the RDD.
# Note: The loading(spark.read.json) the file into the dataframe is not lazy-loading like other transformation.
# But the RDD has lazy-loading. The dataframe inferes the schema so it needs to have eager loading.
json_df = spark.read.json('sample_data/anscombe.json') # first row is always a columns row.
json_df

DataFrame[Series: string, X: double, Y: double, _corrupt_record: string]

In [7]:
json_df.printSchema()

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [8]:
json_df.collect()

[Row(Series=None, X=None, Y=None, _corrupt_record='['),
 Row(Series='I', X=10.0, Y=8.04, _corrupt_record=None),
 Row(Series='I', X=8.0, Y=6.95, _corrupt_record=None),
 Row(Series='I', X=13.0, Y=7.58, _corrupt_record=None),
 Row(Series='I', X=9.0, Y=8.81, _corrupt_record=None),
 Row(Series='I', X=11.0, Y=8.33, _corrupt_record=None),
 Row(Series='I', X=14.0, Y=9.96, _corrupt_record=None),
 Row(Series='I', X=6.0, Y=7.24, _corrupt_record=None),
 Row(Series='I', X=4.0, Y=4.26, _corrupt_record=None),
 Row(Series='I', X=12.0, Y=10.84, _corrupt_record=None),
 Row(Series='I', X=7.0, Y=4.81, _corrupt_record=None),
 Row(Series='I', X=5.0, Y=5.68, _corrupt_record=None),
 Row(Series='II', X=10.0, Y=9.14, _corrupt_record=None),
 Row(Series='II', X=8.0, Y=8.14, _corrupt_record=None),
 Row(Series='II', X=13.0, Y=8.74, _corrupt_record=None),
 Row(Series='II', X=9.0, Y=8.77, _corrupt_record=None),
 Row(Series='II', X=11.0, Y=9.26, _corrupt_record=None),
 Row(Series='II', X=14.0, Y=8.1, _corrupt_record=N

### RDD to DataFrame and vice versa

In [9]:
generic_data_rdd = sc.parallelize([1, 'Alice', 50, 'Bob', 'Canton']) # RDD collection supports different type of items even in a nested colletion.
#generic_data_rdd.toDF() # It does not work because it contains object with differnt types.

In [123]:
people_list = [('Alice', 25, 1500, 15), ('Bob', 20, 1000, 10), ('Charly', 30, 2000, 20), ('Don', 15, 500, 5), ('Eric', 20, 1500, 15)]
people_rdd_pair = sc.parallelize(people_list) # create a RDD but with Tuple/Pair objects
#people_rdd_pair # ParallelCollectionRDD[2] at readRDDFromFile

people_rdd_pair.collect()

[('Alice', 25, 1500, 15),
 ('Bob', 20, 1000, 10),
 ('Charly', 30, 2000, 20),
 ('Don', 15, 500, 5),
 ('Eric', 20, 1500, 15)]

In [125]:
people_df = people_rdd_pair.toDF() # Column names have been automatically generated and assigned. It infers data types too.
#people_df # DataFrame[_1: string, _2: bigint]
people_df.collect() # shows data with its class

[Row(_1='Alice', _2=25, _3=1500, _4=15),
 Row(_1='Bob', _2=20, _3=1000, _4=10),
 Row(_1='Charly', _2=30, _3=2000, _4=20),
 Row(_1='Don', _2=15, _3=500, _4=5),
 Row(_1='Eric', _2=20, _3=1500, _4=15)]

In [126]:
people_df.show() # shows in tabular format

+------+---+----+---+
|    _1| _2|  _3| _4|
+------+---+----+---+
| Alice| 25|1500| 15|
|   Bob| 20|1000| 10|
|Charly| 30|2000| 20|
|   Don| 15| 500|  5|
|  Eric| 20|1500| 15|
+------+---+----+---+



In [127]:
from pyspark import Row
# There are two way to get custom column names.
# 1. Create the row class object while creating a RDD
data_rdd_row = sc.parallelize([Row(name='Alice', age=50), 
                               Row(name='Bob', age=20)])
data_rdd_row.collect() # [Row(name='Alice', age=50), Row(name='Bob', age=20)]

[Row(name='Alice', age=50), Row(name='Bob', age=20)]

In [129]:
# 2. Transform existing RDD to new RDD
people_rdd_row = people_rdd_pair.map(lambda x: 
                                      Row(
                                          name=x[0], 
                                          age=int(x[1]), 
                                          salary=int(x[2]),
                                          tax=int(x[3])
                                          )
                                     ) # create a RDD with Row objects
people_rdd_row # PythonRDD[63] at RDD
people_rdd_row.collect()

[Row(name='Alice', age=25, salary=1500, tax=15),
 Row(name='Bob', age=20, salary=1000, tax=10),
 Row(name='Charly', age=30, salary=2000, tax=20),
 Row(name='Don', age=15, salary=500, tax=5),
 Row(name='Eric', age=20, salary=1500, tax=15)]

In [130]:
data_rdd_row.toDF().show()
people_rdd_row.toDF().show()

+-----+---+
| name|age|
+-----+---+
|Alice| 50|
|  Bob| 20|
+-----+---+

+------+---+------+---+
|  name|age|salary|tax|
+------+---+------+---+
| Alice| 25|  1500| 15|
|   Bob| 20|  1000| 10|
|Charly| 30|  2000| 20|
|   Don| 15|   500|  5|
|  Eric| 20|  1500| 15|
+------+---+------+---+



In [131]:
people_df.rdd # Every DataFrame contains a RDD and we can get the dataframe's RDD easily.

MapPartitionsRDD[504] at javaToPython at NativeMethodAccessorImpl.java:0

In [132]:
people_df = people_rdd_row.toDF()
people_df.show()

+------+---+------+---+
|  name|age|salary|tax|
+------+---+------+---+
| Alice| 25|  1500| 15|
|   Bob| 20|  1000| 10|
|Charly| 30|  2000| 20|
|   Don| 15|   500|  5|
|  Eric| 20|  1500| 15|
+------+---+------+---+



In [133]:
# Create a dataframe
names_df = spark.createDataFrame([(1, 'Jugal'), (2, 'Garvik')])
#names_df # DataFrame[_1: bigint, _2: string]

#names_df.collect()
# [Row(_1=1, _2='Jugal'), Row(_1=2, _2='Garvik')]

#names_df.take(1) # only 1 rows
# [Row(_1=1, _2='Jugal')]

#names_df.show()
#+---+------+
#| _1|    _2|
#+---+------+
#|  1| Jugal|
#|  2|Garvik|
#+---+------+

#names_df.limit(1).show()
#+---+-----+
#| _1|   _2|
#+---+-----+
#|  1|Jugal|
#+---+-----+

# Note: _1 and _2 are two column names.

#names_with_schema_df = names_df.toDF("Id", "Name") # the toDF() is also avaiable with the dataframe to specify the schema.
#names_with_schema_df.show()
#+---+------+
#| Id|  Name|
#+---+------+
#|  1| Jugal|
#|  2|Garvik|
#+---+------+

#names_df.printSchema()
#root
# |-- _1: long (nullable = true)
# |-- _2: string (nullable = true)

#names_with_schema_df.printSchema()
#root
# |-- Id: long (nullable = true)
# |-- Name: string (nullable = true)

names_rdd = names_df.rdd
names_rdd.collect() # It contains the row objects.

[Row(_1=1, _2='Jugal'), Row(_1=2, _2='Garvik')]

### Transformation on DataFrame
####FlatMap, Map, ReduceByKey

In [134]:
# combining - within partition
# merging - across partition

# map operation on RDD
people_rdd = people_df.rdd.map(lambda x: (x.age, x.age + 5))
#people_rdd # PythonRDD[78] at RDD
people_rdd.collect()

[(25, 30), (20, 25), (30, 35), (15, 20), (20, 25)]

In [135]:
# Map: the map operation performs a transformation on every element in the RDD. 
# Similar kind of method is not available for the DataFrame, 
# so we compute and assign computed value in new column within the DataFrame. 

people_df_new_age = people_df.select('name', 'age').withColumn('new_age', people_df.age + 10) # we can type people_df.age to mention column name.
people_df_new_age.show()

people_df_new_age = people_df_new_age.withColumnRenamed('new_age', 'after_ten_years')
people_df_new_age = people_df_new_age.drop('age')
people_df_new_age.show()

+------+---+-------+
|  name|age|new_age|
+------+---+-------+
| Alice| 25|     35|
|   Bob| 20|     30|
|Charly| 30|     40|
|   Don| 15|     25|
|  Eric| 20|     30|
+------+---+-------+

+------+---------------+
|  name|after_ten_years|
+------+---------------+
| Alice|             35|
|   Bob|             30|
|Charly|             40|
|   Don|             25|
|  Eric|             30|
+------+---------------+



In [136]:
people_df.first()

Row(name='Alice', age=25, salary=1500, tax=15)

In [137]:
people_df.take(2)

[Row(name='Alice', age=25, salary=1500, tax=15),
 Row(name='Bob', age=20, salary=1000, tax=10)]

In [138]:
people_df.collect()[1][0] # It is a tabular format so can access specific cell.
# It always creates a clone then return so if the cell has any class type and we modify then it won't change in the dataframe.

'Bob'

In [139]:
"""
Text in the datasets/simple_titles.txt:
How can I use DataFrames in 2.0
What is an RDD and Schema RDD
How do I group by a field
Can I use Hive from HUE
"""

# read lines
lines_rdd = sc.textFile("datasets/simple_titles.txt")
print('lines', lines_rdd.collect())

# flat all words
words_rdd = lines_rdd.flatMap(lambda line: line.split(' '))
print('words', words_rdd.collect())

# map each word with 1 value
word_and_count_rdd = words_rdd.map(lambda x: (x,1))
print('word_for_count', word_and_count_rdd.collect())

# count each word by key
word_with_count_rdd = word_and_count_rdd.reduceByKey(lambda x,y: x + y)
print('word_with_count_rdd', word_with_count_rdd.collect())

lines ['How can I use DataFrames in 2.0', 'What is an RDD and Schema RDD', 'How do I group by a field', 'Can I use Hive from HUE']
words ['How', 'can', 'I', 'use', 'DataFrames', 'in', '2.0', 'What', 'is', 'an', 'RDD', 'and', 'Schema', 'RDD', 'How', 'do', 'I', 'group', 'by', 'a', 'field', 'Can', 'I', 'use', 'Hive', 'from', 'HUE']
word_for_count [('How', 1), ('can', 1), ('I', 1), ('use', 1), ('DataFrames', 1), ('in', 1), ('2.0', 1), ('What', 1), ('is', 1), ('an', 1), ('RDD', 1), ('and', 1), ('Schema', 1), ('RDD', 1), ('How', 1), ('do', 1), ('I', 1), ('group', 1), ('by', 1), ('a', 1), ('field', 1), ('Can', 1), ('I', 1), ('use', 1), ('Hive', 1), ('from', 1), ('HUE', 1)]
word_with_count_rdd [('use', 2), ('DataFrames', 1), ('in', 1), ('2.0', 1), ('What', 1), ('is', 1), ('an', 1), ('do', 1), ('group', 1), ('field', 1), ('Hive', 1), ('How', 2), ('can', 1), ('I', 3), ('RDD', 2), ('and', 1), ('Schema', 1), ('by', 1), ('a', 1), ('Can', 1), ('from', 1), ('HUE', 1)]


In [140]:
dir(people_df) # check all the methods which can be used with the RDD.

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_collect_as_arrow',
 '_jcols',
 '_jdf',
 '_jmap',
 '_jseq',
 '_lazy_rdd',
 '_repr_html_',
 '_sc',
 '_schema',
 '_sort_cols',
 '_support_repr_html',
 '_to_corrected_pandas_type',
 'agg',
 'alias',
 'approxQuantile',
 'cache',
 'checkpoint',
 'coalesce',
 'colRegex',
 'collect',
 'columns',
 'corr',
 'count',
 'cov',
 'createGlobalTempView',
 'createOrReplaceGlobalTempView',
 'createOrReplaceTempView',
 'createTempView',
 'crossJoin',
 'crosstab',
 'cube',
 'describe',
 'distinct',
 'drop',
 'dropDuplicates',
 'drop_duplicates',
 'dropna',
 'dtypes',
 'exceptAll',
 'explain',
 'fillna',
 'fi

In [141]:
#help(people_df) # even better with example but it gives details about all the methods.
help(people_df.collect) # for specific method

Help on method collect in module pyspark.sql.dataframe:

collect() method of pyspark.sql.dataframe.DataFrame instance
    Returns all the records as a list of :class:`Row`.
    
    .. versionadded:: 1.3.0
    
    Examples
    --------
    >>> df.collect()
    [Row(age=2, name='Alice'), Row(age=5, name='Bob')]



### Filter, Select, When

#### The where is the Filter alias.

In [142]:
from pyspark.sql.functions import col, lit, concat
from pyspark.sql.functions import when
std_df = people_df
std_df.show()

# 1. Canonical Notation
std_df['name']
std_df['Name']

# 2. Dot(.) Notation
std_df.name
#std_df.Name #AttributeError: 'DataFrame' object has no attribute 'Name'

# 3. Col Notation
#std_df[col['name']]

# 4. SQL Notation
#'name'

# Specify a constant with the lit(literal) otherwise it is considered as a column name.
std_df.select(std_df['Name'], std_df.name, col('name'), 'name', concat(lit('Std '), col('name')) , std_df.age + 10).show()
#std_df.select(col['name']).show()

std_df.select('name', when(col('age') > 30, 'Adult').otherwise('Young')).show()

+------+---+------+---+
|  name|age|salary|tax|
+------+---+------+---+
| Alice| 25|  1500| 15|
|   Bob| 20|  1000| 10|
|Charly| 30|  2000| 20|
|   Don| 15|   500|  5|
|  Eric| 20|  1500| 15|
+------+---+------+---+

+------+------+------+------+------------------+----------+
|  Name|  name|  name|  name|concat(Std , name)|(age + 10)|
+------+------+------+------+------------------+----------+
| Alice| Alice| Alice| Alice|         Std Alice|        35|
|   Bob|   Bob|   Bob|   Bob|           Std Bob|        30|
|Charly|Charly|Charly|Charly|        Std Charly|        40|
|   Don|   Don|   Don|   Don|           Std Don|        25|
|  Eric|  Eric|  Eric|  Eric|          Std Eric|        30|
+------+------+------+------+------------------+----------+

+------+----------------------------------------------+
|  name|CASE WHEN (age > 30) THEN Adult ELSE Young END|
+------+----------------------------------------------+
| Alice|                                         Young|
|   Bob|          

In [143]:
from pyspark.sql import functions as F
std_df.filter(F.col("name").like("%Bob%")).show() # the name contains Bob.

+----+---+------+---+
|name|age|salary|tax|
+----+---+------+---+
| Bob| 20|  1000| 10|
+----+---+------+---+



### OrderBy, GroupBy

In [144]:
people_df.orderBy(F.col("name"), ascending=False).show()

+------+---+------+---+
|  name|age|salary|tax|
+------+---+------+---+
|  Eric| 20|  1500| 15|
|   Don| 15|   500|  5|
|Charly| 30|  2000| 20|
|   Bob| 20|  1000| 10|
| Alice| 25|  1500| 15|
+------+---+------+---+



In [168]:
people_df.groupBy("age").count().show() # age count
people_df.groupBy("age").max("salary").show() # max salary of each age

# Multiple aggregation functions
people_df.groupBy("age")\
        .agg(
            F.min("salary"),
            F.max("salary"),
            F.avg("salary"),
            F.sum("salary"),
            F.sum("tax")
            )\
        .show()

+---+-----+
|age|count|
+---+-----+
| 25|    1|
| 15|    1|
| 30|    1|
| 20|    2|
+---+-----+

+---+-----------+
|age|max(salary)|
+---+-----------+
| 25|       1500|
| 15|        500|
| 30|       2000|
| 20|       1500|
+---+-----------+

+---+-----------+-----------+-----------+-----------+--------+
|age|min(salary)|max(salary)|avg(salary)|sum(salary)|sum(tax)|
+---+-----------+-----------+-----------+-----------+--------+
| 25|       1500|       1500|     1500.0|       1500|      15|
| 15|        500|        500|      500.0|        500|       5|
| 30|       2000|       2000|     2000.0|       2000|      20|
| 20|       1000|       1500|     1250.0|       2500|      25|
+---+-----------+-----------+-----------+-----------+--------+



###null

In [146]:
# Drop all rows if any column has null.
std_df.select("name", "age").dropna().show()

# Parameters: any, how, subset
std_df.select("name", "age").dropna(how='any')
std_df.select("name", "age").dropna(how='all')
std_df.select("name", "age").dropna(subset='age')

# Replace null with any value.
std_df.select("name", "age").fillna('[N/A]')

+------+---+
|  name|age|
+------+---+
| Alice| 25|
|   Bob| 20|
|Charly| 30|
|   Don| 15|
|  Eric| 20|
+------+---+



DataFrame[name: string, age: bigint]

###Explode

In [147]:
from pyspark.sql.functions import explode

veg_list_df = spark.createDataFrame([(['a', 'b', 'c'], 1), (['b', 'd'], 2)])
veg_list_df.show()

veg_list_df.select(explode(veg_list_df._1), veg_list_df._2).show()

veg_list_df.select(explode(veg_list_df._1), veg_list_df._2).distinct().show()

+---------+---+
|       _1| _2|
+---------+---+
|[a, b, c]|  1|
|   [b, d]|  2|
+---------+---+

+---+---+
|col| _2|
+---+---+
|  a|  1|
|  b|  1|
|  c|  1|
|  b|  2|
|  d|  2|
+---+---+

+---+---+
|col| _2|
+---+---+
|  a|  1|
|  b|  1|
|  d|  2|
|  c|  1|
|  b|  2|
+---+---+



###Partition

In [148]:
list1_rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9])
#list1_rdd # ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:274
#list1_rdd.collect() # [1, 2, 3, 4, 5, 6, 7, 8, 9]
#list1_rdd.getNumPartitions() # 2 - even though it is small data it has created 2 partition.
partitions_array = list1_rdd.glom() # gluam method bring all the partition as individual partition. don't try with big valume data.
partitions_array.collect()

[[1, 2, 3, 4], [5, 6, 7, 8, 9]]

In [149]:
list2_rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9], 3) # specify partition explicitly
list2_rdd.getNumPartitions() # 3
partitions_array = list2_rdd.glom()
partitions_array.collect()

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [150]:
list3_rdd = list2_rdd.repartition(2) # specifies new partition, up or down but there would be shuffling.
#partitions_array.getNumPartitions() # 2
#list2_rdd.glom().collect() # this is still having 2 only.
list3_rdd.glom().collect()

[[1, 2, 3, 7, 8, 9], [4, 5, 6]]

In [151]:
partitions_array = list3_rdd.coalesce(2) # only decreases and it doesnot have shuffling.
# Note: If the existing partation is x then the coalesce value cannot be more than the x. Even we pass more than the x but it does not change the number of partition.
partitions_array.glom().collect()

[[1, 2, 3, 7, 8, 9], [4, 5, 6]]

###Inferred Schemas

In [152]:
lines_rdd = sc.textFile("datasets/students.txt")
lines_rdd.collect()

['Emily,44,55,78', 'Andy,47,34,89', 'Rick,55,78,55', 'Aaron,66,34,98']

In [153]:
parts_rdd = lines_rdd.map(lambda l: l.split(","))
parts_rdd.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [154]:
students_rdd = parts_rdd.map(lambda p: Row(name=p[0], math=int(p[1]), english=int(p[2]), science=int(p[3])))
students_rdd.collect()

[Row(name='Emily', math=44, english=55, science=78),
 Row(name='Andy', math=47, english=34, science=89),
 Row(name='Rick', math=55, english=78, science=55),
 Row(name='Aaron', math=66, english=34, science=98)]

In [155]:
students_df = spark.createDataFrame(students_rdd)
students_df.createOrReplaceTempView("students_tbl")
students_df.columns

['name', 'math', 'english', 'science']

In [156]:
students_df.schema # Spark generates schema automatically.

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))

In [157]:
spark.sql("SELECT * FROM students_tbl").show()

+-----+----+-------+-------+
| name|math|english|science|
+-----+----+-------+-------+
|Emily|  44|     55|     78|
| Andy|  47|     34|     89|
| Rick|  55|     78|     55|
|Aaron|  66|     34|     98|
+-----+----+-------+-------+



 ### Explicit Schemas

In [158]:
students_rdd.collect()

[Row(name='Emily', math=44, english=55, science=78),
 Row(name='Andy', math=47, english=34, science=89),
 Row(name='Rick', math=55, english=78, science=55),
 Row(name='Aaron', math=66, english=34, science=98)]

In [159]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

fields = [StructField('name', StringType(), True),
          StructField('math', LongType(), True),
          StructField('english', LongType(), True),
          StructField('science', LongType(), True),
]
student_schema = StructType(fields)

students_explicit_df = spark.createDataFrame(students_rdd, student_schema)
students_explicit_df.columns

['name', 'math', 'english', 'science']

In [160]:
students_explicit_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- math: long (nullable = true)
 |-- english: long (nullable = true)
 |-- science: long (nullable = true)



In [161]:
students_explicit_df.schema

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))

In [162]:
students_explicit_df.dtypes # It shows column names with its data types.

[('name', 'string'),
 ('math', 'bigint'),
 ('english', 'bigint'),
 ('science', 'bigint')]

In [163]:
from pyspark.sql.types import IntegerType

# explicit change the data type
students_dt_df = students_explicit_df.withColumn('ScienceInt', students_explicit_df.math.cast('integer'))\
                                    .withColumn('MathInt', F.col("math").cast(IntegerType()))
students_dt_df

DataFrame[name: string, math: bigint, english: bigint, science: bigint, ScienceInt: int, MathInt: int]

### RDD Lineage(Operator/Dependency Graph)
Graph of transformation operations required to execute when an action is called.

In [164]:
parts_rdd.toDebugString()

b'(2) PythonRDD[605] at collect at <ipython-input-153-1c3a6166eca7>:2 []\n |  datasets/students.txt MapPartitionsRDD[604] at textFile at NativeMethodAccessorImpl.java:0 []\n |  datasets/students.txt HadoopRDD[603] at textFile at NativeMethodAccessorImpl.java:0 []'