# Apache Spark

## Import PySpark

In [1]:
import findspark
findspark.init()

import pyspark

## Initiate Spark Session

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F

# Driver/executor sizing for this session, applied in a single call.
conf = SparkConf().setAll([
    ("spark.driver.memory", "1g"),
    ("spark.executor.memory", "1g"),
    ("spark.executor.instances", "2"),
])

# Build (or reuse) a Hive-enabled session that submits to YARN.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Spark BDP Example")
    .enableHiveSupport()
    .config(conf=conf)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
spark

In [4]:
sc = spark.sparkContext

In [5]:
sc

### RDD
Resilient Distributed Dataset

### RDD Creation

#### 1) Collections

In [8]:
li = ["Vishnu", "Tharun", "Sundar", "Raja", "Priya"]

In [9]:
type(li)

list

In [10]:
rdd = sc.parallelize(li)

In [11]:
type(rdd)

pyspark.rdd.RDD

In [12]:
rdd.count()

                                                                                

5

In [15]:
rdd_filter = rdd.filter(lambda x: x[0] == "R")

In [22]:
idx0 = lambda x: x[0]

In [24]:
[idx0(i) for i in li]

['V', 'T', 'S', 'R', 'P']

In [21]:
rdd_filter.collect()

['Raja']

In [25]:
rdd_filter1 = rdd.filter(lambda x: x[0] == "T")

In [26]:
type(rdd_filter1)

pyspark.rdd.PipelinedRDD

In [27]:
rdd_op = rdd_filter1.collect()

In [28]:
type(rdd_op)

list

In [29]:
rdd_op

['Tharun']

#### 2) TextFile

In [6]:
# -p: create missing parents and do not fail if the directory already exists,
# so this cell can be re-run safely.
!hdfs dfs -mkdir -p /user/bigdatapedia/spark_txt

In [7]:
# -f: overwrite the destination if it is already there, so re-running the
# notebook does not fail with "File exists".
!hdfs dfs -put -f /home/bigdatapedia/data/sample_de30_patient.txt /user/bigdatapedia/spark_txt/

In [8]:
!hdfs dfs -cat /user/bigdatapedia/spark_txt/*

id,name,drug,gender,age
51,senthil,paracetamol,male,12
32,saravanan,avil,male,44
33,rajesh,metacin,male,26
34,usha,paracetamol,female,20
25,alex,paracetamol,male,48
16,nasir,metacin,male,37
17,singh,paracetamol,male,15
18,santhosh,paracetamol,male,12
19,sarah,avil,female,14
40,raj,metacin,male,27


In [9]:
rdd_txt = sc.textFile("/user/bigdatapedia/spark_txt")

In [10]:
rdd_txt.count()

                                                                                

11

In [15]:
op_header = rdd_txt.first()

In [16]:
op_header

'id,name,drug,gender,age'

In [17]:
# Keep every line except the ones equal to the header captured in op_header.
rdd_row = rdd_txt.filter(lambda line: line != op_header)

In [40]:
type(rdd_row)

pyspark.rdd.PipelinedRDD

In [18]:
rdd_row.collect()

['51,senthil,paracetamol,male,12',
 '32,saravanan,avil,male,44',
 '33,rajesh,metacin,male,26',
 '34,usha,paracetamol,female,20',
 '25,alex,paracetamol,male,48',
 '16,nasir,metacin,male,37',
 '17,singh,paracetamol,male,15',
 '18,santhosh,paracetamol,male,12',
 '19,sarah,avil,female,14',
 '40,raj,metacin,male,27']

In [45]:
# collectAsMap() takes no arguments: build (name, drug) pairs with map() first,
# then collect the pair RDD back to the driver as a dict.
(
    rdd_row
    .map(lambda line: line.split(","))
    .map(lambda fields: (fields[1], fields[2]))
    .collectAsMap()
)

In [46]:
# Pitfall: rdd_row still holds raw CSV lines (plain str), so x[1] and x[2]
# pick single characters out of each line rather than the name/drug fields.
# The lines have to be split on "," before indexing by column position.
result_map = rdd_row.map(lambda x: (x[1], x[2])).collectAsMap()

In [47]:
result_map

{'1': ',',
 '2': ',',
 '3': ',',
 '4': ',',
 '5': ',',
 '6': ',',
 '7': ',',
 '8': ',',
 '9': ',',
 '0': ','}

In [20]:
rdd_row.count()

10

In [26]:
# Break each comma-separated line into a list of field strings.
rdd_split = rdd_row.map(lambda line: line.split(","))

In [31]:
rdd_split.first()

['51', 'senthil', 'paracetamol', 'male', '12']

In [32]:
rdd_split.first()[1]

'senthil'

In [34]:
rdd_row.map(lambda y : y.split(",")).collect()

[['51', 'senthil', 'paracetamol', 'male', '12'],
 ['32', 'saravanan', 'avil', 'male', '44'],
 ['33', 'rajesh', 'metacin', 'male', '26'],
 ['34', 'usha', 'paracetamol', 'female', '20'],
 ['25', 'alex', 'paracetamol', 'male', '48'],
 ['16', 'nasir', 'metacin', 'male', '37'],
 ['17', 'singh', 'paracetamol', 'male', '15'],
 ['18', 'santhosh', 'paracetamol', 'male', '12'],
 ['19', 'sarah', 'avil', 'female', '14'],
 ['40', 'raj', 'metacin', 'male', '27']]

In [35]:
rdd_op = rdd_row.map(lambda y : y.split(",")).collect()

In [36]:
type(rdd_op)

list

In [37]:
rdd_op

[['51', 'senthil', 'paracetamol', 'male', '12'],
 ['32', 'saravanan', 'avil', 'male', '44'],
 ['33', 'rajesh', 'metacin', 'male', '26'],
 ['34', 'usha', 'paracetamol', 'female', '20'],
 ['25', 'alex', 'paracetamol', 'male', '48'],
 ['16', 'nasir', 'metacin', 'male', '37'],
 ['17', 'singh', 'paracetamol', 'male', '15'],
 ['18', 'santhosh', 'paracetamol', 'male', '12'],
 ['19', 'sarah', 'avil', 'female', '14'],
 ['40', 'raj', 'metacin', 'male', '27']]

In [38]:
rdd_op[0]

['51', 'senthil', 'paracetamol', 'male', '12']

In [39]:
rdd_op[0][1]

'senthil'

In [48]:
# Split each line into fields, pair name (index 1) with drug (index 2),
# and bring the pairs back to the driver as a dict.
result_map = (
    rdd_row
    .map(lambda line: line.split(","))
    .map(lambda fields: (fields[1], fields[2]))
    .collectAsMap()
)

In [49]:
result_map

{'senthil': 'paracetamol',
 'saravanan': 'avil',
 'rajesh': 'metacin',
 'usha': 'paracetamol',
 'alex': 'paracetamol',
 'nasir': 'metacin',
 'singh': 'paracetamol',
 'santhosh': 'paracetamol',
 'sarah': 'avil',
 'raj': 'metacin'}

##### Lambda

In [21]:
def add(x, y):
    """Return the result of ``x + y``."""
    return x + y

In [23]:
add(1000, 540)

1540

In [24]:
# The same addition written as an anonymous function bound to a name.
add_lmda = lambda x, y: x + y

In [25]:
add_lmda(1000, 540)

1540

## RDD to DataFrame

In [52]:
from pyspark.sql import Row

In [50]:
rdd_split = rdd_row.map(lambda y : y.split(","))

In [51]:
rdd_split.first()

['51', 'senthil', 'paracetamol', 'male', '12']

In [66]:
def _to_patient_row(fields):
    """Turn one split CSV record into a Row, casting id and age to int."""
    return Row(
        id=int(fields[0]),
        name=fields[1],
        gender=fields[3],
        drug=fields[2],
        age=int(fields[4]),
    )


rdd_df_row = rdd_split.map(_to_patient_row)

In [67]:
rdd_row_op = rdd_df_row.collect()
rdd_row_op

[Row(id=51, name='senthil', gender='male', drug='paracetamol', age=12),
 Row(id=32, name='saravanan', gender='male', drug='avil', age=44),
 Row(id=33, name='rajesh', gender='male', drug='metacin', age=26),
 Row(id=34, name='usha', gender='female', drug='paracetamol', age=20),
 Row(id=25, name='alex', gender='male', drug='paracetamol', age=48),
 Row(id=16, name='nasir', gender='male', drug='metacin', age=37),
 Row(id=17, name='singh', gender='male', drug='paracetamol', age=15),
 Row(id=18, name='santhosh', gender='male', drug='paracetamol', age=12),
 Row(id=19, name='sarah', gender='female', drug='avil', age=14),
 Row(id=40, name='raj', gender='male', drug='metacin', age=27)]

In [59]:
[x.id for x in rdd_row_op]

[51, 32, 33, 34, 25, 16, 17, 18, 19, 40]

In [61]:
[x.name for x in rdd_row_op]

['senthil',
 'saravanan',
 'rajesh',
 'usha',
 'alex',
 'nasir',
 'singh',
 'santhosh',
 'sarah',
 'raj']

In [63]:
[x.age for x in rdd_row_op]

[12, 44, 26, 20, 48, 37, 15, 12, 14, 27]

In [73]:
df = spark.createDataFrame(rdd_df_row)

In [74]:
df.show()

+---+---------+------+-----------+---+
| id|     name|gender|       drug|age|
+---+---------+------+-----------+---+
| 51|  senthil|  male|paracetamol| 12|
| 32|saravanan|  male|       avil| 44|
| 33|   rajesh|  male|    metacin| 26|
| 34|     usha|female|paracetamol| 20|
| 25|     alex|  male|paracetamol| 48|
| 16|    nasir|  male|    metacin| 37|
| 17|    singh|  male|paracetamol| 15|
| 18| santhosh|  male|paracetamol| 12|
| 19|    sarah|female|       avil| 14|
| 40|      raj|  male|    metacin| 27|
+---+---------+------+-----------+---+



In [71]:
df_1 = rdd_df_row.toDF()

In [72]:
df_1.show()

+---+---------+------+-----------+---+
| id|     name|gender|       drug|age|
+---+---------+------+-----------+---+
| 51|  senthil|  male|paracetamol| 12|
| 32|saravanan|  male|       avil| 44|
| 33|   rajesh|  male|    metacin| 26|
| 34|     usha|female|paracetamol| 20|
| 25|     alex|  male|paracetamol| 48|
| 16|    nasir|  male|    metacin| 37|
| 17|    singh|  male|paracetamol| 15|
| 18| santhosh|  male|paracetamol| 12|
| 19|    sarah|female|       avil| 14|
| 40|      raj|  male|    metacin| 27|
+---+---------+------+-----------+---+



In [77]:
type(df_1)

pyspark.sql.dataframe.DataFrame

### DataFrame to RDD

In [78]:
df_to_rdd = df_1.rdd

In [79]:
type(df_to_rdd)

pyspark.rdd.RDD

In [81]:
df_to_rdd.collect()

                                                                                

[Row(id=51, name='senthil', gender='male', drug='paracetamol', age=12),
 Row(id=32, name='saravanan', gender='male', drug='avil', age=44),
 Row(id=33, name='rajesh', gender='male', drug='metacin', age=26),
 Row(id=34, name='usha', gender='female', drug='paracetamol', age=20),
 Row(id=25, name='alex', gender='male', drug='paracetamol', age=48),
 Row(id=16, name='nasir', gender='male', drug='metacin', age=37),
 Row(id=17, name='singh', gender='male', drug='paracetamol', age=15),
 Row(id=18, name='santhosh', gender='male', drug='paracetamol', age=12),
 Row(id=19, name='sarah', gender='female', drug='avil', age=14),
 Row(id=40, name='raj', gender='male', drug='metacin', age=27)]

## DataFrames

In [76]:
df.select("id", "name", "age").show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
| 51|  senthil| 12|
| 32|saravanan| 44|
| 33|   rajesh| 26|
| 34|     usha| 20|
| 25|     alex| 48|
| 16|    nasir| 37|
| 17|    singh| 15|
| 18| santhosh| 12|
| 19|    sarah| 14|
| 40|      raj| 27|
+---+---------+---+



In [80]:
# Lineage of the RDD behind the DataFrame; the value comes back as bytes,
# which is why the displayed repr contains literal \n sequences.
lineage = df.rdd.toDebugString()
lineage

b'(2) MapPartitionsRDD[64] at javaToPython at NativeMethodAccessorImpl.java:0 []\n |  MapPartitionsRDD[63] at javaToPython at NativeMethodAccessorImpl.java:0 []\n |  SQLExecutionRDD[62] at javaToPython at NativeMethodAccessorImpl.java:0 []\n |  MapPartitionsRDD[61] at javaToPython at NativeMethodAccessorImpl.java:0 []\n |  MapPartitionsRDD[50] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0 []\n |  MapPartitionsRDD[49] at map at SerDeUtil.scala:69 []\n |  MapPartitionsRDD[48] at mapPartitions at SerDeUtil.scala:117 []\n |  PythonRDD[47] at RDD at PythonRDD.scala:53 []\n |  /user/bigdatapedia/spark_txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []\n |  /user/bigdatapedia/spark_txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []'

In [None]:
(2) MapPartitionsRDD[64] at javaToPython at NativeMethodAccessorImpl.java:0 []
|  MapPartitionsRDD[63] at javaToPython at NativeMethodAccessorImpl.java:0 []
|  SQLExecutionRDD[62] at javaToPython at NativeMethodAccessorImpl.java:0 []
|  MapPartitionsRDD[61] at javaToPython at NativeMethodAccessorImpl.java:0 []
|  MapPartitionsRDD[50] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0 []
|  MapPartitionsRDD[49] at map at SerDeUtil.scala:69 []
|  MapPartitionsRDD[48] at mapPartitions at SerDeUtil.scala:117 []
|  PythonRDD[47] at RDD at PythonRDD.scala:53 []
|  /user/bigdatapedia/spark_txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []
|  /user/bigdatapedia/spark_txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []

### Load Sample Data

In [84]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/parquet

In [85]:
# -f: overwrite the destination if it is already there, so re-running the
# notebook does not fail with "File exists".
!hdfs dfs -put -f /home/bigdatapedia/data/customer_parq.parquet /user/bigdatapedia/input/parquet/

In [87]:
!hdfs dfs -ls /user/bigdatapedia/input/parquet

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup     254648 2023-07-16 03:19 /user/bigdatapedia/input/parquet/customer_parq.parquet


### Transformations
- Narrow Transformation
- Wide Transformation (Shuffle)

In [89]:
df_cust = spark.read.parquet("/user/bigdatapedia/input/parquet")

                                                                                

In [90]:
df_cust

DataFrame[customer_id: int, customer_fname: string, customer_lname: string, customer_email: string, customer_password: string, customer_street: string, customer_city: string, customer_state: string, customer_zipcode: string]

### Narrow Transformation

In [99]:
from pyspark.sql.functions import lit, current_timestamp

#### 1) select 

In [91]:
df_cust.select("customer_id", "customer_fname", "customer_state").show(5)

[Stage 41:>                                                         (0 + 1) / 1]

+-----------+--------------+--------------+
|customer_id|customer_fname|customer_state|
+-----------+--------------+--------------+
|          1|       Richard|            TX|
|          2|          Mary|            CO|
|          3|           Ann|            PR|
|          4|          Mary|            CA|
|          5|        Robert|            PR|
+-----------+--------------+--------------+
only showing top 5 rows



                                                                                

#### 2) withColumn

In [94]:
# Derive new_cust_id by adding 100 to the existing customer_id column.
df_cust_wc = df_cust.withColumn("new_cust_id", df_cust.customer_id + 100)

In [95]:
df_cust_wc.show(10)

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+-----------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|customer_city|customer_state|customer_zipcode|new_cust_id|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+-----------+
|          1|       Richard|     Hernandez|     XXXXXXXXX|        XXXXXXXXX|  6303 Heather Plaza|  Brownsville|            TX|           78521|        101|
|          2|          Mary|       Barrett|     XXXXXXXXX|        XXXXXXXXX|9526 Noble Embers...|    Littleton|            CO|           80126|        102|
|          3|           Ann|         Smith|     XXXXXXXXX|        XXXXXXXXX|3422 Blue Pioneer...|       Caguas|            PR|           00725|        103|
|          4|          Mary|         Jones|     XXXXXXXXX|      

In [97]:
# A bare lit(None) produces an untyped (NullType) column, which some sinks
# and operations cannot handle; casting gives the all-null placeholder column
# a concrete string type while the displayed values stay null.
df_cust_wc_2 = df_cust.withColumn("newnull", lit(None).cast("string"))

In [98]:
df_cust_wc_2.show()

+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+-------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|     customer_street|customer_city|customer_state|customer_zipcode|newnull|
+-----------+--------------+--------------+--------------+-----------------+--------------------+-------------+--------------+----------------+-------+
|          1|       Richard|     Hernandez|     XXXXXXXXX|        XXXXXXXXX|  6303 Heather Plaza|  Brownsville|            TX|           78521|   null|
|          2|          Mary|       Barrett|     XXXXXXXXX|        XXXXXXXXX|9526 Noble Embers...|    Littleton|            CO|           80126|   null|
|          3|           Ann|         Smith|     XXXXXXXXX|        XXXXXXXXX|3422 Blue Pioneer...|       Caguas|            PR|           00725|   null|
|          4|          Mary|         Jones|     XXXXXXXXX|        XXXXXXXXX|  8324 Littl

In [100]:
# Stamp each row with the timestamp Spark evaluates for the query.
df_cust_wc_3 = df_cust.withColumn("cre_ts", F.current_timestamp())

In [104]:
# Show the first 5 rows without truncating the timestamp column.
df_cust_wc_3.select("customer_id", "customer_fname", "cre_ts").show(n=5, truncate=False)

+-----------+--------------+-----------------------+
|customer_id|customer_fname|cre_ts                 |
+-----------+--------------+-----------------------+
|1          |Richard       |2023-07-16 03:29:55.108|
|2          |Mary          |2023-07-16 03:29:55.108|
|3          |Ann           |2023-07-16 03:29:55.108|
|4          |Mary          |2023-07-16 03:29:55.108|
|5          |Robert        |2023-07-16 03:29:55.108|
+-----------+--------------+-----------------------+
only showing top 5 rows

