# RESILIENT DISTRIBUTED DATASET

In [1]:
#RDD CREATION

In [2]:
#To create an RDD, you first need a SparkSession, which is the entry point to a PySpark application.
#A SparkSession is obtained through SparkSession.builder (an attribute in PySpark, not a method call) or by calling newSession() on an existing session.

In [3]:
#A SparkSession internally creates a SparkContext, which is exposed as its sparkContext attribute.
#You can create multiple SparkSession objects, but only one SparkContext per JVM.
#If you want to create a new SparkContext, you must first stop the existing one (using stop())
#before creating the new one.



In [4]:
#Import SparkSession
from pyspark.sql import SparkSession

In [5]:
# Get or create the SparkSession used by the examples in this notebook,
# running against a local[1] master.
spark1 = (
    SparkSession.builder
    .master("local[1]")
    .appName('rdd example')
    .getOrCreate()
)

In [6]:
# master() – If you are running on a cluster, pass your cluster's master URL as the argument to master();
#usually this is yarn or mesos, depending on your cluster setup.
#Use local[x] when running locally, without a cluster. x should be an integer greater than 0;
#it is the number of worker threads, which also sets the default parallelism (how many partitions are created by default when using RDD, DataFrame, and Dataset).
#Ideally, x should equal the number of CPU cores you have (local[*] uses all available cores).

In [7]:
#appName() – Sets your application name.
#getOrCreate() – Returns the existing SparkSession if there is one; otherwise it creates a new one.
#Creating a SparkSession object internally creates a SparkContext (only one per JVM).

In [8]:
# Bare expression as the cell's last line, so Jupyter displays the session object.
spark1

# creating RDD using sparkContext.parallelize()


In [20]:
#This is the basic method for creating an RDD. It is used when the data is already in memory (for example, loaded from a file or a database), and it requires all the data to be present on the driver program before the RDD is created.

In [21]:
# Build an RDD from an in-memory Python list held on the driver.
data = [1, 2, 3, 4, 8, 9, 6]
# The session this notebook creates is named spark1; the original `spark`
# is not defined here and raises NameError on a fresh kernel.
rdd = spark1.sparkContext.parallelize(data)

# creating RDD using sparkContext.textFile()

In [52]:
# Read the orders file as an RDD with one element per line of text.
orders_path = r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\part-00000.txt"
orders = spark1.sparkContext.textFile(orders_path)

# creating RDD using sparkContext.wholeTextFiles()

In [17]:
# wholeTextFiles() yields (file path, file content) pairs rather than individual lines.
orders_file = r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\part-00000.txt"
rdd3 = spark1.sparkContext.wholeTextFiles(orders_file)

In [25]:
# wholeTextFiles() function returns a PairRDD with the key being the file path and value being file content.

# creating empty RDD with no partition using sparkContext.emptyRDD 

In [19]:
# emptyRDD is a method and must be called to obtain an RDD (no partitions, no data).
# Without the parentheses rdd4 would be bound to the method object itself,
# and any later RDD operation on it would fail.
rdd4 = spark1.sparkContext.emptyRDD()

# creating empty RDD with partition

In [33]:
# parallelize([], 10) builds an RDD with no elements but 10 partitions.
# Uses spark1, the session this notebook creates (the original referenced `spark`).
rdd5 = spark1.sparkContext.parallelize([], 10)


In [53]:
# Preview the first 5 elements of the RDD; take(n) and collect() can both be
# used to view RDD contents.
orders.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [21]:
# Number of partitions of the orders RDD. The original referenced rdd2,
# which is not defined in the visible notebook and fails on a fresh kernel.
orders.getNumPartitions()

1

In [22]:
# glom() gathers each partition's records into a list; mapping len over those
# lists gives the number of records in each partition. Uses the orders RDD
# because rdd2 is not defined in the visible notebook.
orders.glom().map(len).collect()

[68883]

In [49]:
#creating rdd from a local text file

In [45]:
# Read sample.txt into a list of lines (line terminators stripped).
# The with-block closes the file handle as soon as the read finishes,
# instead of leaving it open until garbage collection.
with open('sample.txt') as sample_file:
    file = sample_file.read().splitlines()

In [46]:
# Display the list of lines held in `file`.
file

['0,aaa,3', '1,bbb,5', '2,ccc,8', '3,ddd,33', '4,eee,38']

In [47]:
# Distribute the in-memory list of lines as an RDD.
spark_context = spark1.sparkContext
rdd_p = spark_context.parallelize(file)

In [48]:
# collect() returns all of the RDD's elements as a Python list.
rdd_p.collect()

['0,aaa,3', '1,bbb,5', '2,ccc,8', '3,ddd,33', '4,eee,38']