### RDD Basics

In [1]:
# Import spark session
from pyspark.sql import SparkSession



In [3]:
# Create (or reuse) a SparkSession. The master "local[1]" runs Spark locally
# with a single worker thread, which is the smallest local configuration.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("RDDExample")
    .getOrCreate()
)

In [4]:
# Create an RDD by distributing a local Python collection (the integers 0..14)
data = list(range(15))
rdd = spark.sparkContext.parallelize(data)

In [22]:
# Create an RDD from an external data source: a text file read relative to the
# working directory. The records are later treated as strings (see the
# flatMap cell, which calls .split on each one).
rdd2 = spark.sparkContext.textFile("rdd_file.txt")

In [23]:
# Number of records and number of partitions of the file-backed RDD
(rdd2.count(), rdd2.getNumPartitions())

(126, 1)

In [10]:
# Number of records and number of partitions of the parallelized RDD
(rdd.count(), rdd.getNumPartitions())

(15, 1)

In [25]:

# Create an empty RDD via parallelize. Note: this RDD has no records but it is
# NOT partition-less -- the partition count falls back to the context's default
# parallelism (1 in this session, as the result below shows). For an RDD with
# zero partitions use spark.sparkContext.emptyRDD() instead.
# This rebinds `rdd`, replacing the 15-element RDD created earlier.
rdd = spark.sparkContext.parallelize([])
(rdd.count(), rdd.getNumPartitions())

(0, 1)

In [24]:

# Create an empty RDD with an explicit partition count: 10 (empty) partitions
rdd2 = spark.sparkContext.parallelize([], numSlices=10)
(rdd2.count(), rdd2.getNumPartitions())

(0, 10)

In [26]:
# Repartition: produce a new RDD with 4 partitions from the 10-partition rdd2,
# then confirm the resulting partition count.
reparRdd = rdd2.repartition(numPartitions=4)
reparRdd.getNumPartitions()

4

### RDD Transformations

In [27]:
# Create an RDD from an external data source (text file). This rebinds `rdd`,
# which the transformation examples below use as their input.
rdd = spark.sparkContext.textFile("rdd_file.txt")

#### Flatmap
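Applies a function that returns an iterable to every record and flattens the results: each item of each returned iterable becomes its own record in the new RDD.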

In [28]:
# Split every record on single spaces; each resulting token becomes a record
rdd2 = rdd.flatMap(lambda line: line.split(" "))

In [29]:
# Record and partition counts of the input RDD (before flatMap)
(rdd.count(), rdd.getNumPartitions())

(126, 1)

In [30]:
# Record and partition counts after flatMap (one record per token)
(rdd2.count(), rdd2.getNumPartitions())

(522, 1)

#### Map
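Applies a function to every record and returns a new RDD with exactly one output record per input record. On an RDD of rows it can be used to derive new column values or update existing ones.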

In [37]:
# Pair every token with the count 1, producing (token, 1) tuples.
# In PySpark the result of map here is a PipelinedRDD (see the type below);
# PairRDDFunctions is a Scala API type and does not exist in PySpark.
rdd3 = rdd2.map(lambda word: (word, 1))
type(rdd3)

pyspark.rdd.PipelinedRDD

In [36]:
# collect() returns every record of the RDD to the driver as a Python list.
# Acceptable for this small sample file; for large RDDs prefer take(n).
rdd3.collect()

[('Project', 1),
 ('Gutenberg’s', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('Project', 1),
 ('Gutenberg’s', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1)

In [38]:
# Sample records used to build a DataFrame: (firstname, lastname, gender, salary).
# This rebinds `data`, which earlier held a list of integers.
data = [
    ("James", "Smith", "M", 30),
    ("Anna", "Rose", "F", 41),
    ("Robert", "Williams", "M", 62),
]

In [39]:
# Build a DataFrame from the sample tuples, naming the columns explicitly
columns = ["firstname", "lastname", "gender", "salary"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|    30|
|     Anna|    Rose|     F|    41|
|   Robert|Williams|     M|    62|
+---------+--------+------+------+



In [40]:
# Confirm that createDataFrame produced a DataFrame
type(df)

pyspark.sql.dataframe.DataFrame

In [41]:
# Referring to columns by index: join first and last name with a comma,
# keep gender, and double the salary. The resulting RDD of tuples is then
# converted back to a DataFrame with new column names.
rdd2 = df.rdd.map(lambda row: (row[0] + "," + row[1], row[2], row[3] * 2))
df2 = rdd2.toDF(["name", "gender", "new_salary"])
df2.show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    James,Smith|     M|        60|
|      Anna,Rose|     F|        82|
|Robert,Williams|     M|       124|
+---------------+------+----------+



In [43]:
# toDF() on the mapped RDD yields a DataFrame again
type(df2)

pyspark.sql.dataframe.DataFrame

In [42]:
# Alternative: refer to columns by name using key lookup on each row.
# Same transformation as the index-based version.
rdd2 = df.rdd.map(
    lambda row: (
        row["firstname"] + "," + row["lastname"],
        row["gender"],
        row["salary"] * 2,
    )
)

In [45]:
# Convert the mapped RDD to a DataFrame with explicit column names and display it
rdd2.toDF(["name","gender","new_salary"]).show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    James,Smith|     M|        60|
|      Anna,Rose|     F|        82|
|Robert,Williams|     M|       124|
+---------------+------+----------+



In [47]:
# Another alternative: refer to columns by name using attribute access on
# each row. Same transformation as the index- and key-based versions.
rdd2 = df.rdd.map(
    lambda row: (
        row.firstname + "," + row.lastname,
        row.gender,
        row.salary * 2,
    )
)

In [48]:
# Convert the mapped RDD to a DataFrame with explicit column names and display it
rdd2.toDF(["name","gender","new_salary"]).show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    James,Smith|     M|        60|
|      Anna,Rose|     F|        82|
|Robert,Williams|     M|       124|
+---------------+------+----------+



#### Filter
Returns a new RDD containing only the records for which the given predicate function returns `True`.

In [58]:
# Keep only the rows whose doubled salary exceeds 70. rdd2 holds plain tuples,
# so it is first converted to a DataFrame to give the rows named fields; the
# predicate can then use row.new_salary.
rdd4 = (
    rdd2.toDF(["name", "gender", "new_salary"])
    .rdd
    .filter(lambda row: row.new_salary > 70)
)
type(rdd4)

pyspark.rdd.PipelinedRDD

In [59]:
# The filtered rows keep their field names (see the header in the output
# below), so toDF() is called without an explicit column list.
rdd4.toDF().show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|      Anna,Rose|     F|        82|
|Robert,Williams|     M|       124|
+---------------+------+----------+

