In [4]:
# Initialise findspark before pyspark is imported in the next cell
# (presumably so the local Spark installation can be found — confirm for your environment).
import findspark
findspark.init()

In [5]:
import pyspark
from pyspark import SparkConf, SparkContext

# Only one SparkContext may be active per kernel. Calling the constructor
# again when this cell is re-run raises
# "ValueError: Cannot run multiple SparkContexts at once", so ask for the
# existing context and create one only when none is running yet.
# Note: if a context already exists, it is returned as-is and the
# configuration below is not applied to it.
spark_conf = SparkConf().setMaster("local").setAppName("New Spark Context")
sc = SparkContext.getOrCreate(conf=spark_conf)

In [6]:
# Display the SparkContext bound to `sc`
sc

In [7]:
# Build an RDD out of an in-memory Python list of words
words = ["Spark","is","a","framework","for","Big Data processing"]
RDD = sc.parallelize(words)

# Show which class the resulting object belongs to
print("The type of RDD is", type(RDD))

The type of RDD is <class 'pyspark.rdd.RDD'>


In [9]:
# Load a text file as an RDD of lines, suggesting a minimum of 3 partitions
points_path = "Cung cap du lieu buoi 2/5000_points.txt"
RDD1 = sc.textFile(points_path, minPartitions=3)
print("The type of RDD1 is", type(RDD1))

The type of RDD1 is <class 'pyspark.rdd.RDD'>


In [10]:
# Peek at the first element of RDD1 (one line of the text file)
RDD1.first()

'664159\t550946'

In [11]:
# Check how many partitions RDD1 was split into
RDD1.getNumPartitions()

3

In [12]:
# Load a text file from an hdfs:// URL as an RDD of lines
shakespeare_url = "hdfs://bigdata.laptrinhpython.net:19000/t8.shakespeare.txt"
RDD2 = sc.textFile(shakespeare_url)
print("The type of RDD2 is", type(RDD2))

The type of RDD2 is <class 'pyspark.rdd.RDD'>


In [13]:
# Peek at the first line of the file behind RDD2
RDD2.first()

'This is the 100th Etext file presented by Project Gutenberg, and'

In [14]:
# No minPartitions was requested for RDD2; check how many partitions it has
RDD2.getNumPartitions()

1

In [15]:
# Take the first 10 lines of RDD2 as a Python list, then turn that list
# back into a new, small RDD
first_lines = RDD2.take(10)
RDD3 = sc.parallelize(first_lines)

In [16]:
# Confirm that parallelize() produced an RDD
type(RDD3)

pyspark.rdd.RDD

In [17]:
# First element of RDD3 (built from the first lines taken from RDD2)
RDD3.first()

'This is the 100th Etext file presented by Project Gutenberg, and'

In [18]:
# map(): apply a function to every element; here each number is squared
RDD4 = sc.parallelize([1, 4, 3, 5, 6, 7])
RDD_map = RDD4.map(lambda n: n * n)
RDD_map.collect()

[1, 16, 9, 25, 36, 49]

In [20]:
# filter(): keep only the elements for which the predicate holds (> 3),
# then bring the surviving elements back as a Python list
RDD_filter = RDD4.filter(lambda value: value > 3)
numbers_all = RDD_filter.collect()

In [21]:
# Print the collected numbers, one per line
for number in numbers_all:
    print(number)

4
5
6
7


In [23]:
# flatMap(): each input element may yield several output elements, and the
# results are flattened into one RDD; here each phrase is split into words
phrases = ["Data Science", "Machine Learning", "Big Data"]
RDD_string = sc.parallelize(phrases)
RDD_flatmap = RDD_string.flatMap(lambda phrase: phrase.split(" "))
RDD_flatmap.collect()

['Data', 'Science', 'Machine', 'Learning', 'Big', 'Data']

In [26]:
# union(): concatenate the elements of RDD4 with those of RDD_map
RDD_union = RDD4.union(RDD_map)
RDD_union.collect()

[1, 4, 3, 5, 6, 7, 1, 16, 9, 25, 36, 49]

In [25]:
# union() keeps duplicates: every element of both inputs appears in the result.
# The inputs get their own names so this demo does not rebind RDD1 / RDD2,
# which earlier cells bound to the text-file RDDs.
RDD_left = sc.parallelize([1,2,4,4,5])
RDD_right = sc.parallelize([1,3,2,1,2])
RDD_union = RDD_left.union(RDD_right)
RDD_union.collect()

[1, 2, 4, 4, 5, 1, 3, 2, 1, 2]

In [27]:
# take(n): return the first n elements as a Python list
RDD_flatmap.take(4)

['Data', 'Science', 'Machine', 'Learning']

In [28]:
# count(): number of elements in the RDD
RDD_flatmap.count()

6

In [29]:
# first(): the first element of the RDD
RDD_flatmap.first()

'Data'

In [31]:
# collect(): bring every element of RDD4 back as a Python list
RDD4.collect()

[1, 4, 3, 5, 6, 7]

In [32]:
# reduce(): fold all elements into a single value with a two-argument
# function; here the elements are summed. The result is a plain value,
# not an RDD.
RDD_reduce = RDD4.reduce(lambda acc, item: acc + item)
RDD_reduce

26

In [33]:
# RDD holding the integers 0 through 99
numRDD = sc.parallelize(range(100))

In [34]:
# Number of partitions numRDD was created with
numRDD.getNumPartitions()

1

In [37]:
import os
import shutil

# saveAsTextFile() refuses to write into a directory that already exists,
# so re-running this cell would fail. Remove the previous output first.
# NOTE(review): assumes the relative path resolves to the local filesystem
# under the notebook's working directory — confirm for your setup.
number_dir = "number"
if os.path.isdir(number_dir):
    shutil.rmtree(number_dir)
numRDD.saveAsTextFile(number_dir)

In [36]:
import os
import shutil

# coalesce(1) reduces the RDD to a single partition before saving.
# saveAsTextFile() refuses to write into a directory that already exists,
# so re-running this cell would fail. Remove the previous output first.
# NOTE(review): assumes the relative path resolves to the local filesystem
# under the notebook's working directory — confirm for your setup.
number_all_dir = "number_all"
if os.path.isdir(number_all_dir):
    shutil.rmtree(number_all_dir)
numRDD.coalesce(1).saveAsTextFile(number_all_dir)

- Pair RDDs

In [38]:
# A pair RDD can be built directly from a list of (key, value) tuples
my_tuple = [
    ('SV001', 'Tran Van An'),
    ('SV002', 'Nguyen Van Anh'),
    ('SV003', 'Le Thi Cuc'),
]
pairRDD_tuple = sc.parallelize(my_tuple)
pairRDD_tuple.collect()

[('SV001', 'Tran Van An'),
 ('SV002', 'Nguyen Van Anh'),
 ('SV003', 'Le Thi Cuc')]

In [39]:
# A pair RDD can also be derived from a regular RDD by mapping each
# element to a (key, value) tuple
def to_pair(phrase):
    """Return the first two space-separated tokens of ``phrase`` as a tuple."""
    parts = phrase.split(' ')
    return (parts[0], parts[1])

my_list = ['hello bonjour', 'happy heureux', 'morning matin']
regularRDD = sc.parallelize(my_list)
pairRDD_RDD = regularRDD.map(to_pair)
pairRDD_RDD.collect()

[('hello', 'bonjour'), ('happy', 'heureux'), ('morning', 'matin')]

In [42]:
# reduceByKey(): merge the values that share a key using the given
# two-argument function (here: add the marks of each student together)
Rdd = sc.parallelize([('SV001',8),('SV002',9),('SV001',10),('SV002',9),('SV003',9)])

Rdd_Reduced = Rdd.reduceByKey(lambda total, mark: total + mark)

for student_id, total_mark in Rdd_Reduced.collect():
    print("{} co diem tong la {}".format(student_id, total_mark))

SV001 co diem tong la 18
SV002 co diem tong la 18
SV003 co diem tong la 9


In [43]:
# sortByKey(): return a new RDD ordered by key; ascending=False gives
# descending key order
Rdd_Reduced_Sort = Rdd_Reduced.sortByKey(ascending=False)
for student_id, total_mark in Rdd_Reduced_Sort.collect():
    print("{} co diem tong la {}".format(student_id, total_mark))

SV003 co diem tong la 9
SV002 co diem tong la 18
SV001 co diem tong la 18


In [44]:
# groupByKey(): group the values that share a key.
# First, show the input pairs that will be grouped.
Rdd.collect()

[('SV001', 8), ('SV002', 9), ('SV001', 10), ('SV002', 9), ('SV003', 9)]

In [45]:
# Group the marks by student id and collect the result as a Python list
# of (key, iterable-of-values) pairs
Rdd_Group_by = Rdd.groupByKey().collect()

In [46]:
# The grouped values come back as an iterable, so materialise them with
# list() to print the marks of each student
for student_id, grouped_marks in Rdd_Group_by:
    print(student_id, list(grouped_marks))

SV001 [8, 10]
SV002 [9, 9]
SV003 [9]


In [47]:
# join(): merge two pair RDDs on their keys.
# Inputs: one RDD of marks per subject, both keyed by student id.
math_marks = [('SV001',8),('SV002',9),('SV003',9)]
english_marks = [('SV001',9),('SV002',8),('SV003',8)]
Rdd_Math = sc.parallelize(math_marks)
Rdd_English = sc.parallelize(english_marks)

In [48]:
# Each joined element is (key, (value_from_Rdd_Math, value_from_Rdd_English)).
# Note: this rebinds `Rdd`, replacing the RDD used in the earlier cells.
Rdd = Rdd_Math.join(Rdd_English)
Rdd.collect()

[('SV002', (9, 8)), ('SV001', (8, 9)), ('SV003', (9, 8))]

- Actions on pair RDDs

In [49]:
# countByKey(): count the number of values for each key.
# First, show the pairs that will be counted.
Rdd.collect()

[('SV002', (9, 8)), ('SV001', (8, 9)), ('SV003', (9, 8))]

In [50]:
# countByKey() returns a dictionary-like mapping of key -> number of
# elements with that key; print one line per key
key_counts = Rdd.countByKey()
for student_id, n_values in key_counts.items():
    print(student_id, n_values)

SV002 1
SV001 1
SV003 1


In [51]:
# collectAsMap(): return the key-value pairs of the RDD as a dictionary.
# Note: a dictionary holds one value per key, so when a key occurs more than
# once only one of its values survives (in the output below, the last one
# listed for each key).
Rdd = sc.parallelize([('SV001', 8),('SV002', 9),('SV001', 10),('SV002', 6),('SV003', 9),('SV003', 8)])
Rdd.collectAsMap()

{'SV001': 10, 'SV002': 6, 'SV003': 8}