In [1]:
from functions import extractInformation

In [2]:
# NOTE(review): `sc` is never assigned in this notebook -- presumably the
# SparkContext injected by the PySpark kernel/shell at start-up; confirm.
# Evaluating it as the last expression simply displays it.
sc

# Processing data

- Parallelize data:

In [3]:
# Distribute a small in-memory list of (string, int) tuples as an RDD.
data = sc.parallelize([
    ('Amber', 22),
    ('Alfred', 23),
    ('Skye', 4),
    ('Albert', 12),
    ('Amber', 9),
])

In [4]:
# Return all the elements of the RDD to the driver and display them.
data.collect()

[('Amber', 22), ('Alfred', 23), ('Skye', 4), ('Albert', 12), ('Amber', 9)]

- External or local file:

In [5]:
# Build an RDD from the text file, then preview its first two elements
# (each element is one raw line of the file, as the output below shows).
data_from_file = sc.textFile('./data/VS14MORT.txt')
data_from_file.take(2)

['                   1                                          2101  M1087 432311  4M4                2014U7CN                                    I64 238 070   24 0111I64                                                                                                                                                                           01 I64                                                                                                  01  11                                 100 601',
 '                   1                                          2101  M1058 371708  4D3                2014U7CN                                    I250214 062   21 0311I250 61I272 62E669                                                                                                                                                            03 I250 E669 I272                                                                                       01  11                                 100 601']

# Resilient Distributed Dataset (RDD)

Resilient Distributed Datasets (RDDs) are a distributed collection of immutable JVM
objects that allow you to perform calculations very quickly, and they are the backbone
of Apache Spark.

# Transformations

## Map

In [6]:
# Run extractInformation (imported from the local `functions` module) over
# every element of data_from_file.
data_from_file_conv = data_from_file.map(extractInformation)

In [7]:
# Preview the first converted record.
data_from_file_conv.take(1)

[array(['1', '  ', '2', '1', '01', 'M', '1', '087', ' ', '43', '23', '11',
        '  ', '4', 'M', '4', '2014', 'U', '7', 'C', 'N', ' ', ' ', 'I64 ',
        '238', '070', '   ', '24', '01', '11I64  ', '       ', '       ',
        '       ', '       ', '       ', '       ', '       ', '       ',
        '       ', '       ', '       ', '       ', '       ', '       ',
        '       ', '       ', '       ', '       ', '       ', '01',
        'I64  ', '     ', '     ', '     ', '     ', '     ', '     ',
        '     ', '     ', '     ', '     ', '     ', '     ', '     ',
        '     ', '     ', '     ', '     ', '     ', '     ', '01', ' ',
        ' ', '1', '1', '100', '6'], dtype='<U40')]

Create a new dataset that keeps only the year of death (field 16), converted to a numeric value.

In [8]:
# Keep only field 16 of every record, converted to an integer.
data_2014 = data_from_file_conv.map(
    lambda record: int(record[16]))

In [9]:
# Preview the first five converted values.
data_2014.take(5)

[2014, 2014, 2014, 2014, 2014]

To return more than one column per row, wrap the values in a tuple, dict or list.

In [10]:
# Emit one tuple per record: field 16 as-is, and the same field as an int.
data_2014_2 = data_from_file_conv.map(
    lambda record: (record[16], int(record[16]))
)

In [11]:
# Preview the first five (string, int) tuples.
data_2014_2.take(5)

[('2014', 2014),
 ('2014', 2014),
 ('2014', 2014),
 ('2014', 2014),
 ('2014', 2014)]

## Filter

In [12]:
# Keep only records whose field 16 is '2014' and whose field 21 is '0'.
data_filtered = data_from_file_conv.filter(
    lambda record: record[16] == '2014' and record[21] == '0'
)

In [13]:
# Count the records that passed the filter.
data_filtered.count()

22

## Flat Map

In [14]:
# flatMap flattens each returned tuple, so the two values of every record
# become two separate elements of the resulting RDD.
data_2014_flat = data_from_file_conv.flatMap(
    lambda record: (record[16], int(record[16]) + 1)
)

In [15]:
# Preview the first ten flattened elements (five records x two values each).
data_2014_flat.take(10)

['2014', 2015, '2014', 2015, '2014', 2015, '2014', 2015, '2014', 2015]

## Distinct

In [16]:
# Project field 5 of every record, then drop duplicate values.
distinct_gender = (
    data_from_file_conv
    .map(lambda record: record[5])
    .distinct()
)

In [17]:
# Return the distinct values to the driver and display them.
distinct_gender.collect()

['-99', 'M', 'F']

## Sample

In [18]:
# Draw roughly `fraction` of the records, without replacement, using a
# seed fixed at 42. Arguments are (withReplacement, fraction, seed).
fraction = 0.1
data_sample = data_from_file_conv.sample(False, fraction, 42)

In [19]:
# Compare the size of the full dataset with the size of the sample.
# Each count() call counts the elements of its own RDD.
print('Original dataset: {0}, sample: {1}'.format(
    data_from_file_conv.count(), data_sample.count()))

Original dataset: 2631171, sample: 262402


## Join

- Left outer join:

In [20]:
# Two small key-value RDDs used by the join examples.
# Note that rdd2 stores the value for key 'b' as the string '6'.
rdd1 = sc.parallelize([('a', 1), ('b', 4), ('c', 10)])
rdd2 = sc.parallelize([('a', 4), ('a', 1), ('b', '6'), ('d', 15)])

In [21]:
# Keep every key of rdd1; a key with no match in rdd2 is paired with None
# (see ('c', (10, None)) in the output below).
rdd3 = rdd1.leftOuterJoin(rdd2)

In [22]:
# Return the joined pairs to the driver and display them.
rdd3.collect()

[('c', (10, None)), ('b', (4, '6')), ('a', (1, 4)), ('a', (1, 1))]

- Inner join:

In [23]:
# Keep only the keys that are present in both rdd1 and rdd2.
rdd4 = rdd1.join(rdd2)

In [24]:
# Return the joined pairs to the driver and display them.
rdd4.collect()

[('b', (4, '6')), ('a', (1, 4)), ('a', (1, 1))]

- Intersection:

In [25]:
# Keep only the elements (whole tuples, not just keys) found in both RDDs.
rdd5 = rdd1.intersection(rdd2)

In [26]:
# Return the common elements to the driver and display them.
rdd5.collect()

[('a', 1)]

# Actions

- take(n): returns the first n rows, scanning one partition first and reading further partitions only if needed;
- takeSample(withReplacement, n, seed): take n random records;
- count(): counts the number of elements in the RDD;
- countByKey(): get the counts of distinct keys, if the dataset is in a key-value form;
- collect(): returns all the elements of the RDD to the driver.

## Reduce

In [27]:
# Sum the second item of every tuple in rdd1.
rdd1.map(lambda pair: pair[1]).reduce(
    lambda total, value: total + value)

15

reduceByKey() works in a similar way, but it performs a reduction on a key-by-key basis.

In [28]:
# Key-value RDD split into four partitions; then add up the values per key.
data_key = sc.parallelize(
    [('a', 4), ('b', 3), ('c', 2), ('a', 8),
     ('d', 2), ('b', 1), ('d', 3)],
    4,
)
data_key.reduceByKey(lambda left, right: left + right).collect()

[('b', 4), ('c', 2), ('a', 12), ('d', 5)]

## Save as text file

In [29]:
from functions import parseInput

In [30]:
# saveAsTextFile() refuses to write to a path that already exists, so a second
# run of this notebook (Restart & Run All) would fail on this cell. Remove the
# output of any previous run first; ignore_errors=True makes the removal a
# no-op when nothing is there yet.
# NOTE(review): this assumes the relative path resolves on the local
# filesystem, like the other ./data paths in this notebook -- confirm before
# running against HDFS or another remote filesystem.
import shutil

data_key_path = './data/data_key.txt'
shutil.rmtree(data_key_path, ignore_errors=True)
data_key.saveAsTextFile(data_key_path)

In [31]:
# Read the saved output back as text and convert each line with parseInput
# (imported from the local `functions` module).
data_key_read = (
    sc.textFile('./data/data_key.txt')
    .map(parseInput)
)

In [32]:
# Return the re-read elements to the driver and display them.
data_key_read.collect()

[('a', 4), ('a', 8), ('d', 2), ('b', 3), ('c', 2), ('b', 1), ('d', 3)]

## For each

Applies the same function to each element of the RDD, one element at a time. It is useful when you want to save the data to a database that is not natively supported by PySpark.

In [33]:
# Apply print to every element of data_key. The printed lines show up in the
# CLI (the terminal running Spark) rather than as output of this cell.
data_key.foreach(print)